linux内核之accept实现 / 憋错料

用户态对accept的标准用法：
if ((client_fd = accept(sockfd, (struct sockaddr *)&remote_addr, &sin_size)) == -1)
{
// accept() lets the server take a client's pending connection request.
// On failure (-1) the error is reported and `continue` skips to the next
// iteration of the enclosing loop (the loop itself is not shown in this excerpt).
perror("accept Error\n");
continue;
}
sockfd是通过socket系统调用创建、并且已经调用过listen的套接字：
sockfd = socket(AF_INET, SOCK_STREAM, 0)
listen(sockfd, 128)
remote_addr将会存储远端设备的地址信息。


/*
 * accept() system call entry point.
 *
 * Plain accept() carries no flags argument, so it is forwarded to
 * sys_accept4() with flags fixed to 0; whatever sys_accept4() returns is
 * returned unchanged.
 */
SYSCALL_DEFINE3(accept, int, fd, struct sockaddr __user *, upeer_sockaddr,
                int __user *, upeer_addrlen)
{
    const int no_flags = 0;

    return sys_accept4(fd, upeer_sockaddr, upeer_addrlen, no_flags);
}


/*
 * accept4() system call: take one connection from the listening socket
 * referred to by fd and hand it back to the caller as a new file descriptor.
 *
 * fd             - descriptor of the listening socket.
 * upeer_sockaddr - optional user buffer for the peer address; when NULL no
 *                  address is looked up or copied out.
 * upeer_addrlen  - user pointer passed to move_addr_to_user() together with
 *                  upeer_sockaddr.
 * flags          - may contain only SOCK_CLOEXEC and SOCK_NONBLOCK.
 *
 * Returns the new descriptor on success, or a negative errno value on
 * failure (-EINVAL for unsupported flags, -ENFILE when no socket can be
 * allocated, -ECONNABORTED when the peer name cannot be obtained, or the
 * error reported by one of the helpers called below).
 */
SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr,
                int __user *, upeer_addrlen, int, flags)
{
    struct socket *sock, *newsock;
    struct file *newfile;
    int err, len, newfd, fput_needed;
    struct sockaddr_storage address;

    /* Reject any flag bit other than SOCK_CLOEXEC / SOCK_NONBLOCK. */
    if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
    {
        return -EINVAL;
    }

    /*
     * Where SOCK_NONBLOCK and O_NONBLOCK have different values, rewrite the
     * request into the O_NONBLOCK bit, which is what the code below tests.
     */
    if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
    {
        flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
    }

    /*
     * Resolve fd to the listening socket. On failure err is expected to
     * have been filled in through the &err out-parameter (TODO confirm
     * against sockfd_lookup_light()).
     */
    sock = sockfd_lookup_light(fd, &err, &fput_needed);
    if (!sock)
    {
        goto out;
    }

    err = -ENFILE;
    newsock = sock_alloc(); /* Step 1: allocate a new socket for the new connection */
    if (!newsock)
    {
        goto out_put;
    }

    /* The new socket takes its type and operations from the listener. */
    newsock->type = sock->type;
    newsock->ops = sock->ops;

    /*
     * We don't need try_module_get here, as the listening socket (sock)
     * has the protocol module (sock->ops->owner) held.
     */
    __module_get(newsock->ops->owner);

    newfd = get_unused_fd_flags(flags); /* Step 2: reserve a descriptor for the new connection */
    if (unlikely(newfd < 0))
    {
        err = newfd;
        sock_release(newsock);
        goto out_put;
    }
    newfile = sock_alloc_file(newsock, flags, sock->sk->sk_prot_creator->name); /* Step 3: create the struct file backing newsock */
    if (unlikely(IS_ERR(newfile)))
    {
        /* No file exists yet, so undo the fd reservation and the socket by hand. */
        err = PTR_ERR(newfile);
        put_unused_fd(newfd);
        sock_release(newsock);
        goto out_put;
    }

    /*
     * From here on failures go through out_fd, which drops newfile and the
     * reserved fd. NOTE(review): newsock is presumably released as part of
     * fput(newfile), since sock_alloc_file() tied the two together - confirm.
     */
    err = security_socket_accept(sock, newsock);
    if (err)
    {
        goto out_fd;
    }

    /*
     * Step 4: call the socket-layer accept operation (inet_accept() for
     * INET sockets, per the article). The listener's file flags are passed
     * down, not the flags argument of this call.
     */
    err = sock->ops->accept(sock, newsock, sock->file->f_flags);
    if (err < 0)
    {
        goto out_fd;
    }

    if (upeer_sockaddr)
    {
        /*
         * Fetch the address of the new socket and copy it to user space.
         * NOTE(review): the final argument 2 presumably selects the peer
         * (remote) address rather than the local one - confirm against the
         * getname() contract.
         */
        if (newsock->ops->getname(newsock, (struct sockaddr *)&address,
                                  &len, 2) < 0)
        {
            err = -ECONNABORTED;
            goto out_fd;
        }
        err = move_addr_to_user(&address,
                                len, upeer_sockaddr, upeer_addrlen);
        if (err < 0)
        {
            goto out_fd;
        }
    }

    /* File flags are not inherited via accept() unlike another OSes. */

    /* Publish the file under the reserved descriptor; the fd is the result. */
    fd_install(newfd, newfile);
    err = newfd;

out_put:
    /* Drop the reference on the listening socket's file taken by the lookup. */
    fput_light(sock->file, fput_needed);
out:
    return err;
out_fd:
    /* Error after the file was created: release the file and the reserved fd. */
    fput(newfile);
    put_unused_fd(newfd);
    goto out_put;
}

3、sock_alloc_file()


struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname)
{
    struct qstr name = { .name = "" };
    struct path path;
    struct file *file;

    if (dname)
    {
        name.name = dname;
        name.len = strlen(name.name);
    }
    else if (sock->sk)
    {
        name.name = sock->sk->sk_prot_creator->name;
        name.len = strlen(name.name);
    }
    path.dentry = d_alloc_pseudo(sock_mnt->mnt_sb, &name);
    if (unlikely(!path.dentry))
    {
        return ERR_PTR(-ENOMEM);
    }
    path.mnt = mntget(sock_mnt);

    d_instantiate(path.dentry, SOCK_INODE(sock));

    file = alloc_file(&path, FMODE_READ | FMODE_WRITE,
                      &socket_file_ops);
    if (unlikely(IS_ERR(file)))
    {
        /* drop dentry, keep inode */
        ihold(path.dentry->d_inode);
        path_put(&path);
        return file;
    }

    /*! 注意这里的属性设置 */
    sock->file = file;
    file->f_flags = O_RDWR | (flags & O_NONBLOCK);
    file->private_data = sock;
    return file;
}

4、inet_accept()


/*
 *	Accept a pending connection. The TCP layer now gives BSD semantics.
 *
 *	sock    - the listening socket.
 *	newsock - the socket that will represent the accepted connection.
 *	flags   - passed through to the protocol's accept handler.
 *
 *	Returns 0 on success. If the protocol handler yields no sock, the error
 *	it stored (initially -EINVAL) is returned.
 */
// <net/ipv4/af_inet.c>
int inet_accept(struct socket *sock, struct socket *newsock, int flags)
{
    struct sock *listener = sock->sk;
    struct sock *child;
    int err = -EINVAL;

    /*
     * Step 4.1: ask the protocol for the sock of the new connection.
     * Per the original annotation, for TCP sk_prot is tcp_prot and this
     * call ends up in inet_csk_accept().
     */
    child = listener->sk_prot->accept(listener, flags, &err);
    if (!child)
    {
        return err;
    }

    lock_sock(child);

    sock_rps_record_flow(child);
    WARN_ON(!((1 << child->sk_state) &
              (TCPF_ESTABLISHED | TCPF_SYN_RECV |
               TCPF_CLOSE_WAIT | TCPF_CLOSE)));

    /* Step 4.2: graft sock and socket together so each can reach the other. */
    sock_graft(child, newsock);

    /* Step 4.3: mark the new socket as connected. */
    newsock->state = SS_CONNECTED;

    release_sock(child);
    return 0;
}

4.2、sock_graft()


// <include/net/sock.h>
/*
 * Attach the protocol-level sock @sk to the socket @parent so that each
 * refers to the other. All updates are made while holding
 * sk->sk_callback_lock for writing (the _bh lock variant is used).
 */
static inline void sock_graft(struct sock *sk, struct socket *parent)
{
    write_lock_bh(&sk->sk_callback_lock);
    sk->sk_wq = parent->wq; /* sk uses the socket's wait queue from now on */
    parent->sk = sk; /* socket -> sock link: the socket is served by this lower-level sock */
    sk_set_socket(sk, parent); /* sock -> socket link (presumably sets sk->sk_socket - confirm) */
    security_sock_graft(sk, parent);
    write_unlock_bh(&sk->sk_callback_lock);
}

4.1、inet_csk_accept()

/**
 * inet_csk_accept()用于从backlog队列（全连接队列）中取出一个ESTABLISHED状态的连接请求块，返回它所对应的连接sock。
 *
 * 1. 非阻塞的，且当前没有已建立的连接，则直接退出，返回-EAGAIN。
 *
 * 2. 阻塞的，且当前没有已建立的连接：
 *
 * 2.1 用户没有设置超时时间，则无限期阻塞。
 *
 * 2.2 用户设置了超时时间，超时后会退出。
 */


// <net/ipv4/inet_connection_sock.c>
/*
 * This will accept the next outstanding connection.
 *
 * sk    - the listening sock; it is locked for the duration of the call.
 * flags - only the O_NONBLOCK bit is used, to derive the wait timeout.
 * err   - out-parameter; written ONLY on the failure path, with a negative
 *         errno value (-EINVAL if sk is not in TCP_LISTEN, -EAGAIN if the
 *         accept queue is empty and the timeout is zero, or the error
 *         returned by inet_csk_wait_for_connect()).
 *
 * Returns the sock of the dequeued connection, or NULL on failure.
 */
struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
{
    struct inet_connection_sock *icsk = inet_csk(sk);
    struct request_sock_queue *queue = &icsk->icsk_accept_queue;
    struct sock *newsk;
    struct request_sock *req;
    int error;

    lock_sock(sk);

    /* We need to make sure that this socket is listening,
     * and that it has something pending.
     */
    error = -EINVAL;
    if (sk->sk_state != TCP_LISTEN)
    {
        goto out_err;
    }

    /* Find already established connection */
    if (reqsk_queue_empty(queue)) // no established connection request is queued
    {
        long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);

        /* If this is a non blocking socket don't sleep */
        error = -EAGAIN;
        if (!timeo)
        {
            goto out_err;
        }

        /*
         * Step 4.1.1: block until a fully established connection shows up.
         * If the user configured a timeout, the wait gives up once it expires.
         */
        error = inet_csk_wait_for_connect(sk, timeo);
        if (error)
        {
            goto out_err;
        }
    }

    /* Take the first established connection request off the accept queue. */
    req = reqsk_queue_remove(queue);
    newsk = req->sk;

    sk_acceptq_removed(sk);
    if (sk->sk_protocol == IPPROTO_TCP && queue->fastopenq != NULL)
    {
        spin_lock_bh(&queue->fastopenq->lock);
        if (tcp_rsk(req)->listener)
        {
            /* We are still waiting for the final ACK from 3WHS
             * so can't free req now. Instead, we set req->sk to
             * NULL to signify that the child socket is taken
             * so reqsk_fastopen_remove() will free the req
             * when 3WHS finishes (or is aborted).
             */
            req->sk = NULL;
            req = NULL;
        }
        spin_unlock_bh(&queue->fastopenq->lock);
    }
out:
    /* Common exit: unlock the listener and free req unless it was cleared above. */
    release_sock(sk);
    if (req)
    {
        __reqsk_free(req);
    }
    return newsk;
out_err:
    /* Failure: nothing was dequeued, so there is no sock and no req to free. */
    newsk = NULL;
    req = NULL;
    *err = error;
    goto out;
}

4.1.1 inet_csk_wait_for_connect()


// <net/ipv4/inet_connection_sock.c>
/*
 * Wait for an incoming connection, avoid race conditions. This must be called
 * with the socket locked.
 *
 * sk    - the listening sock; its lock is dropped while sleeping and
 *         re-taken before each check, so it is held again on return.
 * timeo - remaining wait time, updated from schedule_timeout().
 *
 * Returns 0 once the accept queue is non-empty, -EINVAL if sk is no longer
 * in TCP_LISTEN, sock_intr_errno(timeo) if a signal is pending, or -EAGAIN
 * when the timeout has run out.
 */
static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
{
    struct inet_connection_sock *icsk = inet_csk(sk);
    DEFINE_WAIT(wait);
    int err;

    /*
     * True wake-one mechanism for incoming connections: only
     * one process gets woken up, not the 'whole herd'.
     * Since we do not 'race & poll' for established sockets
     * anymore, the common case will execute the loop only once.
     *
     * Subtle issue: "add_wait_queue_exclusive()" will be added
     * after any current non-exclusive waiters, and we know that
     * it will always _stay_ after any new non-exclusive waiters
     * because all non-exclusive waiters are added at the
     * beginning of the wait-queue. As such, it's ok to "drop"
     * our exclusiveness temporarily when we get woken up without
     * having to remove and re-insert us on the wait queue.
     */
    for (;;)
    {
        /* Queue ourselves as an exclusive waiter and mark the task interruptible. */
        prepare_to_wait_exclusive(sk_sleep(sk), &wait,
                                  TASK_INTERRUPTIBLE);
        release_sock(sk);
        if (reqsk_queue_empty(&icsk->icsk_accept_queue))
        {
            /*
             * This is where a user's accept() call actually sleeps.
             * Per the article author's reading of schedule_timeout():
             *
             *   switch (timeout)
             *   {
             *      case MAX_SCHEDULE_TIMEOUT:
             *           schedule();
             *           goto out;
             *      default:
             *   }
             *
             * Callers usually set no timeout, so timeo is
             * MAX_SCHEDULE_TIMEOUT: the scheduler is entered right away
             * and the current process sleeps until some other event
             * wakes it up.
             */
            timeo = schedule_timeout(timeo);
        }
        sched_annotate_sleep();
        lock_sock(sk);
        /* Re-check, in order: connection ready, still listening, signal, timeout. */
        err = 0;
        if (!reqsk_queue_empty(&icsk->icsk_accept_queue))
        {
            break;
        }
        err = -EINVAL;
        if (sk->sk_state != TCP_LISTEN)
        {
            break;
        }
        err = sock_intr_errno(timeo);
        if (signal_pending(current))
        {
            break;
        }
        err = -EAGAIN;
        if (!timeo)
        {
            break;
        }
    }
    /*
     * Per the original note: put the task back into TASK_RUNNING and take
     * our wait entry off the sock's wait queue.
     */
    finish_wait(sk_sleep(sk), &wait);
    return err;
}

来自为知笔记(Wiz)

时间： 2024-11-09 00:52:13

linux内核之accept实现

linux内核之accept实现的相关文章

linux内核参数注释与优化

Linux内核中网络数据包的接收-第一部分概念和框架

Linux内核--网络栈实现分析（一）--网络栈初始化

Linux内核--基于Netfilter的内核级包过滤防火墙实现

nginx优化篇之Linux 内核参数的优化（2）

Linux内核Makefile文件(翻译自内核手册)

linux 内核参数优化

Linux内核工程导论——网络：Netfilter概览

Nginx优化指南+LINUX内核优化+linux连接数优化+nginx连接数优化