1. Background:
The previous post used packet captures to analyze how the listen backlog affects the full-connection (accept) queue and the half-connection (SYN) queue. This post walks through the kernel source (kernel 2.6.32) to get a basic picture of the server side of the three-way handshake and the role backlog plays in it.
2. The three-way handshake:
2.1 Server-side listen:
After the system call enters the kernel, the corresponding socket is looked up by its fd and the backlog is capped at its maximum value (net.core.somaxconn); processing then continues in inet_listen.
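Before looking at the kernel side, here is a minimal user-space sketch of the call that enters this path (the port 8080 and backlog 128 below are arbitrary example values); the value passed to listen() is what, after the cap, reaches inet_listen as backlog:

/* Minimal user-space sketch: the backlog passed to listen() is the value
 * that, after being capped at net.core.somaxconn, reaches inet_listen(). */
#include <netinet/in.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    struct sockaddr_in addr;

    memset(&addr, 0, sizeof(addr));
    addr.sin_family = AF_INET;
    addr.sin_addr.s_addr = htonl(INADDR_ANY);
    addr.sin_port = htons(8080);

    if (fd < 0 || bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
        perror("socket/bind");
        return EXIT_FAILURE;
    }

    /* 128 is what ends up in sk->sk_max_ack_backlog (possibly capped). */
    if (listen(fd, 128) < 0) {
        perror("listen");
        return EXIT_FAILURE;
    }

    pause();    /* keep the listener alive */
    close(fd);
    return 0;
}

The kernel-side inet_listen that receives this value is shown below.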
int inet_listen(struct socket *sock, int backlog)
{
    struct sock *sk = sock->sk;
    unsigned char old_state;
    int err;

    lock_sock(sk);

    err = -EINVAL;
    if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)
        goto out;

    old_state = sk->sk_state;
    if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN)))
        goto out;

    /* Really, if the socket is already in listen state
     * we can only allow the backlog to be adjusted.
     */
    if (old_state != TCP_LISTEN) {
        err = inet_csk_listen_start(sk, backlog);
        if (err)
            goto out;
    }
    sk->sk_max_ack_backlog = backlog;
    err = 0;

out:
    release_sock(sk);
    return err;
}
This function performs some simple state checks and then calls inet_csk_listen_start to initialize the listening socket. Note that at the end sk->sk_max_ack_backlog is set to backlog. This variable is later used by sk_acceptq_is_full (shown below), which checks whether the accept queue is full; because that check is also consulted when a new SYN arrives (see tcp_v4_conn_request later), backlog affects the half-connection path as well. Also note the comparison is ">", so the queue can actually hold backlog + 1 entries.
static inline int sk_acceptq_is_full(struct sock *sk)
{
    return sk->sk_ack_backlog > sk->sk_max_ack_backlog;
}
int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
{
    struct inet_sock *inet = inet_sk(sk);
    struct inet_connection_sock *icsk = inet_csk(sk);
    int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);

    if (rc != 0)
        return rc;

    /* Initialized to 0 here; the caller assigns backlog after this
     * function returns, as shown above. */
    sk->sk_max_ack_backlog = 0;
    sk->sk_ack_backlog = 0;
    inet_csk_delack_init(sk);

    /* There is race window here: we announce ourselves listening,
     * but this transition is still not validated by get_port().
     * It is OK, because this socket enters to hash table only
     * after validation is complete.
     */
    /* Check whether the port is already in use. */
    sk->sk_state = TCP_LISTEN;
    if (!sk->sk_prot->get_port(sk, inet->num)) {
        inet->sport = htons(inet->num);

        sk_dst_reset(sk);
        sk->sk_prot->hash(sk);

        return 0;
    }

    sk->sk_state = TCP_CLOSE;
    __reqsk_queue_destroy(&icsk->icsk_accept_queue);
    return -EADDRINUSE;
}
The initialization mainly zeroes the backlog fields (the caller then assigns backlog, as shown above) and checks whether the listening port is already in use. Also note reqsk_queue_alloc, which is what actually sets up a listen_sock. The listen_sock data structure appears in the code below; it essentially allocates a hash table of buckets that record the state of half-open connections.
int reqsk_queue_alloc(struct request_sock_queue *queue,
                      unsigned int nr_table_entries)
{
    size_t lopt_size = sizeof(struct listen_sock);
    /*
     * struct listen_sock - listen state
     *
     * @max_qlen_log - log_2 of maximal queued SYNs/REQUESTs
     *
     * struct listen_sock {
     *     u8 max_qlen_log;        // log2 of the hash table size
     *     // 3 bytes hole, try to use
     *     int qlen;               // number of half-open connections
     *     int qlen_young;         // half-open connections not yet timed out/retransmitted
     *     int clock_hand;         // used when retransmitting timed-out SYN-ACKs
     *     u32 hash_rnd;           // used to compute hash values
     *     u32 nr_table_entries;
     *     struct request_sock *syn_table[0];   // flexible array member
     * };
     *
     * listen_sock ends in a flexible array; it is a hash table holding
     * half-open connections, sized as the backlog rounded up to a power
     * of two.
     */
    struct listen_sock *lopt;

    /* Size of the flexible array: the backlog, clamped to
     * [8, sysctl_max_syn_backlog], rounded up to a power of two. */
    nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog);
    nr_table_entries = max_t(u32, nr_table_entries, 8);
    nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);
    lopt_size += nr_table_entries * sizeof(struct request_sock *);
    if (lopt_size > PAGE_SIZE)
        lopt = __vmalloc(lopt_size,
                         GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
                         PAGE_KERNEL);
    else
        lopt = kzalloc(lopt_size, GFP_KERNEL);
    if (lopt == NULL)
        return -ENOMEM;

    /* The hash table has at least 8 entries. */
    for (lopt->max_qlen_log = 3;
         (1 << lopt->max_qlen_log) < nr_table_entries;
         lopt->max_qlen_log++);

    get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));
    rwlock_init(&queue->syn_wait_lock);
    queue->rskq_accept_head = NULL;
    lopt->nr_table_entries = nr_table_entries;

    write_lock_bh(&queue->syn_wait_lock);
    queue->listen_opt = lopt;
    write_unlock_bh(&queue->syn_wait_lock);

    return 0;
}
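To make the sizing concrete, here is a stand-alone sketch (not kernel code; the helper names are mine) that mirrors the clamp-and-round logic of reqsk_queue_alloc. For example, a backlog of 5 is first raised to 8 and then, after the +1, rounded up to a table of 16 entries.

/* Stand-alone sketch of the sizing logic in reqsk_queue_alloc():
 * clamp to [8, max_syn_backlog], then round (nr + 1) up to a power of two.
 * The max_syn_backlog value of 256 below is just an example; the real
 * value comes from the sysctl. */
#include <stdint.h>
#include <stdio.h>

static uint32_t roundup_pow_of_two(uint32_t n)
{
    uint32_t p = 1;

    while (p < n)
        p <<= 1;
    return p;
}

static uint32_t syn_table_entries(uint32_t backlog, uint32_t max_syn_backlog)
{
    uint32_t nr = backlog;

    if (nr > max_syn_backlog)
        nr = max_syn_backlog;
    if (nr < 8)
        nr = 8;
    return roundup_pow_of_two(nr + 1);
}

int main(void)
{
    printf("%u\n", syn_table_entries(5, 256));   /* 5 -> 8 -> roundup(9)  = 16  */
    printf("%u\n", syn_table_entries(64, 256));  /* 64      -> roundup(65) = 128 */
    return 0;
}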
2.2 First handshake: the SYN arrives:
When a packet arrives, the IPv4 TCP handling entry point is tcp_v4_do_rcv. In the code below, some of the validation and the established-connection handling have been removed, leaving only the main path for a listening socket. During the first handshake the packet received is a SYN, so after tcp_v4_hnd_req the flow proceeds to tcp_rcv_state_process to handle the packet just received.
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
    struct sock *rsk;

    if (sk->sk_state == TCP_LISTEN) {
        struct sock *nsk = tcp_v4_hnd_req(sk, skb);

        /* Error: drop the packet. */
        if (!nsk)
            goto discard;

        /* An ACK was received. */
        if (nsk != sk) {
            if (tcp_child_process(sk, nsk, skb)) {
                rsk = nsk;
                goto reset;
            }
            return 0;
        }
    }

    /* Process the packet. */
    TCP_CHECK_TIMER(sk);
    if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
        rsk = sk;
        goto reset;
    }
    TCP_CHECK_TIMER(sk);
    return 0;

reset:
    tcp_v4_send_reset(rsk, skb);
discard:
    kfree_skb(skb);
    /* Be careful here. If this function gets more complicated and
     * gcc suffers from register pressure on the x86, sk (in %ebx)
     * might be destroyed here. This current version compiles correctly,
     * but you have been warned.
     */
    return 0;
}
The important functions here are tcp_v4_hnd_req and tcp_rcv_state_process; let's look at tcp_v4_hnd_req first. If it returns the sk that was passed in, a SYN is being received and the packet is processed directly; if the return value is non-NULL but not sk, this is the final ACK and a new sock has been created to handle the connection; if it returns NULL, something went wrong and the packet will be dropped.
static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
    struct tcphdr *th = tcp_hdr(skb);
    const struct iphdr *iph = ip_hdr(skb);
    struct sock *nsk;
    struct request_sock **prev;
    /* Look in the half-connection table first; a hit is then validated.
     * If there is no half-open connection, look in the established table.
     * If neither matches, fall through to SYN reception. */
    struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
                                                   iph->saddr, iph->daddr);

    if (req)
        return tcp_check_req(sk, skb, req, prev);

    nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
                                  th->source, iph->daddr, th->dest,
                                  inet_iif(skb));

    if (nsk) {
        if (nsk->sk_state != TCP_TIME_WAIT) {
            bh_lock_sock(nsk);
            return nsk;
        }
        inet_twsk_put(inet_twsk(nsk));
        return NULL;
    }

#ifdef CONFIG_SYN_COOKIES
    if (!th->rst && !th->syn && th->ack)
        sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
    return sk;
}
tcp_check_req validates a connection in the SYN_RECV state; we will not cover it for now. If no half-open connection has been established yet, both lookups fail and sk is returned directly.
Next, in the packet-processing function tcp_rcv_state_process, the first case of the state machine handles the LISTEN state: when a SYN is received it invokes the connection-request handler, tcp_v4_conn_request.
With some of the code simplified, the main flow is as follows:
1. Check whether the half-connection queue is full; its capacity is the adjusted backlog rounded up to a power of two. If it is full, the SYN is dropped (unless SYN cookies are enabled).
2. Check whether the accept (completed-connection) queue is full and whether the number of young requests is greater than 1. Note that a young request here is one whose SYN has just arrived and has been neither retransmitted nor acknowledged; qlen_young is decremented when tcp_check_req accepts the ACK and the request becomes a connection, or on the first retransmission. My understanding is that this shifts the load onto the client, letting retransmission happen on the client side and reducing pressure on the server.
3. If a half-open connection can be created, a request_sock is initialized, a SYN + ACK is sent, and the req is added to the listening sock's syn_table, so the next call to tcp_v4_hnd_req will find this connection in the half-connection table.
4. After the req is added to the hash table, the corresponding socket's keepalive timer is armed; this is analyzed later.
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
    ...
    /* TW buckets are converted to open requests without
     * limitations, they conserve resources and peer is
     * evidently real one.
     */
    if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
#ifdef CONFIG_SYN_COOKIES
        if (sysctl_tcp_syncookies) {
            want_cookie = 1;
        } else
#endif
        goto drop;
    }

    /* Accept backlog is full. If we have already queued enough
     * of warm entries in syn queue, drop request. It is better than
     * clogging syn queue with openreqs with exponentially increasing
     * timeout.
     */
    /* When the accept queue is full and there is more than one young
     * (not yet retransmitted) request, the server drops this packet for
     * now. My understanding is that this reduces the server's load by
     * moving the retransmission to the client (retransmitting the SYN)
     * rather than the server (SYN + ACK). */
    if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
        goto drop;

    req = inet_reqsk_alloc(&tcp_request_sock_ops);
    if (!req)
        goto drop;

#ifdef CONFIG_TCP_MD5SIG
    tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
#endif

    tcp_clear_options(&tmp_opt);
    tmp_opt.mss_clamp = 536;
    tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss;

    tcp_parse_options(skb, &tmp_opt, 0);

    if (want_cookie && !tmp_opt.saw_tstamp)
        tcp_clear_options(&tmp_opt);

    tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
    tcp_openreq_init(req, &tmp_opt, skb);

    ireq = inet_rsk(req);
    ireq->loc_addr = daddr;
    ireq->rmt_addr = saddr;
    ireq->no_srccheck = inet_sk(sk)->transparent;
    ireq->opt = tcp_v4_save_options(sk, skb);

    if (security_inet_conn_request(sk, skb, req))
        goto drop_and_free;

    if (!want_cookie)
        TCP_ECN_create_request(req, tcp_hdr(skb));

    ...

    /* Send the SYN + ACK. */
    if (__tcp_v4_send_synack(sk, req, dst) || want_cookie)
        goto drop_and_free;

    /* Add the req to the half-connection syn_table and start a timer
     * for timeout detection. */
    inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
    return 0;

drop_and_release:
    dst_release(dst);
drop_and_free:
    reqsk_free(req);
drop:
    return 0;
}
The main detail of the first handshake is the one mentioned above: when the queues are full, the SYN may be dropped.
2.3 Third handshake:
On the third handshake the server receives the client's ACK. The packet is still handled in tcp_v4_do_rcv, but this time tcp_v4_hnd_req finds the req in syn_table and enters tcp_check_req to validate the half-open connection; see reference [2] for the detailed checks. Once the checks pass, a child sock is created to handle the connection, with most of that work done in tcp_v4_syn_recv_sock. If creating the child sock fails, for example because the accept queue is full, the sysctl_tcp_abort_on_overflow flag determines whether an RST is sent to the peer or the packet is simply dropped, waiting for a later retransmission.
If the child sock is created successfully, the req is removed from the hash bucket, the half-connection counters are decremented, the corresponding timer is removed, and the request is moved to the accept queue, as shown in the following excerpt from tcp_check_req.
    child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
    if (child == NULL)
        goto listen_overflow;

    inet_csk_reqsk_queue_unlink(sk, req, prev);
    inet_csk_reqsk_queue_removed(sk, req);

    inet_csk_reqsk_queue_add(sk, req, child);
    return child;

listen_overflow:
    if (!sysctl_tcp_abort_on_overflow) {
        inet_rsk(req)->acked = 1;
        return NULL;
    }

embryonic_reset:
    NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
    if (!(flg & TCP_FLAG_RST))
        req->rsk_ops->send_reset(sk, skb);

    inet_csk_reqsk_queue_drop(sk, req, prev);
    return NULL;
}
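One simple way to exercise the listen_overflow path above from user space is a server that listens with a tiny backlog and never calls accept(), so the accept queue fills after a few clients connect. The sketch below assumes that setup (port 9090 and backlog 1 are arbitrary example values); whether later clients then receive an RST or are silently left waiting for a retransmission depends on sysctl_tcp_abort_on_overflow, as described above.

/* Sketch of a server that deliberately never drains its accept queue,
 * so the listen_overflow branch can be reached by connecting a few clients. */
#include <netinet/in.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    struct sockaddr_in addr;
    int one = 1;

    memset(&addr, 0, sizeof(addr));
    addr.sin_family = AF_INET;
    addr.sin_addr.s_addr = htonl(INADDR_ANY);
    addr.sin_port = htons(9090);

    setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));
    if (fd < 0 || bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
        listen(fd, 1) < 0) {
        perror("socket/bind/listen");
        return EXIT_FAILURE;
    }

    /* Never call accept(): once backlog + 1 connections are queued,
     * further final ACKs hit the listen_overflow branch. */
    for (;;)
        pause();
}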
2.4 Server-side retransmission on timeout:
The overall timer entry point is tcp_keepalive_timer; for a socket in the LISTEN state it leads into inet_csk_reqsk_queue_prune.
void inet_csk_reqsk_queue_prune(struct sock *parent,
                                const unsigned long interval,
                                const unsigned long timeout,
                                const unsigned long max_rto)
{
    struct inet_connection_sock *icsk = inet_csk(parent);
    struct request_sock_queue *queue = &icsk->icsk_accept_queue;
    struct listen_sock *lopt = queue->listen_opt;
    int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
    int thresh = max_retries;
    unsigned long now = jiffies;
    struct request_sock **reqp, *req;
    int i, budget;

    if (lopt == NULL || lopt->qlen == 0)
        return;

    /* Normally all the openreqs are young and become mature
     * (i.e. converted to established socket) for first timeout.
     * If synack was not acknowledged for 1 second, it means
     * one of the following things: synack was lost, ack was lost,
     * rtt is high or nobody planned to ack (i.e. synflood).
     * When server is a bit loaded, queue is populated with old
     * open requests, reducing effective size of queue.
     * When server is well loaded, queue size reduces to zero
     * after several minutes of work. It is not synflood,
     * it is normal operation. The solution is pruning
     * too old entries overriding normal timeout, when
     * situation becomes dangerous.
     *
     * Essentially, we reserve half of room for young
     * embrions; and abort old ones without pity, if old
     * ones are about to clog our table.
     */
    /* As the number of half-open connections grows while qlen_young grows
     * more slowly, thresh shrinks and half-open connections expire more
     * easily. */
    if (lopt->qlen >> (lopt->max_qlen_log - 1)) {
        int young = (lopt->qlen_young << 1);

        while (thresh > 2) {
            if (lopt->qlen < young)
                break;
            thresh--;
            young <<= 1;
        }
    }

    if (queue->rskq_defer_accept)
        max_retries = queue->rskq_defer_accept;

    budget = 2 * (lopt->nr_table_entries / (timeout / interval));
    i = lopt->clock_hand;

    /* Walk the hash buckets. */
    do {
        reqp = &lopt->syn_table[i];
        while ((req = *reqp) != NULL) {
            /* The timeout has been reached. */
            if (time_after_eq(now, req->expires)) {
                int expire = 0, resend = 0;

                /* Decide whether to expire and/or resend. */
                syn_ack_recalc(req, thresh, max_retries,
                               queue->rskq_defer_accept,
                               &expire, &resend);
                req->rsk_ops->syn_ack_timeout(parent, req);
                if (!expire &&
                    (!resend ||
                     !inet_rtx_syn_ack(parent, req) ||
                     inet_rsk(req)->acked)) {
                    /* Retransmit on timeout. */
                    unsigned long timeo;

                    /* After its first timeout the request is
                     * no longer young. */
                    if (req->num_timeout++ == 0)
                        lopt->qlen_young--;
                    /* Exponential backoff of the timeout. */
                    timeo = min(timeout << req->num_timeout, max_rto);
                    req->expires = now + timeo;
                    reqp = &req->dl_next;
                    continue;
                }

                /* Drop this request */
                /* The request has expired. */
                inet_csk_reqsk_queue_unlink(parent, req, reqp);
                reqsk_queue_removed(queue, req);
                reqsk_free(req);
                continue;
            }
            reqp = &req->dl_next;
        }

        i = (i + 1) & (lopt->nr_table_entries - 1);
    } while (--budget > 0);

    lopt->clock_hand = i;

    /* Re-arm the timer. */
    if (lopt->qlen)
        inet_csk_reset_keepalive_timer(parent, interval);
}
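As a quick illustration of how thresh shrinks, here is a stand-alone sketch (not kernel code) of just the computation above: with a 256-entry table (max_qlen_log = 8), qlen = 160 and qlen_young = 20, thresh drops from the default of 5 down to 2, so old requests are expired after far fewer SYN-ACK retransmissions; with qlen_young = 100 it stays at 5.

/* Stand-alone sketch of the thresh computation in
 * inet_csk_reqsk_queue_prune(); the example values are arbitrary. */
#include <stdio.h>

static int prune_thresh(int qlen, int qlen_young, int max_qlen_log,
                        int max_retries)
{
    int thresh = max_retries;

    if (qlen >> (max_qlen_log - 1)) {      /* table more than half full */
        int young = qlen_young << 1;

        while (thresh > 2) {
            if (qlen < young)
                break;
            thresh--;
            young <<= 1;
        }
    }
    return thresh;
}

int main(void)
{
    printf("%d\n", prune_thresh(160, 20, 8, 5));    /* prints 2 */
    printf("%d\n", prune_thresh(160, 100, 8, 5));   /* prints 5 */
    return 0;
}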
3. References:
[1]. Function call relationships: http://dedecms.com/knowledge/servers/linux-bsd/2012/1217/17745_3.html
[2]. http://blog.csdn.net/zhangskd/article/details/17923917
[3]. Timers: http://blog.csdn.net/zhangskd/article/details/35281345