2026-01-04,某些文章具有時效性,若有錯誤或已失效,請在下方留言或聯系老夜。
詳解Socket狀態機源碼
系統調用方法

lock_sock()
lock_sock()本質是調用了一次lock_sock_nested(sk, 0)。
// net/core/sock.c
/*
 * lock_sock_nested - mark the socket as owned by the calling context.
 * @sk:       socket to lock
 * @subclass: lockdep subclass forwarded to mutex_acquire()
 *
 * May sleep (see might_sleep()).  Under sk_lock.slock with bottom halves
 * disabled, it calls __lock_sock() if the socket is already owned, then
 * sets sk_lock.owned to 1 and drops the spinlock again.
 * NOTE(review): __lock_sock() is not shown here; presumably it waits until
 * the current owner releases the socket - confirm against net/core/sock.c.
 */
void lock_sock_nested(struct sock *sk, int subclass)
{
	/* The sk_lock has mutex_lock() semantics here. */
	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);

	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);
	if (sock_owned_by_user_nocheck(sk))
		__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock_bh(&sk->sk_lock.slock);
}
Listen()
最外層的listen()本質是__sys_listen()方法,在該調用內部啟用inet_listen()方法。在這一層檢驗監聽Socket是否可用。
// net/ipv4/af_inet.c
int inet_listen(struct socket *sock, int backlog)
{
?struct sock *sk = sock->sk;
?int err = -EINVAL;
?lock_sock(sk);
if?(sock->state != SS_UNCONNECTED || sock->type?!= SOCK_STREAM)
? goto out;
? ? // 下層邏輯調(diào)用
?err = __inet_listen_sk(sk, backlog);
out:
?release_sock(sk);
return?err;
}
__inet_listen_sk()則是寫入backlog,并且對非TCP_LISTEN狀態的Socket,通過inet_csk_listen_start()進行狀態流轉。
old_state != TCP_LISTEN是用于區分listen()的系統調用是否被重復觸發的邏輯。這是因為Linux源碼在inet_csk_listen_start()中執行了內存綁定,隊列初始化等操作,在初始化一次監聽Socket后,防止在之后的listen()中被再次調用強行清空連接隊列。
// net/ipv4/af_inet.c
int __inet_listen_sk(struct sock *sk, int backlog)
{
?unsigned char old_state = sk->sk_state;
?int err, tcp_fastopen;
if?(!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN)))
return?-EINVAL;
?WRITE_ONCE(sk->sk_max_ack_backlog, backlog);
?/* Really,?if?the socket is already?in?listen state
? * we can only allow the backlog to be adjusted.
? */
if?(old_state != TCP_LISTEN) {
? /* Enable TFO w/o requiring TCP_FASTOPEN socket option.
? ?* Note that only TCP sockets (SOCK_STREAM) will reach here.
? ?* Also fastopen backlog may already been?set?via the option
? ?* because the socket was?in?TCP_LISTEN state previously but
? ?* was shutdown() rather than close().
? ?*/
? tcp_fastopen = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen);
if?((tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) &&
? ? ? (tcp_fastopen & TFO_SERVER_ENABLE) &&
? ? ? !inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) {
? ?fastopen_queue_tune(sk, backlog);
? ?tcp_fastopen_init_key_once(sock_net(sk));
? }
? err = inet_csk_listen_start(sk);
if?(err)
return?err;
? tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL);
?}
return?0;
}
inet_csk_listen_start()是最底層的listen()系統調用實現。內核在這一步主要進行了以下操作:
● 初始化TCP FastOpen隊列和全連接隊列
● 清空全連接隊列的連接計數器
● 切換監聽Socket的狀態 -> TCP_LISTEN
● 放入listen hash表
listen hash表并不是全局ehash連接表,也不是bhash端口連接表,而是存放所有監聽Socket的哈希表。經過這一步正式將監聽Socket注冊,這個Socket和對應的端口才算是正式可用。未經過這一步的Socket對操作系統是不可見的。
// net/ipv4/inet_connection_sock.c
int inet_csk_listen_start(struct sock *sk)
{
?struct inet_connection_sock *icsk = inet_csk(sk);
?struct inet_sock *inet = inet_sk(sk);
?int err;
?err = inet_ulp_can_listen(sk);
if?(unlikely(err))
return?err;
? ? // 初始化 連接隊(duì)列容器
?reqsk_queue_alloc(&icsk->icsk_accept_queue);
? ? // 清空全連接隊(duì)列元素?cái)?shù)量
?sk->sk_ack_backlog = 0;
?inet_csk_delack_init(sk);
?/* There is race window here: we announce ourselves listening,
? * but this transition is still not validated by get_port().
? * It is OK, because this socket enters to?hash?table only
? * after validation is complete.
? */
? ? // 切換sk狀態(tài)
?inet_sk_state_store(sk, TCP_LISTEN);
? ? // 檢查端口號(hào)
?err = sk->sk_prot->get_port(sk, inet->inet_num);
if?(!err) {
? inet->inet_sport = htons(inet->inet_num);
? ? ? ? // 清除Cache
? sk_dst_reset(sk);
? ? ? ? // 放入listen?hash表
? err = sk->sk_prot->hash(sk);
if?(likely(!err))
return?0;
?}
?inet_sk_set_state(sk, TCP_CLOSE);
return?err;
}
// net/core/request_sock.c
/*
 * reqsk_queue_alloc - reset a request_sock_queue to its empty state.
 * @queue: queue to initialise
 *
 * Despite the name, nothing is allocated here: the function only clears
 * the Fast Open reset list (head, tail, length) and the accept-queue head.
 */
void reqsk_queue_alloc(struct request_sock_queue *queue)
{
	/* Initialise the Fast Open queue. */
	queue->fastopenq.rskq_rst_head = NULL;
	queue->fastopenq.rskq_rst_tail = NULL;
	queue->fastopenq.qlen = 0;

	/* Initialise the accept (fully-established) queue. */
	queue->rskq_accept_head = NULL;
}
三次握手 (狀態機流轉全流程)
初始化TCP Socket
inet_init -> .init = tcp_v4_init_sock():通過 tcp_v4_init_sock() 初始化了一個TCP Socket,提供了一個ipv4_specific接口用于封裝TCP協議Socket連接的所有處理。如果是ipv6則會返回一個ipv6_specific。
體現的設計理念: TCP層不感知和區分ipv4還是ipv6,只需調用**ops->function()**,即下面的af_ops
// net/ipv4/tcp_ipv4.c
static int tcp_v4_init_sock(struct sock *sk)
{
?struct inet_connection_sock *icsk = inet_csk(sk);
?tcp_init_sock(sk);
?icsk->icsk_af_ops = &ipv4_specific;
#if?defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
?tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
?sk->sk_destruct = tcp4_destruct_sock;
#endif
return?0;
}
const struct inet_connection_sock_af_ops ipv4_specific = {
? ? // 發(fā)送數(shù)據(jù)的函數(shù)。用于將數(shù)據(jù)從傳輸層(TCP)發(fā)送到網(wǎng)絡(luò)層(IP)
?.queue_xmit ? ?= ip_queue_xmit,
? ? // 用于計(jì)算和校驗(yàn)的函數(shù)
?.send_check ? ?= tcp_v4_send_check,
?.rebuild_header ? ?= inet_sk_rebuild_header,
?.sk_rx_dst_set ? ?= inet_sk_rx_dst_set,
? ? // 處理SYN段的函數(shù)。在TCP三次握手的開始階段被調(diào)用,用于處理來自客戶端的SYN包
?.conn_request ? ?= tcp_v4_conn_request,
? ? // 創(chuàng)建和初始化新socket的函數(shù) 在TCP三次握手完成后被調(diào)用
?.syn_recv_sock ? ?= tcp_v4_syn_recv_sock,
?.net_header_len ? ?= sizeof(struct iphdr),
?.setsockopt ? ?= ip_setsockopt,
?.getsockopt ? ?= ip_getsockopt,
?.mtu_reduced ? ?= tcp_v4_mtu_reduced,
};
服務端
1 · 接收SYN報文
TCP Socket處理第一次SYN請求的方法為tcp_v4_conn_request(),其中調用了tcp_conn_request()方法。
// net/ipv4/tcp_input.c
int tcp_conn_request(struct request_sock_ops *rsk_ops,
? ? ? ?const struct tcp_request_sock_ops *af_ops,
? ? ? ?struct sock *sk, struct sk_buff *skb)
{
?...
?isn = __this_cpu_read(tcp_tw_isn);
if?(isn) {
? ...
?}?else?{
? ? ? ? // SYN-Cookie功能,即使超出半連接隊(duì)列容量也不會(huì)drop連接
? syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies);
? ? ? ? // inet_csk_reqsk_queue_is_full()判斷半連接隊(duì)列是否滿了
if?(syncookies == 2 || inet_csk_reqsk_queue_is_full(sk)) {
? ?want_cookie = tcp_syn_flood_action(sk,
? ? ? ? ? rsk_ops->slab_name);
if?(!want_cookie)
? ? goto drop;
? }
?}
? ? // 判斷全連接隊(duì)列是否滿了
if?(sk_acceptq_is_full(sk)) {
? NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
? goto drop;
?}
? ? // 創(chuàng)建一個(gè)request_sock !!! 核心代碼見下文
?req = inet_reqsk_alloc(rsk_ops, sk, !want_cookie);
if?(!req)
? goto drop;
?req->syncookie = want_cookie;
?tcp_rsk(req)->af_specific = af_ops;
?tcp_rsk(req)->ts_off = 0;
?tcp_rsk(req)->req_usec_ts =?false;
? ? ...?
? ? // 省略了大量cookie校驗(yàn)邏輯
? ? ...
if?(fastopen_sk) {
? ? ? ? // 發(fā)送syn-ack應(yīng)答報(bào)文
? af_ops->send_synack(fastopen_sk, dst, &fl, req,
? ? ? ? &foc, TCP_SYNACK_FASTOPEN, skb);
? ? ? ? // 若SYN-Cookie下的Cookie可信任,則直接入全連接隊(duì)列
? /* Add the child socket directly into the accept queue */
if?(!inet_csk_reqsk_queue_add(sk, req, fastopen_sk)) {
? ?bh_unlock_sock(fastopen_sk);
? ?sock_put(fastopen_sk);
? ?goto drop_and_free;
? }
? ? ? ? //?
? sk->sk_data_ready(sk);
? bh_unlock_sock(fastopen_sk);
? sock_put(fastopen_sk);
?}?else?{
? tcp_rsk(req)->tfo_listener =?false;
if?(!want_cookie &&
? ? ? ? ? ? // 將request_sock添加到半連接隊(duì)列
? ? ? unlikely(!inet_csk_reqsk_queue_hash_add(sk, req))) {
? ?reqsk_free(req);
? ?dst_release(dst);
return?0;
? }
? af_ops->send_synack(sk, dst, &fl, req, &foc,
? ? ? ? !want_cookie ? TCP_SYNACK_NORMAL :
? ? ? ? ?TCP_SYNACK_COOKIE,
? ? ? ? skb);
? ...
?reqsk_put(req);
return?0;
...
}
// net/ipv4/inet_connection_sock.c
struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
? ? ? ? ? struct sock *sk_listener,
? ? ? ? ? bool attach_listener)
{
? ? // 創(chuàng)建request_sock 綁定listener和處理接口ops
?struct request_sock *req = reqsk_alloc(ops, sk_listener,
? ? ? ? ? ? attach_listener);
if?(req) {
? struct inet_request_sock *ireq = inet_rsk(req);
? ireq->ireq_opt = NULL;
#if?IS_ENABLED(CONFIG_IPV6)
? ireq->pktopts = NULL;
#endif
? atomic64_set(&ireq->ir_cookie, 0);
? ? ? ? // 修改狀態(tài)為TCP_NEW_SYN_RECV?
? ireq->ireq_state = TCP_NEW_SYN_RECV;
? write_pnet(&ireq->ireq_net, sock_net(sk_listener));
? ireq->ireq_family = sk_listener->sk_family;
?}
return?req;
}
通過源碼可知 創建request_sock -> request_sock入隊半連接隊列 -> 發送SYN-ACK報文 的順序。
并且創建的request_sock在創建出來后的狀態為 TCP_NEW_SYN_RECV,這是個重要的細節。
2 · 接收ACK報文
根據 “初始化TCP Socket” 階段的af_ops,其中的tcp_v4_syn_recv_sock定義了接受ACK后的處理邏輯。而tcp_v4_syn_recv_sock中調用了 tcp_create_openreq_child,實現了 TCP_SYN_RECV的狀態。
// net/ipv4/tcp_minisocks.c
struct sock *tcp_create_openreq_child(const struct sock *sk,
? ? ? ? ? struct request_sock *req,
? ? ? ? ? struct sk_buff *skb)
{
? ? // 通過request_sock創(chuàng)建新的完整的sock
?struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC);
?const struct inet_request_sock *ireq = inet_rsk(req);
?struct tcp_request_sock *treq = tcp_rsk(req);
?struct inet_connection_sock *newicsk;
?const struct tcp_sock *oldtp;
?struct tcp_sock *newtp;
?u32 seq;
?...
?smc_check_reset_syn_req(oldtp, req, newtp);
?/* Now setup tcp_sock */
?...
if?(sock_flag(newsk, SOCK_KEEPOPEN))
? tcp_reset_keepalive_timer(newsk, keepalive_time_when(newtp));
?newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
?newtp->rx_opt.sack_ok = ireq->sack_ok;
?newtp->window_clamp = req->rsk_window_clamp;
?newtp->rcv_ssthresh = req->rsk_rcv_wnd;
?newtp->rcv_wnd = req->rsk_rcv_wnd;
?newtp->rx_opt.wscale_ok = ireq->wscale_ok;
?// 省略大量對(duì)newtp的賦值,包括tcp握手確定的序列號(hào),窗口大小等屬性
?...
?newtp->bpf_chg_cc_inprogress = 0;
?tcp_bpf_clone(sk, newsk);
?__TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS);
?xa_init_flags(&newsk->sk_user_frags, XA_FLAGS_ALLOC1);
return?newsk;
}
// net/ipv4/inet_connection_sock.c
struct sock *inet_csk_clone_lock(const struct sock *sk,
? ? ?const struct request_sock *req,
? ? ?const gfp_t priority)
{
?struct sock *newsk = sk_clone_lock(sk, priority);
?struct inet_connection_sock *newicsk;
?struct inet_request_sock *ireq;
?struct inet_sock *newinet;
if?(!newsk)
return?NULL;
?newicsk = inet_csk(newsk);
?newinet = inet_sk(newsk);
?ireq = inet_rsk(req);
?newicsk->icsk_bind_hash = NULL;
?...
? ? // 省略大量newicsk的賦值邏輯
? ? ...
?/* Deinitialize accept_queue to?trap?illegal accesses. */
?memset(&newicsk->icsk_accept_queue, 0,
? ? ? ? sizeof(newicsk->icsk_accept_queue));
? ? // 設(shè)置狀態(tài)
?inet_sk_set_state(newsk, TCP_SYN_RECV);
?inet_clone_ulp(req, newsk, priority);
?security_inet_csk_clone(newsk, req);
return?newsk;
}
可以發現,通過request_sock創建的sock的狀態被設置為了TCP_SYN_RECV。說明TCP_SYN_RECV并不是第一次接受SYN報文后的狀態,也不是request_sock的狀態,而是三次握手后完整的sock的狀態。
3 · 處理ACK報文
實際上會發現,上述的所有函數都是通過初始化的TCP Socket的af_ops執行,而af_ops則是通過Socket的tcp_rcv_state_process方法進行調用。
tcp_rcv_state_process方法是tcp_v4_rcv下的狀態機流轉邏輯,其中通過switch(sk->sk_state)定義了不同狀態下的Socket接受TCP請求后狀態流轉的邏輯。
tcp_v4_syn_recv_sock統轄 創建sock、放入全連接隊列、流轉狀態 這幾個步驟,確保他們的時序性。
// net/ipv4/tcp_input.c
enum skb_drop_reason
tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
{
?struct tcp_sock *tp = tcp_sk(sk);
?struct inet_connection_sock *icsk = inet_csk(sk);
?const struct tcphdr *th = tcp_hdr(skb);
?struct request_sock *req;
?int queued = 0;
?SKB_DR(reason);
?switch (sk->sk_state) {
case?TCP_CLOSE:
? SKB_DR_SET(reason, TCP_CLOSE);
? goto discard;
? ? // 接受第一個(gè)SYN報(bào)文的處理
case?TCP_LISTEN:
if?(th->ack)
return?SKB_DROP_REASON_TCP_FLAGS;
if?(th->rst) {
? ?SKB_DR_SET(reason, TCP_RESET);
? ?goto discard;
? }
if?(th->syn) {
if?(th->fin) {
? ? SKB_DR_SET(reason, TCP_FLAGS);
? ? goto discard;
? ?}
? ?/* It is possible that we process SYN packets from backlog,
? ? * so we need to make sure to?disable?BH and RCU right there.
? ? */
? ?rcu_read_lock();
? ?local_bh_disable();
? ? ? ? ? ? // 通過af_ops調(diào)用conn_request邏輯
? ?icsk->icsk_af_ops->conn_request(sk, skb);
? ?local_bh_enable();
? ?rcu_read_unlock();
? ?consume_skb(skb);
return?0;
? }
? SKB_DR_SET(reason, TCP_FLAGS);
? goto discard;
case?TCP_SYN_SENT:
? ...
?}
?...
?/* step 5: check the ACK field */
?reason = tcp_ack(sk, skb, FLAG_SLOWPATH |
? ? ? FLAG_UPDATE_TS_RECENT |
? ? ? FLAG_NO_CHALLENGE_ACK);
?...
? ? //省略了RST重試請(qǐng)求的部分和等待發(fā)送方重新發(fā)送請(qǐng)求的部分
? ? ...
? ? // 檢查是否有af_ops處理接口 ipv4_specified
?SKB_DR_SET(reason, NOT_SPECIFIED);
?switch (sk->sk_state) {
case?TCP_SYN_RECV:
? tp->delivered++;?
if?(!tp->srtt_us)
? ?tcp_synack_rtt_meas(sk, req);
if?(tp->rx_opt.tstamp_ok)
? ?tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
if?(req) {
? ?tcp_rcv_synrecv_state_fastopen(sk);
? }?else?{
? ?tcp_try_undo_spurious_syn(sk);
? ?tp->retrans_stamp = 0;
? ?tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB,
? ? ? ?skb);
? ?WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
? }
? tcp_ao_established(sk);
? smp_mb();
? ? ? ? // TCP_SYN_RECV -> TCP_ESTABLISHED
? tcp_set_state(sk, TCP_ESTABLISHED);
? sk->sk_state_change(sk);
? ? ? ? ...
? /* Prevent spurious tcp_cwnd_restart() on first data packet */
? tp->lsndtime = tcp_jiffies32;
? tcp_initialize_rcv_mss(sk);
if?(tcp_ecn_mode_accecn(tp))
? ?tcp_accecn_third_ack(sk, skb, tp->syn_ect_snt);
? tcp_fast_path_on(tp);
if?(sk->sk_shutdown & SEND_SHUTDOWN)
? ?tcp_shutdown(sk, SEND_SHUTDOWN);
break;
?...
?/* step 6: check the URG bit */
?tcp_urg(sk, skb, th);
? ? // 揮手邏輯
?/* step 7: process the segment text */
?switch (sk->sk_state) {
case?TCP_CLOSE_WAIT:
case?TCP_CLOSING:
case?TCP_LAST_ACK:
? ? ? ? ? ? ...
case?TCP_FIN_WAIT1:
case?TCP_FIN_WAIT2:
? ...
}
客戶端
1 · 發送SYN報文
通過調用tcp_v4_connect確定請求的四元組信息,其內部的tcp_connect則執行具體的tcp層的校驗和發送邏輯。需要注意的是,TCP_CLOSE -> TCP_SYN_SENT的狀態流轉發生在SYN包發送前。
int tcp_v4_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len)
{
?struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
?struct inet_timewait_death_row *tcp_death_row;
?struct inet_sock *inet = inet_sk(sk);
?struct tcp_sock *tp = tcp_sk(sk);
?struct ip_options_rcu *inet_opt;
?struct net *net = sock_net(sk);
?...
? ? // 檢查路由,找出去往目的IP的網(wǎng)卡網(wǎng)關(guān)
?rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
? ? ? ? ?sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
? ? ? ? ?orig_dport, sk);
if?(IS_ERR(rt)) {
? err = PTR_ERR(rt);
if?(err == -ENETUNREACH)
? ?IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
return?err;
?}
?...
? ? // 省略了確定目標(biāo)IP的部分
?inet->inet_dport = usin->sin_port;
?sk_daddr_set(sk, daddr);
?inet_csk(sk)->icsk_ext_hdr_len = psp_sk_overhead(sk);
if?(inet_opt)
? inet_csk(sk)->icsk_ext_hdr_len += inet_opt->opt.optlen;
?tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
?/* Socket identity is still unknown (sport may be zero).
? * However we?set?state to SYN-SENT and not releasing socket
? * lock select?source?port, enter ourselves into the?hash?tables and
? * complete initialization after this.
? */
? ? // 修改Socket狀態(tài)?
?tcp_set_state(sk, TCP_SYN_SENT);
? ? // 確定源端口 若未bind,則內(nèi)核分配一個(gè)臨時(shí)端口
?err = inet_hash_connect(tcp_death_row, sk);
if?(err)
? goto failure;
?sk_set_txhash(sk);
? ? // 使用新端口重新路由
?rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
? ? ? ? ? inet->inet_sport, inet->inet_dport, sk);
?tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
?/* OK, now commit destination to socket. ?*/
?sk->sk_gso_type = SKB_GSO_TCPV4;
?sk_setup_caps(sk, &rt->dst);
?rt = NULL;
? ? ...
?atomic_set(&inet->inet_id, get_random_u16());
if?(tcp_fastopen_defer_connect(sk, &err))
return?err;
if?(err)
? goto failure;
? ? // 構(gòu)建并發(fā)送SYN包
?err = tcp_connect(sk);
if?(err)
? goto failure;
return?0;
failure:
?...
}
// net/ipv4/tcp_output.c
int tcp_connect(struct sock *sk)
{
?struct tcp_sock *tp = tcp_sk(sk);
?struct sk_buff *buff;
?int err;
?tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB, 0, NULL);
?...
? ? // 省略了對(duì)目標(biāo)IP的校驗(yàn)代碼
? ? ...
? ? // AO校驗(yàn)密鑰不安全情況下的終止代碼
if?(inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
return?-EHOSTUNREACH; /* Routing failure or similar. */
? ? // 初始化發(fā)送Socket
?tcp_connect_init(sk);
if?(unlikely(tp->repair)) {
? tcp_finish_connect(sk, NULL);
return?0;
?}
?buff = tcp_stream_alloc_skb(sk, sk->sk_allocation,?true);
if?(unlikely(!buff))
return?-ENOBUFS;
?/* SYN eats a sequence byte, write_seq updated by
? * tcp_connect_queue_skb().
? */
?tcp_init_nondata_skb(buff, sk, tp->write_seq, TCPHDR_SYN);
?...
?/* Send off SYN; include data?in?Fast Open. */
? ? // 發(fā)送SYN包給IP層
?err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
? ? ? ?tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
if?(err == -ECONNREFUSED)
return?err;
?/* We change tp->snd_nxt after the tcp_transmit_skb() call
? *?in?order to make this packet get counted?in?tcpOutSegs.
? */
?WRITE_ONCE(tp->snd_nxt, tp->write_seq);
?tp->pushed_seq = tp->write_seq;
?buff = tcp_send_head(sk);
if?(unlikely(buff)) {
? WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(buff)->seq);
? tp->pushed_seq = TCP_SKB_CB(buff)->seq;
?}
?TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);
? ? // 啟動(dòng)Timer計(jì)時(shí)器
?/* Timer?for?repeating the SYN until an answer. */
?tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
? ? ? ? inet_csk(sk)->icsk_rto,?false);
return?0;
}
2 · 接收SYN-ACK報文
接受方的調用邏輯跟服務端“處理ACK報文”的過程較為相似。調用鏈為:tcp_rcv_state_process -> tcp_rcv_synsent_state_process -> tcp_finish_connect,并在最后的tcp_finish_connect中實現了狀態的流轉 TCP_SYN_SENT -> TCP_ESTABLISHED
// net/ipv4/tcp_input.c
enum skb_drop_reason
tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
{
...
case?TCP_SYN_SENT:
? tp->rx_opt.saw_tstamp = 0;
? tcp_mstamp_refresh(tp);
? ? ? ? // 狀態(tài)流轉(zhuǎn): TCP_SYN_SENT -> TCP
? queued = tcp_rcv_synsent_state_process(sk, skb, th);
if?(queued >= 0)
return?queued;
? /* Do step6 onward by hand. */
? tcp_urg(sk, skb, th);
? __kfree_skb(skb);
? tcp_data_snd_check(sk);
return?0;
...
?}
// net/ipv4/tcp_input.c
static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
? ? ? const struct tcphdr *th)
{
?struct inet_connection_sock *icsk = inet_csk(sk);
?struct tcp_sock *tp = tcp_sk(sk);
?struct tcp_fastopen_cookie foc = { .len = -1 };
?int saved_clamp = tp->rx_opt.mss_clamp;
?bool fastopen_fail;
?SKB_DR(reason);
?tcp_parse_options(sock_net(sk), skb, &tp->rx_opt, 0, &foc);
if?(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
? tp->rx_opt.rcv_tsecr -= tp->tsoffset;
if?(th->ack) {
? ? ? ? ...
? // 省略校驗(yàn)ACK序列號(hào)和時(shí)間戳的部分
? ? ? ? ...
? ? ? ? // 省略了rst報(bào)文的重置連接部分
? WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1);
? tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
? /* RFC1323: The window?in?SYN & SYN/ACK segments is
? ?* never scaled.
? ?*/
? tp->snd_wnd = ntohs(th->window);
? ? ? ? ...
? ? ? ? // 省略了tcp請(qǐng)求頭的校驗(yàn)
? tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
? tcp_initialize_rcv_mss(sk);
? /* Remember, tcp_poll() does not lock socket!
? ?* Change state from SYN-SENT only after copied_seq
? ?* is initialized. */
? WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
? smc_check_reset_syn(tp);
? smp_mb();
? ? ? ? // 連接完成的狀態(tài)流轉(zhuǎn)
? tcp_finish_connect(sk, skb);
? fastopen_fail = (tp->syn_fastopen || tp->syn_data) &&
? ? tcp_rcv_fastopen_synack(sk, skb, &foc);
? ? ? ? ...
?}
?/* No ACK?in?the segment */
if?(th->rst) {
? ...
?}
? ? ...
if?(th->syn) {
? ? ? ? // 這里體現(xiàn)了"雙向連接"的處理邏輯
? ? ? ? // 即客戶端等待服務(wù)端回復(fù)SYN-ACK的過程中,得到了僅有SYN的報(bào)文
? ...
? tcp_set_state(sk, TCP_SYN_RECV);
? ...
? tcp_send_synack(sk);
#if?0
return?-1;
#else
? goto consume;
#endif
?}
// net/ipv4/tcp_input.c
void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
{
?struct tcp_sock *tp = tcp_sk(sk);
?struct inet_connection_sock *icsk = inet_csk(sk);
? ? // 連接簡(jiǎn)歷完成處理
?tcp_ao_finish_connect(sk, skb);
? ? // 狀態(tài)機(jī)流轉(zhuǎn) TCP_SYN_SENT -> TCP_ESTABLISHED
?tcp_set_state(sk, TCP_ESTABLISHED);
?icsk->icsk_ack.lrcvtime = tcp_jiffies32;
if?(skb) {
? icsk->icsk_af_ops->sk_rx_dst_set(sk, skb);
? security_inet_conn_established(sk, skb);
? sk_mark_napi_id(sk, skb);
?}
?tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB, skb);
? ? ...
}
sock的類型
下述幾種sock類型,可以當作是父類-子類關系:C語言中結構體里的內存是連續的,將要繼承的“父類”放到結構體的第一位,然后就可以通過強制轉換進行繼承訪問。


sock
所有sock類的基底,基礎的數據結構,用于維護任何協議都使用到的數據收發緩沖區。
// include/net/sock.h
struct sock {
? ? struct sock_common ?__sk_common;
? ? socket_lock_t ? ? ? sk_lock;
? ? atomic_t ? ? ? ?sk_drops;
? ? int ? ? ? ? sk_rcvlowat;
? ? struct sk_buff_head sk_error_queue;
? ? struct sk_buff ? ? ?*sk_rx_skb_cache;
? ? struct sk_buff_head sk_receive_queue;
? ? ...
? ? struct proto ?*skc_prot;
? ? ...
}
常說的socket()系統調用,本質就是創建了一個struct sock的結構體。同時由于Linux一切皆文件的設計理念,還需要對struct sock和struct file進行一個映射操作。調用鏈為:__sys_socket -> sock_create -> __sock_create -> pf->create -> inet_create。其中涉及了RCU(Read-Copy Update)的理念,用于適應socket高頻讀取低頻寫入的場景。這里“Copy”其實就是指讀寫隔離的實現方式。寫操作不會原地修改,而是copy一份新的數據后進行修改(保證原子性)。而讀操作會讀取舊數據。并且他的刪除是延遲釋放,socket的內存不會在close()后立刻釋放,而是后臺靜默延遲處理。
inet_sock
特指用了網絡傳輸功能的sock,在sock的基礎上還加入了TTL,端口,IP地址這些跟網絡傳輸相關的字段信息。
struct inet_sock {
?/* sk and pinet6 has to be the first two members of inet_sock */
?struct sock ?sk;
#if?IS_ENABLED(CONFIG_IPV6)
?struct ipv6_pinfo *pinet6;
?struct ipv6_fl_socklist __rcu *ipv6_fl_list;
#endif
?/* Socket demultiplex comparisons on incoming packets. */
#define?inet_daddr ?sk.__sk_common.skc_daddr
#define?inet_rcv_saddr ?sk.__sk_common.skc_rcv_saddr
#define?inet_dport ?sk.__sk_common.skc_dport
#define?inet_num ?sk.__sk_common.skc_num
?unsigned long ?inet_flags;
?__be32 ? inet_saddr;
?__s16 ? uc_ttl;
?__be16 ? inet_sport;
?struct ip_options_rcu __rcu *inet_opt;
?atomic_t ?inet_id;
?__u8 ? min_ttl;
?__u8 ? mc_ttl;
?...
?struct ip_mc_socklist __rcu *mc_list;
?struct inet_cork_full cork;
};
inet_connection_sock
在inet_sock的基礎上面向連接的sock結構體,增加了連接相關的字段屬性,比如accept隊列,握手失敗重試次數,數據包分片大小等信息。源碼中經常看到的 icsk 前綴其實就是指inet_connection_sock
// include/net/inet_connection_sock.h
struct inet_connection_sock {
?/* inet_sock has to be the first member! */
?struct inet_sock ? icsk_inet;
?struct request_sock_queue icsk_accept_queue;
?struct inet_bind_bucket ? *icsk_bind_hash;
?struct inet_bind2_bucket ?*icsk_bind2_hash;
?struct timer_list ? icsk_delack_timer;
?union {
? struct timer_list icsk_keepalive_timer;
? struct timer_list mptcp_tout_timer;
?};
?__u32 ? ? icsk_rto;
?__u32 ? ? ? ? ? ? ? ? ? ? icsk_rto_min;
?u32 ? ? icsk_rto_max;
?__u32 ? ? ? ? ? ? ? ? ? ? icsk_delack_max;
?__u32 ? ? icsk_pmtu_cookie;
?const struct tcp_congestion_ops *icsk_ca_ops;
?const struct inet_connection_sock_af_ops *icsk_af_ops;
?const struct tcp_ulp_ops ?*icsk_ulp_ops;
?void __rcu ? ?*icsk_ulp_data;
?...
};
tcp_sock
tcp_sock就是tcp協議專用的sock結構,在inet_connection_sock基礎上還加入了tcp特有的滑動窗口、擁塞避免等功能。
// include/linux/tcp.h
struct tcp_sock {
? ? /* inet_connection_sock has to be the first member of tcp_sock */
? ? struct inet_connection_sock inet_conn;
? ? u16 tcp_header_len; /* Bytes of tcp header to send ?*/
? ? u16 gso_segs; /* Max number of segs per GSO packet */
? ? ...
? ? u32 snd_wnd; /* The window we expect to receive */
? ? u32 max_window; /* Maximal window ever seen from peer */
? ? ...
? ? u32 snd_cwnd; /* Sending congestion window ?*/
? ? u32 snd_cwnd_cnt; /* Linear increase counter ?*/
? ? ...
}
半連接隊列何去何從?
在inet_listen()中,只有對全連接隊列和FastOpen隊列的初始化,沒有半連接隊列的相關處理。同時sock類中也只有全連接鏈表的頭和尾,沒有半連接隊列的操作符。這是因為獨立的半連接隊列已經不存在了,轉變為了一種概念。服務端接收SYN報文后創建的request_sock插入到全局連接表ehash中進行管理,監聽sock中只保留young和qlen參數進行半連接隊列容量的監控。調用鏈為:tcp_conn_request() -> inet_csk_reqsk_queue_hash_add() -> reqsk_queue_hash_req() -> inet_ehash_insert()
// net/ipv4/tcp_input.c/tcp_conn_request()
if?(fastopen_sk) {
? ? ? ? // FastOpen連接,一次SYN直接入全連接隊(duì)列完成握手
? af_ops->send_synack(fastopen_sk, dst, &fl, req,
? ? ? ? &foc, TCP_SYNACK_FASTOPEN, skb);
? /* Add the child socket directly into the accept queue */
if?(!inet_csk_reqsk_queue_add(sk, req, fastopen_sk)) {
? ?bh_unlock_sock(fastopen_sk);
? ?sock_put(fastopen_sk);
? ?goto drop_and_free;
? }
? sk->sk_data_ready(sk);
? bh_unlock_sock(fastopen_sk);
? sock_put(fastopen_sk);
?}?else?{
? ? ? ? // 入半連接隊(duì)列后發(fā)送SYN-ACK報(bào)文
? tcp_rsk(req)->tfo_listener =?false;
if?(!want_cookie &&
? ? ? unlikely(!inet_csk_reqsk_queue_hash_add(sk, req))) {
? ?reqsk_free(req);
? ?dst_release(dst);
return?0;
? }
? ? ? ? // 通過操作符發(fā)送SYN-ACK報(bào)文
? af_ops->send_synack(sk, dst, &fl, req, &foc,
? ? ? ? !want_cookie ? TCP_SYNACK_NORMAL :
? ? ? ? ?TCP_SYNACK_COOKIE,
? ? ? ? skb);
if?(want_cookie) {
? ?reqsk_free(req);
return?0;
? }
?}
// net/ipv4/inet_connection_sock.c
bool inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req)
{
? ? // 遞進(jìn)
if?(!reqsk_queue_hash_req(req))
returnfalse;
?inet_csk_reqsk_queue_added(sk);
returntrue;
}
// net/ipv4/inet_connection_sock.c
static bool reqsk_queue_hash_req(struct request_sock *req)
{
?bool found_dup_sk =?false;
? ? // 插入全局ehash表
if?(!inet_ehash_insert(req_to_sk(req), NULL, &found_dup_sk))
returnfalse;
?/* The timer needs to be setup after a successful insertion. */
?req->timeout = tcp_timeout_init((struct sock *)req);
?timer_setup(&req->rsk_timer, reqsk_timer_handler, TIMER_PINNED);
?mod_timer(&req->rsk_timer, jiffies + req->timeout);
?/* before letting lookups find us, make sure all req fields
? * are committed to memory and refcnt initialized.
? */
?smp_wmb();
?refcount_set(&req->rsk_refcnt, 2 + 1);
returntrue;
}
listen socket 的 struct sock 數據結構為 inet_connection_sock。
● 全連接隊列和半連接隊列最大長度: inet_connection_sock.icsk_inet.sk.sk_max_ack_backlog
● 全連接隊列: inet_connection_sock.icsk_accept_queue.rskq_accept_head
● 當前全連接隊列長度: inet_connection_sock.icsk_inet.sk.sk_ack_backlog
● 半連接隊列(哈希表): inet_hashinfo.inet_ehash_bucket
● 當前半連接隊列長度: inet_connection_sock.icsk_accept_queue.qlen
拓展思考
源碼如何體現request_sock向sock的轉變?
不存在直接的轉變,而是創建一個sock掛載到原來的request_sock下。
全連接隊列的sk_ack_backlog和sk_max_ack_backlog分別是什么意思?
sk_ack_backlog是當前全連接隊列中的元素數量,sk_max_ack_backlog是全連接隊列和半連接隊列的最大長度。
什么是FastOpen?FastOpen隊列有什么用?
用于進行TCP Fast Open(TFO)模式下的交互。這是Google提出的一種通信機制,在原本不攜帶數據信息的SYN請求中攜帶信息,進行快速的握手和通信。注意:TFO使用自己的Fast Open Cookie,與SYN Cookie是兩套相互獨立的機制。
夜雨聆風(fēng)
