sk_buff结构用来描述已接收或者待发送的数据报文信息;skb在不同网络协议层之间传递,可被用于不同网络协议,如二层的mac或其他链路层协议,三层的ip,四层的tcp或者udp协议,其中某些成员变量会在该结构从一层向另一层传递时发生改变,从上层向下层传递需要添加首部,从下层向上层传递需要移除首部;
1 /* skb头结构 */ 2 struct sk_buff_head { 3 /* These two members must be first. */ 4 /* 通过下面两个指针成员将skb连接成双向链表 */ 5 struct sk_buff *next; /* 指向后一个skb */ 6 struct sk_buff *prev; /* 指向前一个skb */ 7 8 __u32 qlen; /* 链表中元素个数 */ 9 spinlock_t lock; /* 自旋锁 */ 10 };
1 /** 2 * struct sk_buff - socket buffer 3 * @next: Next buffer in list 4 * @prev: Previous buffer in list 5 * @tstamp: Time we arrived/left 6 * @rbnode: RB tree node, alternative to next/prev for netem/tcp 7 * @sk: Socket we are owned by 8 * @dev: Device we arrived on/are leaving by 9 * @cb: Control buffer. Free for use by every layer. Put private vars here 10 * @_skb_refdst: destination entry (with norefcount bit) 11 * @sp: the security path, used for xfrm 12 * @len: Length of actual data 13 * @data_len: Data length 14 * @mac_len: Length of link layer header 15 * @hdr_len: writable header length of cloned skb 16 * @csum: Checksum (must include start/offset pair) 17 * @csum_start: Offset from skb->head where checksumming should start 18 * @csum_offset: Offset from csum_start where checksum should be stored 19 * @priority: Packet queueing priority 20 * @ignore_df: allow local fragmentation 21 * @cloned: Head may be cloned (check refcnt to be sure) 22 * @ip_summed: Driver fed us an IP checksum 23 * @nohdr: Payload reference only, must not modify header 24 * @pkt_type: Packet class 25 * @fclone: skbuff clone status 26 * @ipvs_property: skbuff is owned by ipvs 27 * @tc_skip_classify: do not classify packet. set by IFB device 28 * @tc_at_ingress: used within tc_classify to distinguish in/egress 29 * @tc_redirected: packet was redirected by a tc action 30 * @tc_from_ingress: if tc_redirected, tc_at_ingress at time of redirect 31 * @peeked: this packet has been seen already, so stats have been 32 * done for it, don‘t do them again 33 * @nf_trace: netfilter packet trace flag 34 * @protocol: Packet protocol from driver 35 * @destructor: Destruct function 36 * @_nfct: Associated connection, if any (with nfctinfo bits) 37 * @nf_bridge: Saved data about a bridged frame - see br_netfilter.c 38 * @skb_iif: ifindex of device we arrived on 39 * @tc_index: Traffic control index 40 * @hash: the packet hash 41 * @queue_mapping: Queue mapping for multiqueue devices 42 * @xmit_more: More SKBs are pending for this queue 43 * @ndisc_nodetype: router type (from link layer) 44 * @ooo_okay: allow the mapping of a socket to a queue to be changed 45 * @l4_hash: indicate hash is a canonical 4-tuple hash over transport 46 * ports. 47 * @sw_hash: indicates hash was computed in software stack 48 * @wifi_acked_valid: wifi_acked was set 49 * @wifi_acked: whether frame was acked on wifi or not 50 * @no_fcs: Request NIC to treat last 4 bytes as Ethernet FCS 51 * @dst_pending_confirm: need to confirm neighbour 52 * @napi_id: id of the NAPI struct this skb came from 53 * @secmark: security marking 54 * @mark: Generic packet mark 55 * @vlan_proto: vlan encapsulation protocol 56 * @vlan_tci: vlan tag control information 57 * @inner_protocol: Protocol (encapsulation) 58 * @inner_transport_header: Inner transport layer header (encapsulation) 59 * @inner_network_header: Network layer header (encapsulation) 60 * @inner_mac_header: Link layer header (encapsulation) 61 * @transport_header: Transport layer header 62 * @network_header: Network layer header 63 * @mac_header: Link layer header 64 * @tail: Tail pointer 65 * @end: End pointer 66 * @head: Head of buffer 67 * @data: Data head pointer 68 * @truesize: Buffer size 69 * @users: User count - see {datagram,tcp}.c 70 */ 71 /* skb结构 */ 72 struct sk_buff { 73 union { 74 struct { 75 /* These two members must be first. */ 76 struct sk_buff *next; 77 struct sk_buff *prev; 78 79 /* 报文到达或者离开的时间戳 */ 80 union { 81 ktime_t tstamp; 82 struct skb_mstamp skb_mstamp; 83 }; 84 }; 85 struct rb_node rbnode; /* used in netem & tcp stack */ 86 }; 87 88 /* 89 指向缓冲区的套接字sock数据结构。当数据在本地产生或者正由本地进程接收时, 90 该数据以及套接字相关信息会被L4(tcp或者udp)以及用户应用程序使用 91 当缓冲区只是被转发时(本地机器不是来源也不是目的地),该指针为NULL 92 */ 93 struct sock *sk; 94 95 union { 96 /* 报文到达或者离开时的网络设备 */ 97 struct net_device *dev; 98 /* Some protocols might use this space to store information, 99 * while device pointer would be NULL. 100 * UDP receive path is one user. 101 */ 102 unsigned long dev_scratch; 103 }; 104 /* 105 * This is the control buffer. It is free to use for every 106 * layer. Please put your private variables there. If you 107 * want to keep them across layers you have to do a skb_clone() 108 * first. This is owned by whoever has the skb queued ATM. 109 */ 110 /* 111 控制缓冲区,用于存储私有信息,每层协议自己维护并使用, 112 并且只在本层有有效 113 */ 114 char cb[48] __aligned(8); 115 116 /* 路由缓存,输入或者输出报文都要查询到目的路由缓存项,才能确定流向 */ 117 unsigned long _skb_refdst; 118 119 /* 120 当缓冲区被删除时,可以完成某些清理工作 121 当缓冲区不属于一个套接字时,该函数通常不被初始化 122 属于一个套接字时,通常设置为sock_rfree或sock_wfree 123 sock_xxx函数用于更新套接字队列中所持有的内存 124 */ 125 void (*destructor)(struct sk_buff *skb); 126 #ifdef CONFIG_XFRM 127 struct sec_path *sp; 128 #endif 129 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) 130 unsigned long _nfct; 131 #endif 132 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) 133 struct nf_bridge_info *nf_bridge; 134 #endif 135 /* 136 缓冲区的数据区块大小,该长度包括主缓冲区(head指针指向)的数据 137 以及一些片段(fragment)的数据,当缓冲区从一个网络分层移动到下一个 138 网络分层时,该值会发生变化,因为在协议栈中向上层移动时报头会被丢弃 139 向下层移动时报头会添加,len也会把协议报头算在内,与"数据预留和对齐"操作 140 */ 141 unsigned int len, 142 /* 片段(fragment)中的数据大小 */ 143 data_len; 144 /* mac报头大小 */ 145 __u16 mac_len, 146 /* 克隆skb时可写报文头部长度 */ 147 hdr_len; 148 149 /* Following fields are _not_ copied in __copy_skb_header() 150 * Note that queue_mapping is here mostly to fill a hole. 151 */ 152 kmemcheck_bitfield_begin(flags1); 153 __u16 queue_mapping; 154 155 /* if you move cloned around you also must adapt those constants */ 156 #ifdef __BIG_ENDIAN_BITFIELD 157 #define CLONED_MASK (1 << 7) 158 #else 159 #define CLONED_MASK 1 160 #endif 161 #define CLONED_OFFSET() offsetof(struct sk_buff, __cloned_offset) 162 163 __u8 __cloned_offset[0]; 164 /* 表示该skb是另外一个skb的克隆 */ 165 __u8 cloned:1, 166 /* 167 payload是否被单独引用,不存在协议首部,如果被引用,则不能修改协议首部,也不能通过skb->data来访问协议首部 168 */ 169 nohdr:1, 170 /* 171 当前克隆状态 172 SKB_FCLONE_UNAVAILABLE-skb未被克隆 173 SKB_FCLONE_ORIG-在skbuff_fclone_cache分配的父skb,可以被克隆 174 SKB_FCLONE_CLONE-在skbuff_fclone_cache分配的子skb,从父skb克隆得到 175 */ 176 fclone:2, 177 peeked:1, 178 head_frag:1, 179 xmit_more:1, 180 __unused:1; /* one bit hole */ 181 kmemcheck_bitfield_end(flags1); 182 183 /* fields enclosed in headers_start/headers_end are copied 184 * using a single memcpy() in __copy_skb_header() 185 */ 186 /* private: */ 187 __u32 headers_start[0]; 188 /* public: */ 189 190 /* if you move pkt_type around you also must adapt those constants */ 191 #ifdef __BIG_ENDIAN_BITFIELD 192 #define PKT_TYPE_MAX (7 << 5) 193 #else 194 #define PKT_TYPE_MAX 7 195 #endif 196 #define PKT_TYPE_OFFSET() offsetof(struct sk_buff, __pkt_type_offset) 197 198 __u8 __pkt_type_offset[0]; 199 /* 200 此字段根据l2的目的地址进行划分 201 PACKET_HOST-mac地址与接收设备mac地址相等,说明是发给该主机的 202 PACKET_BROADCAST-mac地址是接收设备的广播地址 203 PACKET_MULTICAST-mac地址接收改设备注册的多播地址之一 204 PACKET_OTHERHOST-mac地址不属于接收设备的地址,启用转发则转发,否则丢弃 205 PACKET_OUTGOING-数据包将被发出,用到这个标记的功能包括decnet, 206 或者为每个网络tab都复制一份发出包的函数 207 PACKET_LOOPBACK-数据包发往回环设备,有此标识,处理回环设备时, 208 可以跳过一些真实设备所需的操作 209 PACKET_USER-发送到用户空间,netlink使用 210 PACKET_KERNEL-发送到内核空间,netlink使用 211 PACKET_FASTROUTE-未使用 212 */ 213 __u8 pkt_type:3; 214 __u8 pfmemalloc:1; 215 __u8 ignore_df:1; 216 217 __u8 nf_trace:1; 218 /* 219 CHECKSUM_NONE-硬件不支持,完全由软件执行校验和 220 CHECKSUM_PARTIAL-由硬件来执行校验和 221 CHECKSUM_UNNECESSARY-没必要执行校验和 222 CHECKSUM_COMPLETE-已完成执行校验和 223 */ 224 __u8 ip_summed:2; 225 __u8 ooo_okay:1; 226 __u8 l4_hash:1; 227 __u8 sw_hash:1; 228 __u8 wifi_acked_valid:1; 229 __u8 wifi_acked:1; 230 231 __u8 no_fcs:1; 232 /* Indicates the inner headers are valid in the skbuff. */ 233 __u8 encapsulation:1; 234 __u8 encap_hdr_csum:1; 235 __u8 csum_valid:1; 236 __u8 csum_complete_sw:1; 237 __u8 csum_level:2; 238 __u8 csum_bad:1; 239 240 __u8 dst_pending_confirm:1; 241 #ifdef CONFIG_IPV6_NDISC_NODETYPE 242 __u8 ndisc_nodetype:2; 243 #endif 244 __u8 ipvs_property:1; 245 __u8 inner_protocol_type:1; 246 __u8 remcsum_offload:1; 247 #ifdef CONFIG_NET_SWITCHDEV 248 __u8 offload_fwd_mark:1; 249 #endif 250 #ifdef CONFIG_NET_CLS_ACT 251 __u8 tc_skip_classify:1; 252 __u8 tc_at_ingress:1; 253 __u8 tc_redirected:1; 254 __u8 tc_from_ingress:1; 255 #endif 256 257 #ifdef CONFIG_NET_SCHED 258 __u16 tc_index; /* traffic control index */ 259 #endif 260 261 union { 262 /* 校验和,必须包含csum_start和csum_offset */ 263 __wsum csum; 264 struct { 265 /* 校验开始位置,相对于header */ 266 __u16 csum_start; 267 /* 校验和存储位置,相对于csum_start */ 268 __u16 csum_offset; 269 }; 270 }; 271 /* 272 正在被传输的数据包QoS等级 273 数据包由本地产生,套接字会定义优先级的值 274 数据包在被转发,则在调用ip_forward函数时,会根据 275 ip头本身的ToS字段定义该值 276 */ 277 __u32 priority; 278 int skb_iif; 279 __u32 hash; 280 __be16 vlan_proto; 281 __u16 vlan_tci; 282 #if defined(CONFIG_NET_RX_BUSY_POLL) || defined(CONFIG_XPS) 283 union { 284 unsigned int napi_id; 285 unsigned int sender_cpu; 286 }; 287 #endif 288 #ifdef CONFIG_NETWORK_SECMARK 289 __u32 secmark; 290 #endif 291 292 union { 293 __u32 mark; 294 __u32 reserved_tailroom; 295 }; 296 297 /* 封装的协议 */ 298 union { 299 __be16 inner_protocol; 300 __u8 inner_ipproto; 301 }; 302 /* 封装的传输层头部相对于head的偏移 */ 303 __u16 inner_transport_header; 304 /* 封装的网络层头部相对于head的偏移 */ 305 __u16 inner_network_header; 306 /* 封装的链路层头部相对于head的偏移 */ 307 __u16 inner_mac_header; 308 309 /* 310 l3层协议值 311 如ETH_P_IP-ipv4报文 312 ETH_P_ARP-arp报文等 313 */ 314 __be16 protocol; 315 /* 传输层头部相对于head的偏移 */ 316 __u16 transport_header; 317 /* 网络层头部相对于head的偏移 */ 318 __u16 network_header; 319 /* 链路层头部相对于head的偏移 */ 320 __u16 mac_header; 321 322 /* private: */ 323 __u32 headers_end[0]; 324 /* public: */ 325 326 /* These elements must be at the end, see alloc_skb() for details. */ 327 /* 实际数据的尾部 */ 328 sk_buff_data_t tail; 329 /* 缓冲区的尾部 */ 330 sk_buff_data_t end; 331 /* 缓冲区的头部 */ 332 unsigned char *head, 333 /* 实际数据的头部 */ 334 *data; 335 /* 336 缓冲区的总大小,包括skb本身和实际数据len大小,alloc_skb函数将 337 该字段设置为len+sizeof(sk_buff) 338 每当len值更新,该值也要对应更新 339 */ 340 unsigned int truesize; 341 342 /* 343 引用计数,在使用该skb缓冲区的实例个数,当引用计数为0时,skb才能被释放 344 skb_get()获取操作中会增加引用计数,kfree_skb释放过程中检查引用计数, 345 引用计数为0时,才真正释放skb 346 该计数器只计算sk_buff结构引用计数,缓冲区包含的实际数据由 347 skb_shared_info->dataref字段记录 348 */ 349 atomic_t users; 350 };
时间: 2024-10-22 09:29:31