深入理解Linux网络技术内幕——L4层协议与Raw IP的处理

我们简单了解下L4层协议和Raw IP是如何与IP层进行交互的。

L4层协议

L4层协议可以通过静态编译和模块配置两种方式加入内核。

比较重要的协议如TCP、UDP、ICMP通常是静态编译至内核。

一些不常用的或者比较特殊的协议，则是通过内核配置加入内核。如IGMP，SCTP，IPIP等等。

L4层协议的注册

L4层协议由net_protocol结构定义：

/*
 * Registration record for an L4 protocol.
 *
 * One instance of this structure is handed to inet_add_protocol() /
 * inet_del_protocol() for each protocol that sits on top of IPv4.
 */
struct net_protocol {
    /* Registered by the protocol; processes ingress packets. */
    int (*handler)(struct sk_buff *skb);

    /*
     * Used by the ICMP protocol handler: when an ICMP UNREACHABLE
     * message is received, this notifies the L4 protocol.
     */
    void (*err_handler)(struct sk_buff *skb, u32 info);

    /* gso_* callbacks. */
    int (*gso_send_check)(struct sk_buff *skb);
    struct sk_buff *(*gso_segment)(struct sk_buff *skb, int features);

    /* gro_* callbacks. */
    struct sk_buff **(*gro_receive)(struct sk_buff **head,
                                    struct sk_buff *skb);
    int (*gro_complete)(struct sk_buff *skb);

    unsigned int no_policy:1;   /* exempts the protocol from IPsec checks */
    unsigned int netns_ok:1;
};

协议会以inet_add_protocol进行注册，如果协议是以模块形式存在，可以通过inet_del_protocol进行除名（模块才有除名函数）

/*
 *  Add a protocol handler to the hash tables.
 *
 *  The slot index is the protocol number masked with
 *  (MAX_INET_PROTOS - 1).  The slot is claimed under inet_proto_lock.
 *
 *  Returns 0 when the handler was installed, or -1 when the slot is
 *  already occupied (the existing entry is left untouched).
 */

int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
{
    const int slot = protocol & (MAX_INET_PROTOS - 1);
    int err = -1;   /* assume the slot is taken until proven otherwise */

    spin_lock_bh(&inet_proto_lock);
    if (!inet_protos[slot]) {
        inet_protos[slot] = prot;
        err = 0;
    }
    spin_unlock_bh(&inet_proto_lock);

    return err;
}
/*
 *  Remove a protocol from the hash tables.
 *
 *  The slot is cleared only when it currently holds exactly @prot.
 *  synchronize_net() is called after the lock is released, whether or
 *  not an entry was actually removed.
 *
 *  Returns 0 when the entry was removed, or -1 when the slot did not
 *  hold @prot.
 */

int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol)
{
    const int slot = protocol & (MAX_INET_PROTOS - 1);
    int err = -1;   /* assume mismatch until the slot is confirmed */

    spin_lock_bh(&inet_proto_lock);
    if (inet_protos[slot] == prot) {
        inet_protos[slot] = NULL;
        err = 0;
    }
    spin_unlock_bh(&inet_proto_lock);

    synchronize_net();

    return err;
}

/* Export the protocol registration and removal entry points. */
EXPORT_SYMBOL(inet_add_protocol);
EXPORT_SYMBOL(inet_del_protocol);

我们看下具体的例子：

下面是TCP、UDP、ICMP协议结构体的初始化，这要先完成：

/*
 * TCP registration record: ingress handler, ICMP error callback and
 * the gso_* and gro_* callbacks.  no_policy is set, which exempts the
 * protocol from IPsec checks at this layer.
 */
static const struct net_protocol tcp_protocol = {
    .handler        = tcp_v4_rcv,
    .err_handler    = tcp_v4_err,
    .gso_send_check = tcp_v4_gso_send_check,
    .gso_segment    = tcp_tso_segment,
    .gro_receive    = tcp4_gro_receive,
    .gro_complete   = tcp4_gro_complete,
    .no_policy      = 1,
    .netns_ok       = 1,
};

/*
 * UDP registration record: ingress handler, ICMP error callback and
 * the two gso_* callbacks.  The gro_* callbacks are left unset.
 */
static const struct net_protocol udp_protocol = {
    .handler        = udp_rcv,
    .err_handler    = udp_err,
    .gso_send_check = udp4_ufo_send_check,
    .gso_segment    = udp4_ufo_fragment,
    .no_policy      = 1,
    .netns_ok       = 1,
};

/*
 * ICMP registration record: only the ingress handler is provided;
 * no err_handler and no gso_* or gro_* callbacks are set.
 */
static const struct net_protocol icmp_protocol = {
    .handler   = icmp_rcv,
    .no_policy = 1,
    .netns_ok  = 1,
};

协议结构体初始化结束后，在inet_init中，再把各个协议加入内核。

/*
 * Initialise the IPv4 stack.
 *
 * Registers the tcp/udp/raw protos, registers the inet socket family,
 * adds the base L4 protocol handlers (ICMP, UDP, TCP and, when
 * CONFIG_IP_MULTICAST is set, IGMP), then calls the init routines of
 * the individual subsystems in order.
 *
 * Returns 0 on success, or the non-zero value returned by the failing
 * proto_register() call.  Only proto_register() failures abort the
 * function; a failing icmp_init() panics and the other failures are
 * merely logged.
 */
static int __init inet_init(void)
{
    struct sk_buff *dummy_skb;  /* never assigned; used only inside sizeof below */
    struct inet_protosw *q;
    struct list_head *r;
    int rc = -EINVAL;           /* overwritten by the first proto_register() */

    /* Compile-time check: struct inet_skb_parm must fit in skb->cb. */
    BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb));

    /*
     * Register the three protos.  On failure, unwind the ones already
     * registered through the labels at the end of the function.
     */
    rc = proto_register(&tcp_prot, 1);
    if (rc)
        goto out;

    rc = proto_register(&udp_prot, 1);
    if (rc)
        goto out_unregister_tcp_proto; 

    rc = proto_register(&raw_prot, 1);
    if (rc)
        goto out_unregister_udp_proto; 

    /*
     *  Tell SOCKET that we are alive...
     */

    /* The return value is explicitly discarded by the (void) cast. */
    (void)sock_register(&inet_family_ops);
#ifdef CONFIG_SYSCTL
    ip_static_sysctl_init();
#endif

    /*
     *  Add all the base protocols.
     */

    /* A failed inet_add_protocol() is only logged; init continues. */
    if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
        printk(KERN_CRIT "inet_init: Cannot add ICMP protocol\n");
    if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
        printk(KERN_CRIT "inet_init: Cannot add UDP protocol\n");
    if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
        printk(KERN_CRIT "inet_init: Cannot add TCP protocol\n");
    /* IGMP is registered only when multicast support is configured. */
#ifdef CONFIG_IP_MULTICAST
    if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)
        printk(KERN_CRIT "inet_init: Cannot add IGMP protocol\n");
#endif

    /* Register the socket-side information for inet_create. */
    /* First initialise every list head in inetsw[0..SOCK_MAX-1] ... */
    for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
        INIT_LIST_HEAD(r);

    /* ... then register each entry of inetsw_array. */
    for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
        inet_register_protosw(q);

    /*
     *  Set the ARP module up
     */

    arp_init();

    /*
     *  Set the IP module up
     */

    ip_init();

    tcp_v4_init();

    /* Setup TCP slab cache for open requests. */
    tcp_init();

    /* Setup UDP memory threshold */
    udp_init();

    /* Add UDP-Lite (RFC 3828) */
    udplite4_register();
    /*
     *  Set the ICMP layer up
     */

    /* Unlike the other steps, an icmp_init() failure is fatal. */
    if (icmp_init() < 0)
        panic("Failed to create the ICMP control socket.\n");

    /*
     *  Initialise the multicast router
     */
#if defined(CONFIG_IP_MROUTE)
    if (ip_mr_init())
        printk(KERN_CRIT "inet_init: Cannot init ipv4 mroute\n");
#endif
    /*
     *  Initialise per-cpu ipv4 mibs
     */

    if (init_ipv4_mibs())
        printk(KERN_CRIT "inet_init: Cannot init ipv4 mibs\n");

    ipv4_proc_init();

    ipfrag_init();

    /* Register ip_packet_type via dev_add_pack(). */
    dev_add_pack(&ip_packet_type);

    rc = 0;
out:
    return rc;
    /* Error unwinding: labels fall through so each undoes earlier steps. */
out_unregister_udp_proto:
    proto_unregister(&udp_prot);
out_unregister_tcp_proto:
    proto_unregister(&tcp_prot);
    goto out;
}

从代码中可以看出，TCP、UDP、ICMP等是直接静态编译至内核、并在inet_init中无条件注册的。而IGMP的注册代码位于#ifdef CONFIG_IP_MULTICAST之内，只有内核配置了组播时才会被编译并注册进内核（它同样是在inet_init中静态注册的，而不是以模块方式加载）。

L3到L4的封包传递：ip_local_deliver_finish

Raw套接字和Raw IP

我们要知道，并不是所有的L4层处理，都是在内核实现的。应用程序可以通过Raw 套接字和Raw IP跳过L4层协议，直接与IP层进行交互。

时间： 2024-11-10 15:30:25

深入理解Linux网络技术内幕——L4层协议与Raw IP的处理的相关文章

深入理解Linux网络技术内幕——PCI层和网络接口卡

概述内核的PCI子系统(即PCI层)提供了不同设备一些通用的功能,以便简化各种设备驱动程序. PCI层重要结构体如下: pci_device_id 设备标识,根据PCI标志定义的ID,而不是Linux本地的. pci_dev 类似于网络设备的net_device.每个PCI设备会被分配一个pci_dev实例. pci_driver PCI层和设备驱动程序之间的接口.主要由一些函数指针组成.如下所示: struct pci_driver { struct list_head node; char *

深入理解Linux网络技术内幕——IPv4 报文的传输发送

报文传输,指的是报文离开本机,发往其他系统的过程. 传输可以由L4层协议发起,也可以由报文转发发起. 在深入理解Linux网络技术内幕--IPv4 报文的接收(转发与本地传递)一文中,我们可以看到,报文转发最后会调用dst_output与邻居子系统进行交互,然后传给设备驱动程序. 这里,我们从L4层协议发起的传输,最后也会经历这一过程(调用dst_output).本文讨论的是L4层协议发起的传输,在IPv4协议处理(IP层)中的一些环节. 大蓝图我们先看下传输环节的大蓝图,以便对传输这一过程有

深入理解Linux网络技术内幕——IPv4 概念

1.大蓝图 大蓝图展示了IPv4协议与其他子系统之间的联系,包括设备驱动、Netfilter、L4 层协议等之间的互动. IPv4协议中的报文我们可以大致看出数据在IPv4协议中的流向, 接收报文设备驱动处理完硬件接收到的数据后,IPv4协议的ip_rcv函数(netif_receive_skb调用)得到了属于IPv4的报文,接着调用ip_rcv_finish对报文进行分析.判断是该转发还是交付本地上层协议. 如果是本地报文,则传给ip_local_deliver处理,如果是转发,那就交付ip_

深入理解Linux网络技术内幕——IPv4 报文的接收（转发与本地传递）

我们知道,报文经过网卡驱动处理后,调用netif_receive_skb传递给具体的协议处理函数,对于IPv4报文来说,其协议处理函数就是ip_rcv了,ip_rcv在进行一些健康检查等操作后,会调用ip_rcv_finish来处理报文.这也是IPv4协议对报文接收处理的开始. 我们先看下ip_rcv_finish源代码: ip_rcv_finish: //ip数据报文的主要处理程序(ip_rcv仅仅只是对ip数据报做一些健康性检查) //ip_rcv_finish 其实是进行路由表查询,决定报文

深入理解Linux网络技术内幕——路由子系统的概念与高级路由

本文讨论IPv4的路由子系统.(IPv6对路由的处理不同). 基本概念路由子系统工作在三层,用来转发入口流量. 路由子系统主要涉及路由器、路由、路由表等概念. 路由器: 配备多个网络接口卡(NIC),并且能利用自身网络信息进行入口流量转发的设备. 路由: 流量转发,决定目的地的过程. 路由表:转发信息库,该库中储存路由需要本地接收还是转发的信息, 以及转发流量时所需要的信息.(即,信息库用来判断,要不要转发,如果要转发,向哪里转发). 我们了解,路由器有多个网卡,但是多个NIC的设备不一定就是

《深入理解Linux网络技术内幕》阅读笔记 --- 路由

一.Linux内核中路由相关的主要数据结构 struct fib_result:对路由表查找后返回该结构,它的内容并不是简单的包含下一跳信息,而且包含其他特性,例如策略路由所需的更多参数. struct fib_rule:表示由策略路由在路由流量时选择路由表的规则 struct fib_node:一条路由表项.例如,该数据结构用于存储由route add或ip route add命令添加一条路由时生成的信息. struct fn_zone:一个zone表示子网掩码长度相同的一组路由 struct

深入理解Linux网络技术内幕——帧的接收与传输

帧的接收 NAPI与netif_rx(非NAPI) Linux内核获取网络帧到达通知的方式有两种:中断和轮询.(中断指设备向内核发出中断,轮询指linux内核主动轮询设备) 在早期的linux内核中,网络帧主要以中断的方式通知linux内核帧的到达.这是非NAPI方式. 现在的操作系统中,linux内核使用NAPI方式, 获取帧到达的消息.NAPI混合使用了中断和轮询. netif_rx(非NAPI): 每一个帧接收完毕时,设备向内核发送一个中断.(在低流量负载的情况下,这种方式对比轮询优势明显

深入理解Linux网络技术内幕——网络设备初始化

概述内核的初始化过程中,与网络相关的工作如下所示: 内核引导时执行start_kernel,start_kernel结束之前会调用rest_init,rest_init初始化内核线程init(在Linux3-12中为kernel_init). asmlinkage void __init start_kernel(void) { ... parse_early_param();//间接调用parse_args parse_args(...); //处理内核引导程序(boot loader)

深入理解Linux网络技术内幕——IPv4 分段与重组

封包的分段和重组是IP协议最重要的工作之一. IPv4报头中有一个len字段(用于表示报文的总长度,单位:字节)占16bit,因此,封包的最大尺寸定义为64K,(2^16/1024=64). 但是,在实际网络传输中,没有几个网络接口能够传输64K这么大的封包,而是有一个MTU表示其最大传输单元.这样,当要传输的封包大于MTU时,就需要对封包进行分段. 这里需要说明,我们指的MTU,不仅仅是出口设备的MTU,它取决于很多因素,如路由表项所用的MTU、出口设备的MTU等. 我们先不用过多地关注MTU