Each CPU has its own softnet_data instance, which holds the packet input and output queues used by network interrupt (softirq) handling. On the output path, the output_queue and completion_queue fields of softnet_data are used.
/*
 * Incoming packets are placed on per-cpu queues so that
 * no locking is needed.
 */
struct softnet_data {
	struct Qdisc		*output_queue;
	struct sk_buff_head	input_pkt_queue;
	struct list_head	poll_list;
	struct sk_buff		*completion_queue;

	struct napi_struct	backlog;
};
struct Qdisc {
	int			(*enqueue)(struct sk_buff *skb, struct Qdisc *dev);
	struct sk_buff *	(*dequeue)(struct Qdisc *dev);
	unsigned		flags;
#define TCQ_F_BUILTIN		1
#define TCQ_F_THROTTLED		2
#define TCQ_F_INGRESS		4
#define TCQ_F_CAN_BYPASS	8
#define TCQ_F_MQROOT		16
#define TCQ_F_WARN_NONWC	(1 << 16)
	int			padded;
	struct Qdisc_ops	*ops;
	struct qdisc_size_table	*stab;
	struct list_head	list;
	u32			handle;
	u32			parent;
	atomic_t		refcnt;
	struct gnet_stats_rate_est	rate_est;
	int			(*reshape_fail)(struct sk_buff *skb,
						struct Qdisc *q);
	void			*u32_node;

	/* This field is deprecated, but it is still used by CBQ
	 * and it will live until better solution will be invented.
	 */
	struct Qdisc		*__parent;
	struct netdev_queue	*dev_queue;
	struct Qdisc		*next_sched;
	struct sk_buff		*gso_skb;
	/*
	 * For performance sake on SMP, we put highly modified fields at the end
	 */
	unsigned long		state;
	struct sk_buff_head	q;
	struct gnet_stats_basic_packed bstats;
	struct gnet_stats_queue	qstats;
};
Figure: the relationship between softnet_data, the interface layer, and the network layer (viewed from the output path)
Output interfaces
dev_queue_xmit()
The flow of dev_queue_xmit() is roughly as follows: if the output device supports traffic control, the packet to be sent is enqueued on that device's queue according to the queuing discipline, and the network transmit softirq is raised at an appropriate time to dequeue packets one by one and send them out through the device. If the device does not support traffic control (it has no queue), the packet is handed to the device directly.
If submission fails, a corresponding error code is returned. A successful return, however, does not guarantee that the data was actually transmitted, because the traffic-control machinery may still drop the packet due to congestion.
A precondition for calling dev_queue_xmit() to transmit a packet is that interrupts are enabled: only with IRQs enabled can the bottom half (softirq) be activated without risking a deadlock in the BH-enable path.
/**
 *	dev_queue_xmit - transmit a buffer
 *	@skb: buffer to transmit
 *
 *	Queue a buffer for transmission to a network device. The caller must
 *	have set the device and priority and built the buffer before calling
 *	this function. The function can be called from an interrupt.
 *
 *	A negative errno code is returned on a failure. A success does not
 *	guarantee the frame will be transmitted as it may be dropped due
 *	to congestion or traffic shaping.
 *
 * -----------------------------------------------------------------------------------
 *      I notice this method can also return errors from the queue disciplines,
 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
 *      be positive.
 *
 *      Regardless of the return value, the skb is consumed, so it is currently
 *      difficult to retry a send to this method.  (You can bump the ref count
 *      before sending to hold a reference for retry if you are careful.)
 *
 *      When calling this method, interrupts MUST be enabled.  This is because
 *      the BH enable code must have IRQs enabled so that it will not deadlock.
 *          --BLG
 */
int dev_queue_xmit(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct netdev_queue *txq;
	struct Qdisc *q;
	int rc = -ENOMEM;

	/* GSO will handle the following emulations directly. */
	if (netif_needs_gso(dev, skb))
		goto gso;

	if (skb_has_frags(skb) &&
	    !(dev->features & NETIF_F_FRAGLIST) &&
	    __skb_linearize(skb))
		goto out_kfree_skb;

	/* Fragmented skb is linearized if device does not support SG,
	 * or if at least one of fragments is in highmem and device
	 * does not support DMA from it.
	 */
	if (skb_shinfo(skb)->nr_frags &&
	    (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
	    __skb_linearize(skb))
		goto out_kfree_skb;

	/* If packet is not checksummed and device does not support
	 * checksumming for this protocol, complete checksumming here.
	 */
	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		skb_set_transport_header(skb, skb->csum_start -
					      skb_headroom(skb));
		if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
			goto out_kfree_skb;
	}

gso:
	/* Disable soft irqs for various locks below. Also
	 * stops preemption for RCU.
	 */
	rcu_read_lock_bh();

	txq = dev_pick_tx(dev, skb);
	q = rcu_dereference(txq->qdisc);

#ifdef CONFIG_NET_CLS_ACT
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
#endif
	if (q->enqueue) {
		rc = __dev_xmit_skb(skb, q, dev, txq);
		goto out;
	}

	/* The device has no queue. Common case for software devices:
	   loopback, all the sorts of tunnels...

	   Really, it is unlikely that netif_tx_lock protection is necessary
	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
	   counters.)
	   However, it is possible, that they rely on protection
	   made by us here.

	   Check this and shot the lock. It is not prone from deadlocks.
	   Either shot noqueue qdisc, it is even simpler 8)
	 */
	if (dev->flags & IFF_UP) {
		int cpu = smp_processor_id(); /* ok because BHs are off */

		if (txq->xmit_lock_owner != cpu) {

			HARD_TX_LOCK(dev, txq, cpu);

			if (!netif_tx_queue_stopped(txq)) {
				rc = NET_XMIT_SUCCESS;
				if (!dev_hard_start_xmit(skb, dev, txq)) {
					HARD_TX_UNLOCK(dev, txq);
					goto out;
				}
			}
			HARD_TX_UNLOCK(dev, txq);
			if (net_ratelimit())
				printk(KERN_CRIT "Virtual device %s asks to "
				       "queue packet!\n", dev->name);
		} else {
			/* Recursion is detected! It is possible,
			 * unfortunately */
			if (net_ratelimit())
				printk(KERN_CRIT "Dead loop on virtual device "
				       "%s, fix it urgently!\n", dev->name);
		}
	}

	rc = -ENETDOWN;
	rcu_read_unlock_bh();

out_kfree_skb:
	kfree_skb(skb);
	return rc;
out:
	rcu_read_unlock_bh();
	return rc;
}
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
			struct netdev_queue *txq)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int rc;

	if (likely(!skb->next)) {
		if (!list_empty(&ptype_all))
			dev_queue_xmit_nit(skb, dev);

		if (netif_needs_gso(dev, skb)) {
			if (unlikely(dev_gso_segment(skb)))
				goto out_kfree_skb;
			if (skb->next)
				goto gso;
		}

		/*
		 * If device doesnt need skb->dst, release it right now while
		 * its hot in this cpu cache
		 */
		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
			skb_dst_drop(skb);

		rc = ops->ndo_start_xmit(skb, dev);
		if (rc == NETDEV_TX_OK)
			txq_trans_update(txq);
		/*
		 * TODO: if skb_orphan() was called by
		 * dev->hard_start_xmit() (for example, the unmodified
		 * igb driver does that; bnx2 doesn't), then
		 * skb_tx_software_timestamp() will be unable to send
		 * back the time stamp.
		 *
		 * How can this be prevented? Always create another
		 * reference to the socket before calling
		 * dev->hard_start_xmit()? Prevent that skb_orphan()
		 * does anything in dev->hard_start_xmit() by clearing
		 * the skb destructor before the call and restoring it
		 * afterwards, then doing the skb_orphan() ourselves?
		 */
		return rc;
	}

gso:
	do {
		struct sk_buff *nskb = skb->next;

		skb->next = nskb->next;
		nskb->next = NULL;

		/*
		 * If device doesnt need nskb->dst, release it right now while
		 * its hot in this cpu cache
		 */
		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
			skb_dst_drop(nskb);

		rc = ops->ndo_start_xmit(nskb, dev);
		if (unlikely(rc != NETDEV_TX_OK)) {
			nskb->next = skb->next;
			skb->next = nskb;
			return rc;
		}
		txq_trans_update(txq);
		if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
			return NETDEV_TX_BUSY;
	} while (skb->next);

	skb->destructor = DEV_GSO_CB(skb)->destructor;

out_kfree_skb:
	kfree_skb(skb);
	return NETDEV_TX_OK;
}
e100_xmit_frame() is the ndo_start_xmit implementation for the e100 network device; it ultimately hands the packet to the hardware for transmission.
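For reference, a minimal sketch of how a driver wires its transmit routine into net_device_ops so that ops->ndo_start_xmit() above reaches it. The e100 driver does register e100_xmit_frame this way, but the function body and the exact set of ops fields shown here are simplified illustrations, not the real e100 code:

/* Sketch only: how a driver exposes its transmit routine.
 * The body is illustrative; a real implementation maps the skb
 * for DMA, posts it to the TX ring and kicks the hardware.
 */
static netdev_tx_t e100_xmit_frame(struct sk_buff *skb,
				   struct net_device *netdev)
{
	/* ... queue skb on the hardware TX ring ... */
	return NETDEV_TX_OK;	/* or NETDEV_TX_BUSY if the ring is full */
}

static const struct net_device_ops e100_netdev_ops = {
	.ndo_open	= e100_open,
	.ndo_stop	= e100_close,
	.ndo_start_xmit	= e100_xmit_frame,	/* reached via ops->ndo_start_xmit() */
};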
Network transmit softirq
There are several interfaces for raising the packet transmit softirq; __netif_schedule() is one of the most commonly used.
static inline void __netif_reschedule(struct Qdisc *q)
{
	struct softnet_data *sd;
	unsigned long flags;

	local_irq_save(flags);
	sd = &__get_cpu_var(softnet_data);
	q->next_sched = sd->output_queue;
	sd->output_queue = q;
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_restore(flags);
}

void __netif_schedule(struct Qdisc *q)
{
	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
		__netif_reschedule(q);
}
If the output device's qdisc is not already scheduled for traffic-control processing (__QDISC_STATE_SCHED is not set), the qdisc is linked onto the output_queue list in softnet_data and the network transmit softirq is raised to process that queue; a typical caller is sketched below.
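For context, a common path into __netif_schedule() is netif_schedule_queue(), shown here roughly as it appears in 2.6-era kernels (details vary across versions): the qdisc is scheduled only if the driver has not stopped the TX queue.

static inline void netif_schedule_queue(struct netdev_queue *txq)
{
	/* Skip scheduling while the driver has stopped this TX queue,
	 * e.g. because the hardware ring is full. */
	if (!test_bit(__QUEUE_STATE_XOFF, &txq->state))
		__netif_schedule(txq->qdisc);
}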
net_tx_action()
net_tx_action() is the packet transmit softirq handler. Once the softirq is raised, it walks the output devices (qdiscs) pending on the output_queue list and calls qdisc_run() on each of them to send packets at an appropriate time; a sketch of qdisc_run() follows the listing below.
static void net_tx_action(struct softirq_action *h)
{
	struct softnet_data *sd = &__get_cpu_var(softnet_data);

	if (sd->completion_queue) {
		struct sk_buff *clist;

		local_irq_disable();
		clist = sd->completion_queue;
		sd->completion_queue = NULL;
		local_irq_enable();

		while (clist) {
			struct sk_buff *skb = clist;
			clist = clist->next;

			WARN_ON(atomic_read(&skb->users));
			__kfree_skb(skb);
		}
	}

	if (sd->output_queue) {
		struct Qdisc *head;

		local_irq_disable();
		head = sd->output_queue;
		sd->output_queue = NULL;
		local_irq_enable();

		while (head) {
			struct Qdisc *q = head;
			spinlock_t *root_lock;

			head = head->next_sched;

			root_lock = qdisc_lock(q);
			if (spin_trylock(root_lock)) {
				smp_mb__before_clear_bit();
				clear_bit(__QDISC_STATE_SCHED, &q->state);
				qdisc_run(q);
				spin_unlock(root_lock);
			} else {
				if (!test_bit(__QDISC_STATE_DEACTIVATED,
					      &q->state)) {
					__netif_reschedule(q);
				} else {
					smp_mb__before_clear_bit();
					clear_bit(__QDISC_STATE_SCHED,
						  &q->state);
				}
			}
		}
	}
}
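qdisc_run(), called above for each pending qdisc, is a thin wrapper; roughly as in 2.6-era kernels (shown for context, exact form differs in later versions): it keeps two CPUs from running the same qdisc at once, and __qdisc_run() then dequeues and transmits packets, rescheduling the qdisc via __netif_schedule() if it has to yield before the queue is empty.

static inline void qdisc_run(struct Qdisc *q)
{
	/* Only one CPU may run a given qdisc at a time; __qdisc_run()
	 * repeatedly dequeues packets and hands them to the driver. */
	if (!test_and_set_bit(__QDISC_STATE_RUNNING, &q->state))
		__qdisc_run(q);
}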
TSO/GSO
TSO (TCP Segmentation Offload)
A dedicated processor on the network device handles part or all of the packet processing, thereby reducing the load on the host CPU.
TSO is a technique that improves network performance by letting the network device perform TCP segmentation. With large sends, it reduces the amount of per-packet work the operating system must do, and performance improves accordingly.
Normally, when a large amount of data is to be sent, the TCP sender must split it into MSS-sized chunks and encapsulate each chunk as a packet before it can be transmitted on the network. With TSO enabled, the TCP sender can instead split the data into chunks that are an integer multiple of the MSS and hand these large segments directly to the network device for segmentation; the operating system then has to create and transmit far fewer packets, so performance improves considerably.
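As a rough, self-contained illustration (the MSS and chunk size below are assumed example values, not figures from the text): one 64 KB hand-off to a TSO-capable device replaces roughly 45 packets that the host would otherwise have had to build itself.

#include <stdio.h>

int main(void)
{
	unsigned int mss = 1460;		/* assumed typical Ethernet MSS */
	unsigned int chunk = 64 * 1024;		/* one large TSO hand-off */
	unsigned int segs = (chunk + mss - 1) / mss;	/* round up */

	/* Prints 45: the NIC emits ~45 MSS-sized segments from one chunk,
	 * so the host builds 1 large buffer instead of 45 packets. */
	printf("%u segments per %u-byte chunk\n", segs, chunk);
	return 0;
}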
TSO applies only to TCP, giving TCP strong hardware support. In fact, the same concept can be extended to other transport protocols, such as TCP over IPv6 and UDP; this generalization is GSO (Generic Segmentation Offload).
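This is also why dev_queue_xmit() above starts with a netif_needs_gso() check. Roughly as in 2.6-era kernels (the helper is shown for illustration; exact definitions vary across versions), segmentation stays in hardware only when the device's feature flags (NETIF_F_TSO, NETIF_F_TSO6, NETIF_F_UFO, ...) cover the skb's GSO type; otherwise the kernel falls back to software segmentation in dev_gso_segment().

static inline int netif_needs_gso(struct net_device *dev,
				  struct sk_buff *skb)
{
	/* Segment in software only if the skb is a GSO skb and either the
	 * device cannot segment this gso_type itself (no matching
	 * NETIF_F_* feature) or the checksum is not left to hardware. */
	return skb_is_gso(skb) &&
	       (!skb_gso_ok(skb, dev->features) ||
		unlikely(skb->ip_summed != CHECKSUM_PARTIAL));
}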