Openvswitch原理与代码分析(5): 内核中的流表flow table操作

?

当一个数据包到达网卡的时候,首先要经过内核Openvswitch.ko,流表Flow Table在内核中有一份,通过key查找内核中的flow table,即可以得到action,然后执行action之后,直接发送这个包,只有在内核无法查找到流表项的时候,才会到用户态查找用户态的流表。仅仅查找内核中flow table的情况被称为fast path.

?

?

第一步:从数据包中提取出key

?

实现函数为int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, struct sk_buff *skb, struct sw_flow_key *key)

在这个函数中,首先提取的是物理层的信息,主要是从哪个网口进入的。

  1. key->phy.priority = skb->priority;
  2. key->phy.in_port = OVS_CB(skb)->input_vport->port_no;
  3. key->phy.skb_mark = skb->mark;
  4. ovs_ct_fill_key(skb, key);
  5. key->ovs_flow_hash = 0;
  6. key->recirc_id = 0;

?

然后调用函数static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)提取其他的key

提取MAC层

  1. /* Link layer. We are guaranteed to have at least the 14 byte Ethernet
  2.  * header in the linear data area.
  3.  */
  4. eth = eth_hdr(skb);
  5. ether_addr_copy(key->eth.src, eth->h_source);
  6. ether_addr_copy(key->eth.dst, eth->h_dest);
  7. __skb_pull(skb, 2 * ETH_ALEN);
  8. /* We are going to push all headers that we pull, so no need to
  9.  * update skb->csum here.
  10.  */
  11. key->eth.tci = 0;
  12. if (skb_vlan_tag_present(skb))
  13.     key->eth.tci = htons(vlan_get_tci(skb));
  14. else if (eth->h_proto == htons(ETH_P_8021Q))
  15.     if (unlikely(parse_vlan(skb, key)))
  16.         return -ENOMEM;
  17. key->eth.type = parse_ethertype(skb);

?

提取网络层

  1. struct iphdr *nh;
  2. __be16 offset;
  3. error = check_iphdr(skb);
  4. if (unlikely(error)) {
  5.     memset(&key->ip, 0, sizeof(key->ip));
  6.     memset(&key->ipv4, 0, sizeof(key->ipv4));
  7.     if (error == -EINVAL) {
  8.         skb->transport_header = skb->network_header;
  9.         error = 0;
  10.     }
  11.     return error;
  12. }
  13. nh = ip_hdr(skb);
  14. key->ipv4.addr.src = nh->saddr;
  15. key->ipv4.addr.dst = nh->daddr;
  16. key->ip.proto = nh->protocol;
  17. key->ip.tos = nh->tos;
  18. key->ip.ttl = nh->ttl;
  19. offset = nh->frag_off & htons(IP_OFFSET);
  20. if (offset) {
  21.     key->ip.frag = OVS_FRAG_TYPE_LATER;
  22.     return 0;
  23. }
  24. if (nh->frag_off & htons(IP_MF) ||
  25.     skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
  26.     key->ip.frag = OVS_FRAG_TYPE_FIRST;
  27. else
  28.     key->ip.frag = OVS_FRAG_TYPE_NONE;

?

提取传输层

  1. /* Transport layer. */
  2. if (key->ip.proto == IPPROTO_TCP) {
  3.     if (tcphdr_ok(skb)) {
  4.         struct tcphdr *tcp = tcp_hdr(skb);
  5.         key->tp.src = tcp->source;
  6.         key->tp.dst = tcp->dest;
  7.         key->tp.flags = TCP_FLAGS_BE16(tcp);
  8.     } else {
  9.         memset(&key->tp, 0, sizeof(key->tp));
  10.     }
  11. } else if (key->ip.proto == IPPROTO_UDP) {
  12.     if (udphdr_ok(skb)) {
  13.         struct udphdr *udp = udp_hdr(skb);
  14.         key->tp.src = udp->source;
  15.         key->tp.dst = udp->dest;
  16.     } else {
  17.         memset(&key->tp, 0, sizeof(key->tp));
  18.     }
  19. } else if (key->ip.proto == IPPROTO_SCTP) {
  20.     if (sctphdr_ok(skb)) {
  21.         struct sctphdr *sctp = sctp_hdr(skb);
  22.         key->tp.src = sctp->source;
  23.         key->tp.dst = sctp->dest;
  24.     } else {
  25.         memset(&key->tp, 0, sizeof(key->tp));
  26.     }
  27. } else if (key->ip.proto == IPPROTO_ICMP) {
  28.     if (icmphdr_ok(skb)) {
  29.         struct icmphdr *icmp = icmp_hdr(skb);
  30.         /* The ICMP type and code fields use the 16-bit
  31.          * transport port fields, so we need to store
  32.          * them in 16-bit network byte order.
  33.          */
  34.         key->tp.src = htons(icmp->type);
  35.         key->tp.dst = htons(icmp->code);
  36.     } else {
  37.         memset(&key->tp, 0, sizeof(key->tp));
  38.     }
  39. }

?

第二步:根据key查找flow table

?

调用struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, const struct sw_flow_key *key, u32 skb_hash, u32 *n_mask_hit)进行查找。

?

?

在内核中,flow table的数据结构如上图所示。

每个虚拟交换机对应一个datapath,每个datapath有一个flow table,每个flow table分成N个桶,根据key进行哈希,不同的key分布在不同的桶里面。

每个桶的大小是一个内存页的大小,在内存页的头部记录了该页中保存了多少个元素,以及每个元素的大小。每个元素都是sw_flow,里面有key,也有action。

?

ovs_flow_tbl_lookup_stats会调用static struct sw_flow *flow_lookup(struct flow_table *tbl, struct table_instance *ti, const struct mask_array *ma, const struct sw_flow_key *key, u32 *n_mask_hit, u32 *index)

会调用masked_flow_lookup如下

  1. static struct sw_flow *masked_flow_lookup(struct table_instance *ti,
  2.                                           const struct sw_flow_key *unmasked,
  3.                                           const struct sw_flow_mask *mask,
  4.                                           u32 *n_mask_hit)
  5. {
  6.     struct sw_flow *flow;
  7.     struct hlist_head *head;
  8.     u32 hash;
  9.     struct sw_flow_key masked_key;
  10.
  11.     ovs_flow_mask_key(&masked_key, unmasked, false, mask);
  12.     hash = flow_hash(&masked_key, &mask->range);
  13.     head = find_bucket(ti, hash);
  14.     (*n_mask_hit)++;
  15.     hlist_for_each_entry_rcu(flow, head, flow_table.node[ti->node_ver]) {
  16.         if (flow->mask == mask && flow->flow_table.hash == hash &&
  17.             flow_cmp_masked_key(flow, &masked_key, &mask->range))
  18.             return flow;
  19.     }
  20.     return NULL;
  21. }

?

其中flow_hash计算哈希值,find_bucket根据哈希值查找桶,然后就是一个循环,逐个比较key是否相等,相等则返回flow。

?

第三步:执行action

?

调用int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, const struct sw_flow_actions *acts,struct sw_flow_key *key)

调用static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key, const struct nlattr *attr, int len)

在这个函数中,通过switch-case语句,对不同的action进行不同的操作。

  1. static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
  2.                               struct sw_flow_key *key,
  3.                               const struct nlattr *attr, int len)
  4. {
  5.     /* Every output action needs a separate clone of 'skb', but the common
  6.      * case is just a single output action, so that doing a clone and
  7.      * then freeing the original skbuff is wasteful. So the following code
  8.      * is slightly obscure just to avoid that.
  9.      */
  10.     int prev_port = -1;
  11.     const struct nlattr *a;
  12.     int rem;
  13.
  14.     for (a = attr, rem = len; rem > 0;
  15.          a = nla_next(a, &rem)) {
  16.         int err = 0;
  17.
  18.         if (unlikely(prev_port != -1)) {
  19.             struct sk_buff *out_skb = skb_clone(skb, GFP_ATOMIC);
  20.
  21.             if (out_skb)
  22.                 do_output(dp, out_skb, prev_port, key);
  23.
  24.             prev_port = -1;
  25.         }
  26.
  27.         switch (nla_type(a)) {
  28.         case OVS_ACTION_ATTR_OUTPUT:
  29.             prev_port = nla_get_u32(a);
  30.             break;
  31.
  32.         case OVS_ACTION_ATTR_USERSPACE:
  33.             output_userspace(dp, skb, key, a, attr, len);
  34.             break;
  35.
  36.         case OVS_ACTION_ATTR_HASH:
  37.             execute_hash(skb, key, a);
  38.             break;
  39.
  40.         case OVS_ACTION_ATTR_PUSH_MPLS:
  41.             err = push_mpls(skb, key, nla_data(a));
  42.             break;
  43.
  44.         case OVS_ACTION_ATTR_POP_MPLS:
  45.             err = pop_mpls(skb, key, nla_get_be16(a));
  46.             break;
  47.
  48.         case OVS_ACTION_ATTR_PUSH_VLAN:
  49.             err = push_vlan(skb, key, nla_data(a));
  50.             break;
  51.
  52.         case OVS_ACTION_ATTR_POP_VLAN:
  53.             err = pop_vlan(skb, key);
  54.             break;
  55.
  56.         case OVS_ACTION_ATTR_RECIRC:
  57.             err = execute_recirc(dp, skb, key, a, rem);
  58.             if (nla_is_last(a, rem)) {
  59.                 /* If this is the last action, the skb has
  60.                  * been consumed or freed.
  61.                  * Return immediately.
  62.                  */
  63.                 return err;
  64.             }
  65.             break;
  66.
  67.         case OVS_ACTION_ATTR_SET:
  68.             err = execute_set_action(skb, key, nla_data(a));
  69.             break;
  70.
  71.         case OVS_ACTION_ATTR_SET_MASKED:
  72.         case OVS_ACTION_ATTR_SET_TO_MASKED:
  73.             err = execute_masked_set_action(skb, key, nla_data(a));
  74.             break;
  75.
  76.         case OVS_ACTION_ATTR_SAMPLE:
  77.             err = sample(dp, skb, key, a, attr, len);
  78.             break;
  79.
  80.         case OVS_ACTION_ATTR_CT:
  81.             if (!is_flow_key_valid(key)) {
  82.                 err = ovs_flow_key_update(skb, key);
  83.                 if (err)
  84.                     return err;
  85.             }
  86.
  87.             err = ovs_ct_execute(ovs_dp_get_net(dp), skb, key,
  88.                                  nla_data(a));
  89.
  90.             /* Hide stolen IP fragments from user space. */
  91.             if (err)
  92.                 return err == -EINPROGRESS ? 0 : err;
  93.             break;
  94.         }
  95.
  96.         if (unlikely(err)) {
  97.             kfree_skb(skb);
  98.             return err;
  99.         }
  100.     }
  101.
  102.     if (prev_port != -1)
  103.         do_output(dp, skb, prev_port, key);
  104.     else
  105.         consume_skb(skb);
  106.
  107.     return 0;
  108. }

?

如果可以直接输出,则调用static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port, struct sw_flow_key *key),它会调用void ovs_vport_send(struct vport *vport, struct sk_buff *skb)进行发送。

时间: 2024-10-25 02:39:39

Openvswitch原理与代码分析(5): 内核中的流表flow table操作的相关文章

Openvswitch原理与代码分析(6):用户态流表flow table的操作

当内核无法查找到流表项的时候,则会通过upcall来调用用户态ovs-vswitchd中的flow table. 会调用ofproto-dpif-upcall.c中的udpif_upcall_handler函数. static void * udpif_upcall_handler(void *arg) { struct handler *handler = arg; struct udpif *udpif = handler->udpif; while (!latc

Openvswitch原理与代码分析(7): 添加一条流表flow

添加一个flow,调用的命令为 ovs-ofctl add-flow hello "hard_timeout=0 idle_timeout=0 priority=1 table=21 pkt_mark=0x55 tun_id=0x55 actions=mod_nw_dst:192.168.56.101,output:2" 这里调用的是ovs/utilities/ovs-ofctl.c中的命令行工具 这个命令行工具支持的所有的命令及处理函数定义如下: static const stru

Openvswitch原理与代码分析(4):网络包的处理过程

? 在上一节提到,Openvswitch的内核模块openvswitch.ko会在网卡上注册一个函数netdev_frame_hook,每当有网络包到达网卡的时候,这个函数就会被调用. ? static struct sk_buff *netdev_frame_hook(struct sk_buff *skb) { ???if (unlikely(skb->pkt_type == PACKET_LOOPBACK)) ??????return skb; ? ???port_receive(skb)

Openvswitch原理与代码分析(1):总体架构

一.Openvswitch总体架构 Openvswitch的架构网上有如下的图表示: 每个模块都有不同的功能 ovs-vswitchd 为主要模块,实现交换机的守护进程daemon 在Openvswitch所在的服务器进行ps aux可以看到以下的进程 root 1008 0.1 0.8 242948 31712 ? S<Ll Aug06 32:17 ovs-vswitchd unix:/var/run/openvswitch/db.sock -vconsole:emer -vsyslog:err

Openvswitch原理与代码分析(3): openvswitch内核模块的加载

在datapath/datapath.c中会调用module_init(dp_init);来初始化内核模块. static int __init dp_init(void){   int err;    BUILD_BUG_ON(sizeof(struct ovs_skb_cb) > FIELD_SIZEOF(struct sk_buff, cb));    pr_info("Open vSwitch switching datapath %s\n", VERSION);    

Openvswitch原理与代码分析(2): ovs-vswitchd的启动

ovs-vswitchd.c的main函数最终会进入一个while循环,在这个无限循环中,里面最重要的两个函数是bridge_run()和netdev_run(). ? ? Openvswitch主要管理两种类型的设备,一个是创建的虚拟网桥,一个是连接到虚拟网桥上的设备. ? 其中bridge_run就是初始化数据库中已经创建的虚拟网桥. ? 一.虚拟网桥的初始化bridge_run ? bridge_run会调用bridge_run__,bridge_run__中最重要的是对于所有的网桥,都调

Openvswitch原理与代码分析(8): 修改Openvswitch代码添加自定义action

有时候我们需要自定义一些自己的action,根据包头里面的信息,做一些自己的操作. ? 例如添加一个action名为handle_example ? 第一.修改ofp-actions.c文件 ? 首先在ofp-actions.c里面添加Openflow各个版本的这个action static const struct ofpact_map * get_ofpact_map(enum ofp_version version) { ????/* OpenFlow 1.0 actions. */ ??

免费的Lucene 原理与代码分析完整版下载

Lucene是一个基于Java的高效的全文检索库.那么什么是全文检索,为什么需要全文检索?目前人们生活中出现的数据总的来说分为两类:结构化数据和非结构化数据.很容易理解,结构化数据是有固定格式和结构的或者有限长度的数据,比如数据库,元数据等.非结构化数据则是不定长或者没有固定格式的数据,如图片,邮件,文档等.还有一种较少的分类为半结构化数据,如XML,HTML等,在一定程度上我们可以将其按照结构化数据来处理,也可以抽取纯文本按照非结构化数据来处理.非结构化数据又称为全文数据,对其搜索主要有两种

Lua中的weak表——weak table

弱表(weak table)是一个很有意思的东西,像C++/Java等语言是没有的.弱表的定义是:A weak table is a table whose elements are weak references,元素为弱引用的表就叫弱表.有弱引用那么也就有强引用,有引用那么也就有非引用.我们先要厘清这些基本概念:变量.值.类型.对象. (1)变量与值:Lua是一个dynamically typed language,也就是说在Lua中,变量没有类型,它可以是任何东西,而值有类型,所以Lua中没