From 55574bb0d21541c13fe67545a74448b36063e461 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Tue, 3 Aug 2010 14:40:29 -0700 Subject: [PATCH] datapath: Detect and suppress flows that are implicated in loops. In-kernel loops need to be suppressed; otherwise, they cause high CPU consumption, even to the point that the machine becomes unusable. Ideally these flows should never be added to the Open vSwitch flow table, but it is fairly easy for a buggy controller to create them given the menagerie of tunnels, patches, etc. that OVS makes available. Commit ecbb6953b "datapath: Add loop checking" did the initial work toward suppressing loops, by dropping packets that recursed more than 5 times. This at least prevented the kernel stack from overflowing and thereby OOPSing the machine. But even with this commit, it is still possible to waste a lot of CPU time due to loops. The problem is not limited to 5 recursive calls per packet: any packet can be sent to multiple destinations, which in turn can themselves be sent to multiple destinations, and so on. We have actually seen in practice a case where each packet was, apparently, sent to at least 2 destinations per hop, so that each packet actually consumed CPU time for 2**5 == 32 packets, possibly more. This commit takes loop suppression a step further, by clearing the actions of flows that are implicated in loops. Thus, after the first packet in such a flow, later packets for either the "root" flow or for flows that it ends up looping through are simply discarded, saving a huge amount of CPU time. This version of the commit just clears the actions from the flows that are a part of the loop. Probably, there should be some additional action to tell ovs-vswitchd that a loop has been detected, so that it can in turn inform the controller one way or another. 
My test case was this: ovs-controller -H --max-idle=permanent punix:/tmp/controller ovs-vsctl -- \ set-controller br0 unix:/tmp/controller -- \ add-port br0 patch00 -- \ add-port br0 patch01 -- \ add-port br0 patch10 -- \ add-port br0 patch11 -- \ add-port br0 patch20 -- \ add-port br0 patch21 -- \ add-port br0 patch30 -- \ add-port br0 patch31 -- \ set Interface patch00 type=patch options:peer=patch01 -- \ set Interface patch01 type=patch options:peer=patch00 -- \ set Interface patch10 type=patch options:peer=patch11 -- \ set Interface patch11 type=patch options:peer=patch10 -- \ set Interface patch20 type=patch options:peer=patch21 -- \ set Interface patch21 type=patch options:peer=patch20 -- \ set Interface patch30 type=patch options:peer=patch31 -- \ set Interface patch31 type=patch options:peer=patch30 followed by sending a single "ping" packet from an attached Ethernet port into the bridge. After this, without this commit the vswitch userspace and kernel consume 50-75% of the machine's CPU (in my KVM test setup on a single physical host); with this commit, some CPU is consumed initially but it converges on 0% quickly. A more challenging test sends a series of packets in multiple flows; I used "hping3" with its default options. Without this commit, the vswitch consumes 100% of the machine's CPU, most of which is in the kernel. With this commit, the vswitch consumes "only" 33-50% CPU, most of which is in userspace, so the machine is more responsive. A refinement on this commit would be to pass the loop counter down to userspace as part of the odp_msg struct and then back up as part of the ODP_EXECUTE command arguments. This would, presumably, reduce the CPU requirements, since it would allow loop detection to happen earlier, during initial setup of flows, instead of just on the second and subsequent packets of flows. 
--- datapath/datapath.c | 71 +++++++++++++++++++++++++++++++++++++++------ datapath/vport.c | 31 ++------------------ 2 files changed, 64 insertions(+), 38 deletions(-) diff --git a/datapath/datapath.c b/datapath/datapath.c index c3402849..e4681987 100644 --- a/datapath/datapath.c +++ b/datapath/datapath.c @@ -69,6 +69,23 @@ EXPORT_SYMBOL(dp_ioctl_hook); static struct datapath *dps[ODP_MAX]; static DEFINE_MUTEX(dp_mutex); +/* We limit the number of times that we pass into dp_process_received_packet() + * to avoid blowing out the stack in the event that we have a loop. */ +struct loop_counter { + int count; /* Count. */ + bool looping; /* Loop detected? */ +}; + +#define DP_MAX_LOOPS 5 + +/* We use a separate counter for each CPU for both interrupt and non-interrupt + * context in order to keep the limit deterministic for a given packet. */ +struct percpu_loop_counters { + struct loop_counter counters[2]; +}; + +static DEFINE_PER_CPU(struct percpu_loop_counters, dp_loop_counters); + static int new_dp_port(struct datapath *, struct odp_port *, int port_no); /* Must be called with rcu_read_lock or dp_mutex. */ @@ -511,6 +528,14 @@ out: return err; } +static void suppress_loop(struct datapath *dp, struct sw_flow_actions *actions) +{ + if (net_ratelimit()) + printk(KERN_WARNING "%s: flow looped %d times, dropping\n", + dp_name(dp), DP_MAX_LOOPS); + actions->n_actions = 0; +} + /* Must be called with rcu_read_lock. */ void dp_process_received_packet(struct dp_port *p, struct sk_buff *skb) { @@ -519,9 +544,13 @@ void dp_process_received_packet(struct dp_port *p, struct sk_buff *skb) int stats_counter_off; struct odp_flow_key key; struct tbl_node *flow_node; + struct sw_flow *flow; + struct sw_flow_actions *acts; + struct loop_counter *loop; OVS_CB(skb)->dp_port = p; + /* Extract flow from 'skb' into 'key'. */ if (flow_extract(skb, p ? 
p->port_no : ODPP_NONE, &key)) { if (dp->drop_frags) { kfree_skb(skb); @@ -530,20 +559,44 @@ void dp_process_received_packet(struct dp_port *p, struct sk_buff *skb) } } + /* Look up flow. */ flow_node = tbl_lookup(rcu_dereference(dp->table), &key, flow_hash(&key), flow_cmp); - if (flow_node) { - struct sw_flow *flow = flow_cast(flow_node); - struct sw_flow_actions *acts = rcu_dereference(flow->sf_acts); - flow_used(flow, skb); - execute_actions(dp, skb, &key, acts->actions, acts->n_actions, - GFP_ATOMIC); - stats_counter_off = offsetof(struct dp_stats_percpu, n_hit); - } else { - stats_counter_off = offsetof(struct dp_stats_percpu, n_missed); + if (unlikely(!flow_node)) { dp_output_control(dp, skb, _ODPL_MISS_NR, OVS_CB(skb)->tun_id); + stats_counter_off = offsetof(struct dp_stats_percpu, n_missed); + goto out; + } + + flow = flow_cast(flow_node); + flow_used(flow, skb); + + acts = rcu_dereference(flow->sf_acts); + + /* Check whether we've looped too much. */ + loop = &get_cpu_var(dp_loop_counters).counters[!!in_interrupt()]; + if (unlikely(++loop->count > DP_MAX_LOOPS)) + loop->looping = true; + if (unlikely(loop->looping)) { + suppress_loop(dp, acts); + goto out_loop; } + /* Execute actions. */ + execute_actions(dp, skb, &key, acts->actions, acts->n_actions, GFP_ATOMIC); + stats_counter_off = offsetof(struct dp_stats_percpu, n_hit); + + /* Check whether sub-actions looped too much. */ + if (unlikely(loop->looping)) + suppress_loop(dp, acts); + +out_loop: + /* Decrement loop counter. */ + if (!--loop->count) + loop->looping = false; + put_cpu_var(dp_loop_counters); + out: + /* Update datapath statistics. 
*/ local_bh_disable(); stats = per_cpu_ptr(dp->stats_percpu, smp_processor_id()); (*(u64 *)((u8 *)stats + stats_counter_off))++; diff --git a/datapath/vport.c b/datapath/vport.c index 24385901..dd1c31f3 100644 --- a/datapath/vport.c +++ b/datapath/vport.c @@ -35,17 +35,6 @@ static int n_vport_types; static struct hlist_head *dev_table; #define VPORT_HASH_BUCKETS 1024 -/* We limit the number of times that we pass through vport_send() to - * avoid blowing out the stack in the event that we have a loop. There is - * a separate counter for each CPU for both interrupt and non-interrupt - * context in order to keep the limit deterministic for a given packet. */ -struct percpu_loop_counter { - int count[2]; -}; - -static DEFINE_PER_CPU(struct percpu_loop_counter, vport_loop_counter); -#define VPORT_MAX_LOOPS 5 - /* Both RTNL lock and vport_mutex need to be held when updating dev_table. * * If you use vport_locate and then perform some operations, you need to hold @@ -1238,20 +1227,9 @@ static inline unsigned packet_length(const struct sk_buff *skb) */ int vport_send(struct vport *vport, struct sk_buff *skb) { - int *loop_count; int mtu; int sent; - loop_count = &get_cpu_var(vport_loop_counter).count[!!in_interrupt()]; - (*loop_count)++; - - if (unlikely(*loop_count > VPORT_MAX_LOOPS)) { - if (net_ratelimit()) - printk(KERN_WARNING "%s: dropping packet that has looped more than %d times\n", - dp_name(vport_get_dp_port(vport)->dp), VPORT_MAX_LOOPS); - goto error; - } - mtu = vport_get_mtu(vport); if (unlikely(packet_length(skb) > mtu && !skb_is_gso(skb))) { if (net_ratelimit()) @@ -1274,17 +1252,12 @@ int vport_send(struct vport *vport, struct sk_buff *skb) local_bh_enable(); } - goto out; + return sent; error: - sent = 0; kfree_skb(skb); vport_record_error(vport, VPORT_E_TX_DROPPED); -out: - (*loop_count)--; - put_cpu_var(vport_loop_counter); - - return sent; + return 0; } /** -- 2.30.2