From 9d6ac44e2b584b34bc7e14f2daa7bb0bdfab16ab Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Wed, 18 Apr 2012 17:11:10 -0700 Subject: [PATCH] ofproto-dpif: Implement "flow setup governor" to speed up many short flows. The cost of creating and initializing facets and subfacets and installing, tracking, and uninstalling kernel flows is significant. When most flows have only one or a few packets, this overhead is higher than the cost of handling each packet individually. This commit introduces heuristics that cheaply count (approximately) the number of packets seen in a flow and skips most of this expensive bookkeeping until the packet count exceeds a threshold (currently 5 packets). Signed-off-by: Ben Pfaff --- ofproto/automake.mk | 4 +- ofproto/ofproto-dpif-governor.c | 188 ++++++++++++++++++++ ofproto/ofproto-dpif-governor.h | 53 ++++++ ofproto/ofproto-dpif.c | 292 +++++++++++++++++++++++--------- 4 files changed, 459 insertions(+), 78 deletions(-) create mode 100644 ofproto/ofproto-dpif-governor.c create mode 100644 ofproto/ofproto-dpif-governor.h diff --git a/ofproto/automake.mk b/ofproto/automake.mk index ae35b7ff..6d98de73 100644 --- a/ofproto/automake.mk +++ b/ofproto/automake.mk @@ -1,4 +1,4 @@ -# Copyright (C) 2009, 2010, 2011 Nicira Networks, Inc. +# Copyright (C) 2009, 2010, 2011, 2012 Nicira Networks, Inc. # # Copying and distribution of this file, with or without modification, # are permitted in any medium without royalty provided the copyright @@ -21,6 +21,8 @@ ofproto_libofproto_a_SOURCES = \ ofproto/ofproto.c \ ofproto/ofproto.h \ ofproto/ofproto-dpif.c \ + ofproto/ofproto-dpif-governor.c \ + ofproto/ofproto-dpif-governor.h \ ofproto/ofproto-dpif-sflow.c \ ofproto/ofproto-dpif-sflow.h \ ofproto/ofproto-provider.h \ diff --git a/ofproto/ofproto-dpif-governor.c b/ofproto/ofproto-dpif-governor.c new file mode 100644 index 00000000..3b15c38e --- /dev/null +++ b/ofproto/ofproto-dpif-governor.c @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2012 Nicira Networks. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "ofproto-dpif-governor.h" + +#include +#include + +#include "coverage.h" +#include "poll-loop.h" +#include "random.h" +#include "timeval.h" +#include "util.h" +#include "valgrind.h" +#include "vlog.h" + +VLOG_DEFINE_THIS_MODULE(ofproto_dpif_governor); + +/* Minimum number of observed packets before setting up a flow. + * + * This value seems OK empirically. */ +#define FLOW_SETUP_THRESHOLD 5 +BUILD_ASSERT_DECL(FLOW_SETUP_THRESHOLD > 1); +BUILD_ASSERT_DECL(FLOW_SETUP_THRESHOLD < 16); + +/* Minimum and maximum size of a governor, in bytes. */ +enum { MIN_SIZE = 16 * 1024 }; +enum { MAX_SIZE = 256 * 1024 }; +BUILD_ASSERT_DECL(IS_POW2(MIN_SIZE)); +BUILD_ASSERT_DECL(IS_POW2(MAX_SIZE)); + +/* Minimum and maximum time to process the number of packets that make up a + * given generation. If a generation completes faster than the minimum time, + * we double the table size (but no more than MAX_SIZE). If a generation take + * more than the maximum time to complete, we halve the table size (but no + * smaller than MIN_SIZE). */ +enum { MIN_ELAPSED = 1000 }; /* In milliseconds. */ +enum { MAX_ELAPSED = 5000 }; /* In milliseconds. */ + +static void governor_new_generation(struct governor *, unsigned int size); + +/* Creates and returns a new governor named 'name' (which is used only for log + * messages). */ +struct governor * +governor_create(const char *name) +{ + struct governor *g = xzalloc(sizeof *g); + g->name = xstrdup(name); + governor_new_generation(g, MIN_SIZE); + return g; +} + +/* Destroys 'g'. */ +void +governor_destroy(struct governor *g) +{ + if (g) { + VLOG_INFO("%s: disengaging", g->name); + free(g->table); + free(g); + } +} + +/* Performs periodic maintenance work on 'g'. */ +void +governor_run(struct governor *g) +{ + if (time_msec() - g->start > MAX_ELAPSED) { + if (g->size > MIN_SIZE) { + governor_new_generation(g, g->size / 2); + } else { + /* Don't start a new generation (we'd never go idle). */ + } + } +} + +/* Arranges for the poll loop to wake up when 'g' needs to do some work. */ +void +governor_wait(struct governor *g) +{ + poll_timer_wait_until(g->start + MAX_ELAPSED); +} + +/* Returns true if 'g' has been doing only a minimal amount of work and thus + * the client should consider getting rid of it entirely. */ +bool +governor_is_idle(struct governor *g) +{ + return g->size == MIN_SIZE && time_msec() - g->start > MAX_ELAPSED; +} + +/* Tests whether a flow whose hash is 'hash' and for which 'n' packets have + * just arrived should be set up in the datapath or just processed on a + * packet-by-packet basis. Returns true to set up a datapath flow, false to + * process the packets individually. + * + * One would expect 'n' to ordinarily be 1, if batching leads multiple packets + * to be processed at a time then it could be greater. */ +bool +governor_should_install_flow(struct governor *g, uint32_t hash, int n) +{ + bool install_flow; + uint8_t *e; + int count; + + assert(n > 0); + + /* Count these packets and begin a new generation if necessary. */ + g->n_packets += n; + if (g->n_packets >= g->size / 4) { + unsigned int new_size; + long long elapsed; + + elapsed = time_msec() - g->start; + new_size = (elapsed < MIN_ELAPSED && g->size < MAX_SIZE ? g->size * 2 + : elapsed > MAX_ELAPSED && g->size > MIN_SIZE ? g->size / 2 + : g->size); + governor_new_generation(g, new_size); + } + + /* Do hash table processing. + * + * Even-numbered hash values use high-order nibbles. + * Odd-numbered hash values use low-order nibbles. */ + e = &g->table[(hash >> 1) & (g->size - 1)]; + count = n + (hash & 1 ? *e >> 4 : *e & 0x0f); + if (count >= FLOW_SETUP_THRESHOLD) { + install_flow = true; + count = 0; + } else { + install_flow = false; + } + *e = hash & 1 ? (count << 4) | (*e & 0x0f) : (*e & 0xf0) | count; + + return install_flow; +} + +/* Starts a new generation in 'g' with a table size of 'size' bytes. 'size' + * must be a power of two between MIN_SIZE and MAX_SIZE, inclusive. */ +static void +governor_new_generation(struct governor *g, unsigned int size) +{ + assert(size >= MIN_SIZE && size <= MAX_SIZE); + assert(is_pow2(size)); + + /* Allocate new table, if necessary. */ + if (g->size != size) { + if (!g->size) { + VLOG_INFO("%s: engaging governor with %u kB hash table", + g->name, g->size / 1024); + } else { + VLOG_INFO("%s: processed %u packets in %.2f s, " + "%s hash table to %u kB", + g->name, g->n_packets, + (time_msec() - g->start) / 1000.0, + size > g->size ? "enlarging" : "shrinking", + size / 1024); + } + + free(g->table); + g->table = xmalloc(size * sizeof *g->table); + g->size = size; + } else { + VLOG_DBG("%s: processed %u packets in %.2f s with %u kB hash table", + g->name, g->n_packets, (time_msec() - g->start) / 1000.0, + size / 1024); + } + + /* Clear data for next generation. */ + memset(g->table, 0, size * sizeof *g->table); + g->start = time_msec(); + g->n_packets = 0; +} diff --git a/ofproto/ofproto-dpif-governor.h b/ofproto/ofproto-dpif-governor.h new file mode 100644 index 00000000..ad022d5e --- /dev/null +++ b/ofproto/ofproto-dpif-governor.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2012 Nicira Networks. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef OFPROTO_DPIF_GOVERNOR_H +#define OFPROTO_DPIF_GOVERNOR_H 1 + +/* Flow setup rate limiter. + * + * A governor in an engine limits a vehicle's speed. This governor limits the + * rate at which flows are set up in the datapath. The client provides as + * input the hashes of observed packets. The governor keeps track of hashes + * seen multiple times. When a given hash is seen often enough, the governor + * indicates to its client that it should set up a facet and a subfacet and a + * datapath flow for that flow. + * + * The same tracking could be done in terms of facets and subfacets directly, + * but the governor code uses much less time and space to do the same job. */ + +#include +#include + +struct governor { + char *name; /* Name, for log messages. */ + uint8_t *table; /* Table of counters, two per byte. */ + unsigned int size; /* Table size in bytes. */ + long long int start; /* Time when the table was last cleared. */ + unsigned int n_packets; /* Number of packets processed. */ +}; + +struct governor *governor_create(const char *name); +void governor_destroy(struct governor *); + +void governor_run(struct governor *); +void governor_wait(struct governor *); + +bool governor_is_idle(struct governor *); + +bool governor_should_install_flow(struct governor *, uint32_t hash, int n); + +#endif /* ofproto/ofproto-dpif-governor.h */ diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c index aceb4bda..2d7b3c60 100644 --- a/ofproto/ofproto-dpif.c +++ b/ofproto/ofproto-dpif.c @@ -43,6 +43,7 @@ #include "ofp-util.h" #include "ofpbuf.h" #include "ofp-print.h" +#include "ofproto-dpif-governor.h" #include "ofproto-dpif-sflow.h" #include "poll-loop.h" #include "timer.h" @@ -61,6 +62,7 @@ COVERAGE_DEFINE(facet_changed_rule); COVERAGE_DEFINE(facet_invalidated); COVERAGE_DEFINE(facet_revalidate); COVERAGE_DEFINE(facet_unexpected); +COVERAGE_DEFINE(facet_suppress); /* Maximum depth of flow table recursion (due to resubmit actions) in a * flow translation. */ @@ -545,6 +547,7 @@ struct ofproto_dpif { /* Facets. */ struct hmap facets; struct hmap subfacets; + struct governor *governor; /* Revalidation. */ struct table_dpif tables[N_TABLES]; @@ -700,6 +703,7 @@ construct(struct ofproto *ofproto_) hmap_init(&ofproto->facets); hmap_init(&ofproto->subfacets); + ofproto->governor = NULL; for (i = 0; i < N_TABLES; i++) { struct table_dpif *table = &ofproto->tables[i]; @@ -772,6 +776,7 @@ destruct(struct ofproto *ofproto_) hmap_destroy(&ofproto->facets); hmap_destroy(&ofproto->subfacets); + governor_destroy(ofproto->governor); hmap_destroy(&ofproto->vlandev_map); hmap_destroy(&ofproto->realdev_vid_map); @@ -879,6 +884,24 @@ run(struct ofproto *ofproto_) } } + if (ofproto->governor) { + size_t n_subfacets; + + governor_run(ofproto->governor); + + /* If the governor has shrunk to its minimum size and the number of + * subfacets has dwindled, then drop the governor entirely. + * + * For hysteresis, the number of subfacets to drop the governor is + * smaller than the number needed to trigger its creation. */ + n_subfacets = hmap_count(&ofproto->subfacets); + if (n_subfacets * 4 < ofproto->up.flow_eviction_threshold + && governor_is_idle(ofproto->governor)) { + governor_destroy(ofproto->governor); + ofproto->governor = NULL; + } + } + return 0; } @@ -919,6 +942,9 @@ wait(struct ofproto *ofproto_) } else { timer_wait(&ofproto->next_expiration); } + if (ofproto->governor) { + governor_wait(ofproto->governor); + } } static void @@ -2566,48 +2592,136 @@ flow_miss_find(struct hmap *todo, const struct flow *flow, uint32_t hash) return NULL; } +/* Partially Initializes 'op' as an "execute" operation for 'miss' and + * 'packet'. The caller must initialize op->actions and op->actions_len. If + * 'miss' is associated with a subfacet the caller must also initialize the + * returned op->subfacet, and if anything needs to be freed after processing + * the op, the caller must initialize op->garbage also. */ static void -handle_flow_miss(struct ofproto_dpif *ofproto, struct flow_miss *miss, - struct flow_miss_op *ops, size_t *n_ops) +init_flow_miss_execute_op(struct flow_miss *miss, struct ofpbuf *packet, + struct flow_miss_op *op) +{ + if (miss->flow.vlan_tci != miss->initial_tci) { + /* This packet was received on a VLAN splinter port. We + * added a VLAN to the packet to make the packet resemble + * the flow, but the actions were composed assuming that + * the packet contained no VLAN. So, we must remove the + * VLAN header from the packet before trying to execute the + * actions. */ + eth_pop_vlan(packet); + } + + op->subfacet = NULL; + op->garbage = NULL; + op->dpif_op.type = DPIF_OP_EXECUTE; + op->dpif_op.u.execute.key = miss->key; + op->dpif_op.u.execute.key_len = miss->key_len; + op->dpif_op.u.execute.packet = packet; +} + +/* Helper for handle_flow_miss_without_facet() and + * handle_flow_miss_with_facet(). */ +static void +handle_flow_miss_common(struct rule_dpif *rule, + struct ofpbuf *packet, const struct flow *flow) { - const struct flow *flow = &miss->flow; - struct subfacet *subfacet; + struct ofproto_dpif *ofproto = ofproto_dpif_cast(rule->up.ofproto); + + ofproto->n_matches++; + + if (rule->up.cr.priority == FAIL_OPEN_PRIORITY) { + /* + * Extra-special case for fail-open mode. + * + * We are in fail-open mode and the packet matched the fail-open + * rule, but we are connected to a controller too. We should send + * the packet up to the controller in the hope that it will try to + * set up a flow and thereby allow us to exit fail-open. + * + * See the top-level comment in fail-open.c for more information. + */ + send_packet_in_miss(ofproto, packet, flow); + } +} + +/* Figures out whether a flow that missed in 'ofproto', whose details are in + * 'miss', is likely to be worth tracking in detail in userspace and (usually) + * installing a datapath flow. The answer is usually "yes" (a return value of + * true). However, for short flows the cost of bookkeeping is much higher than + * the benefits, so when the datapath holds a large number of flows we impose + * some heuristics to decide which flows are likely to be worth tracking. */ +static bool +flow_miss_should_make_facet(struct ofproto_dpif *ofproto, + struct flow_miss *miss, uint32_t hash) +{ + if (!ofproto->governor) { + size_t n_subfacets; + + n_subfacets = hmap_count(&ofproto->subfacets); + if (n_subfacets * 2 <= ofproto->up.flow_eviction_threshold) { + return true; + } + + ofproto->governor = governor_create(ofproto->up.name); + } + + return governor_should_install_flow(ofproto->governor, hash, + list_size(&miss->packets)); +} + +/* Handles 'miss', which matches 'rule', without creating a facet or subfacet + * or creating any datapath flow. May add an "execute" operation to 'ops' and + * increment '*n_ops'. */ +static void +handle_flow_miss_without_facet(struct flow_miss *miss, + struct rule_dpif *rule, + struct flow_miss_op *ops, size_t *n_ops) +{ + struct ofproto_dpif *ofproto = ofproto_dpif_cast(rule->up.ofproto); + struct action_xlate_ctx ctx; struct ofpbuf *packet; - struct facet *facet; - uint32_t hash; - /* The caller must ensure that miss->hmap_node.hash contains - * flow_hash(miss->flow, 0). */ - hash = miss->hmap_node.hash; + LIST_FOR_EACH (packet, list_node, &miss->packets) { + struct flow_miss_op *op = &ops[*n_ops]; + struct dpif_flow_stats stats; + struct ofpbuf odp_actions; - facet = facet_lookup_valid(ofproto, flow, hash); - if (!facet) { - struct rule_dpif *rule; + COVERAGE_INC(facet_suppress); - rule = rule_dpif_lookup(ofproto, flow, 0); - if (!rule) { - /* Don't send a packet-in if OFPUTIL_PC_NO_PACKET_IN asserted. */ - struct ofport_dpif *port = get_ofp_port(ofproto, flow->in_port); - if (port) { - if (port->up.pp.config & OFPUTIL_PC_NO_PACKET_IN) { - COVERAGE_INC(ofproto_dpif_no_packet_in); - /* XXX install 'drop' flow entry */ - return; - } - } else { - VLOG_WARN_RL(&rl, "packet-in on unknown port %"PRIu16, - flow->in_port); - } + ofpbuf_use_stub(&odp_actions, op->stub, sizeof op->stub); - LIST_FOR_EACH (packet, list_node, &miss->packets) { - send_packet_in_miss(ofproto, packet, flow); - } + dpif_flow_stats_extract(&miss->flow, packet, &stats); + rule_credit_stats(rule, &stats); - return; - } + action_xlate_ctx_init(&ctx, ofproto, &miss->flow, miss->initial_tci, + rule, 0, packet); + ctx.resubmit_stats = &stats; + xlate_actions(&ctx, rule->up.actions, rule->up.n_actions, + &odp_actions); + + if (odp_actions.size) { + struct dpif_execute *execute = &op->dpif_op.u.execute; + + init_flow_miss_execute_op(miss, packet, op); + execute->actions = odp_actions.data; + execute->actions_len = odp_actions.size; + op->garbage = ofpbuf_get_uninit_pointer(&odp_actions); - facet = facet_create(rule, flow, hash); + (*n_ops)++; + } else { + ofpbuf_uninit(&odp_actions); + } } +} + +/* Handles 'miss', which matches 'facet'. May add any required datapath + * operations to 'ops', incrementing '*n_ops' for each new op. */ +static void +handle_flow_miss_with_facet(struct flow_miss *miss, struct facet *facet, + struct flow_miss_op *ops, size_t *n_ops) +{ + struct subfacet *subfacet; + struct ofpbuf *packet; subfacet = subfacet_create(facet, miss->key_fitness, miss->key, miss->key_len, @@ -2615,26 +2729,10 @@ handle_flow_miss(struct ofproto_dpif *ofproto, struct flow_miss *miss, LIST_FOR_EACH (packet, list_node, &miss->packets) { struct flow_miss_op *op = &ops[*n_ops]; - struct dpif_execute *execute = &op->dpif_op.u.execute; struct dpif_flow_stats stats; - struct ofpbuf odp_actions; - ofproto->n_matches++; - - if (facet->rule->up.cr.priority == FAIL_OPEN_PRIORITY) { - /* - * Extra-special case for fail-open mode. - * - * We are in fail-open mode and the packet matched the fail-open - * rule, but we are connected to a controller too. We should send - * the packet up to the controller in the hope that it will try to - * set up a flow and thereby allow us to exit fail-open. - * - * See the top-level comment in fail-open.c for more information. - */ - send_packet_in_miss(ofproto, packet, flow); - } + handle_flow_miss_common(facet->rule, packet, &miss->flow); ofpbuf_use_stub(&odp_actions, op->stub, sizeof op->stub); if (!facet->may_install || !subfacet->actions) { @@ -2644,38 +2742,25 @@ handle_flow_miss(struct ofproto_dpif *ofproto, struct flow_miss *miss, dpif_flow_stats_extract(&facet->flow, packet, &stats); subfacet_update_stats(subfacet, &stats); - if (!subfacet->actions_len) { - /* No actions to execute, so skip talking to the dpif. */ - ofpbuf_uninit(&odp_actions); - continue; - } + if (subfacet->actions_len) { + struct dpif_execute *execute = &op->dpif_op.u.execute; - if (flow->vlan_tci != subfacet->initial_tci) { - /* This packet was received on a VLAN splinter port. We added - * a VLAN to the packet to make the packet resemble the flow, - * but the actions were composed assuming that the packet - * contained no VLAN. So, we must remove the VLAN header from - * the packet before trying to execute the actions. */ - eth_pop_vlan(packet); - } + init_flow_miss_execute_op(miss, packet, op); + op->subfacet = subfacet; + if (facet->may_install) { + execute->actions = subfacet->actions; + execute->actions_len = subfacet->actions_len; + ofpbuf_uninit(&odp_actions); + } else { + execute->actions = odp_actions.data; + execute->actions_len = odp_actions.size; + op->garbage = ofpbuf_get_uninit_pointer(&odp_actions); + } - /* Set up operation. */ - op->dpif_op.type = DPIF_OP_EXECUTE; - execute->key = miss->key; - execute->key_len = miss->key_len; - if (facet->may_install) { - execute->actions = subfacet->actions; - execute->actions_len = subfacet->actions_len; - ofpbuf_uninit(&odp_actions); - op->garbage = NULL; + (*n_ops)++; } else { - execute->actions = odp_actions.data; - execute->actions_len = odp_actions.size; - op->garbage = ofpbuf_get_uninit_pointer(&odp_actions); + ofpbuf_uninit(&odp_actions); } - execute->packet = packet; - - (*n_ops)++; } if (facet->may_install && subfacet->key_fitness != ODP_FIT_TOO_LITTLE) { @@ -2694,6 +2779,59 @@ handle_flow_miss(struct ofproto_dpif *ofproto, struct flow_miss *miss, } } +/* Handles flow miss 'miss' on 'ofproto'. The flow does not match any flow in + * the OpenFlow flow table. */ +static void +handle_flow_miss_no_rule(struct ofproto_dpif *ofproto, struct flow_miss *miss) +{ + uint16_t in_port = miss->flow.in_port; + struct ofport_dpif *port = get_ofp_port(ofproto, in_port); + + if (!port) { + VLOG_WARN_RL(&rl, "packet-in on unknown port %"PRIu16, in_port); + } + + if (port && port->up.pp.config & OFPUTIL_PC_NO_PACKET_IN) { + /* XXX install 'drop' flow entry */ + COVERAGE_INC(ofproto_dpif_no_packet_in); + } else { + const struct ofpbuf *packet; + + LIST_FOR_EACH (packet, list_node, &miss->packets) { + send_packet_in_miss(ofproto, packet, &miss->flow); + } + } +} + +/* Handles flow miss 'miss' on 'ofproto'. May add any required datapath + * operations to 'ops', incrementing '*n_ops' for each new op. */ +static void +handle_flow_miss(struct ofproto_dpif *ofproto, struct flow_miss *miss, + struct flow_miss_op *ops, size_t *n_ops) +{ + struct facet *facet; + uint32_t hash; + + /* The caller must ensure that miss->hmap_node.hash contains + * flow_hash(miss->flow, 0). */ + hash = miss->hmap_node.hash; + + facet = facet_lookup_valid(ofproto, &miss->flow, hash); + if (!facet) { + struct rule_dpif *rule = rule_dpif_lookup(ofproto, &miss->flow, 0); + if (!rule) { + handle_flow_miss_no_rule(ofproto, miss); + return; + } else if (!flow_miss_should_make_facet(ofproto, miss, hash)) { + handle_flow_miss_without_facet(miss, rule, ops, n_ops); + return; + } + + facet = facet_create(rule, &miss->flow, hash); + } + handle_flow_miss_with_facet(miss, facet, ops, n_ops); +} + /* Like odp_flow_key_to_flow(), this function converts the 'key_len' bytes of * OVS_KEY_ATTR_* attributes in 'key' to a flow structure in 'flow' and returns * an ODP_FIT_* value that indicates how well 'key' fits our expectations for -- 2.30.2