From: Ben Pfaff Date: Sat, 20 Dec 2008 00:33:31 +0000 (-0800) Subject: vswitch: Implement basic bonding. X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=197fe95e514290a9e22423934af6859fb40da7f3;p=openvswitch vswitch: Implement basic bonding. Rebalancing and link failure detection are missing, but the basics are there (and work OK in simple testing). --- diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c index 94baf395..e7571402 100644 --- a/vswitchd/bridge.c +++ b/vswitchd/bridge.c @@ -48,6 +48,7 @@ #include "dirs.h" #include "dpif.h" #include "flow.h" +#include "hash.h" #include "list.h" #include "mac-learning.h" #include "netdev.h" @@ -71,6 +72,12 @@ struct iface { int dp_ifidx; /* Index within kernel datapath. */ }; +#define BOND_MASK 0xff /* Mask applied to a source-MAC hash to pick a bond_hash[] slot. */ +struct bond_entry { + int iface_idx; /* Index into the port's ifaces[] of the assigned iface, or -1 if none. */ +}; + +#define FLOOD_PORT ((struct port *) 1) /* The 'flood' output port: a sentinel pointer value that put_actions() compares against, not a real struct port. */ struct port { struct bridge *bridge; size_t port_idx; @@ -81,6 +88,10 @@ struct port { * A bridge port for bonding has at least 2 interfaces. */ struct iface **ifaces; size_t n_ifaces, allocated_ifaces; + + /* Bonding info. */ + struct bond_entry *bond_hash; /* An array of (BOND_MASK + 1) elements. 
*/ + int active_iface; /* Compared with iface->port_ifidx: multicast arriving on any other iface of the port is dropped. */ }; #define DP_MAX_PORTS 255 @@ -730,6 +741,24 @@ struct output { uint16_t dp_ifidx; }; +static struct iface * +choose_output_iface(const struct port *port, const struct flow *flow) +{ /* Returns the interface of port on which flow should be output. A port with exactly one interface always uses it. Otherwise the Ethernet source address of flow is hashed (hash_fnv) and masked with BOND_MASK to select a bond_hash slot; a slot whose iface_idx is negative or not below n_ifaces is first reassigned, then the interface recorded in the slot is returned. Asserts that the port has at least one interface. */ + assert(port->n_ifaces); + if (port->n_ifaces == 1) { + return port->ifaces[0]; + } else { + size_t h = hash_fnv(flow->dl_src, sizeof flow->dl_src, HASH_FNV_BASIS); + struct bond_entry *e = &port->bond_hash[h & BOND_MASK]; + if (e->iface_idx < 0 || e->iface_idx >= port->n_ifaces) { + /* XXX select interface properly */ + static int count = 0; /* NOTE(review): count is function-static, so it is shared by all bonded ports and never reset; as a signed int it would eventually overflow -- confirm this is acceptable for the placeholder. */ + e->iface_idx = count++ % port->n_ifaces; + } + return port->ifaces[e->iface_idx]; + } +} + static void set_output(struct output *p, const struct flow *flow, const struct port *in_port, const struct port *out_port) @@ -737,7 +766,7 @@ set_output(struct output *p, const struct flow *flow, p->vlan = (out_port->vlan ? OFP_VLAN_NONE : in_port->vlan ? in_port->vlan : ntohs(flow->dl_vlan)); - p->dp_ifidx = out_port->ifaces[0]->dp_ifidx; + p->dp_ifidx = choose_output_iface(out_port, flow)->dp_ifidx; } static void * @@ -787,10 +816,7 @@ put_actions(const struct bridge *br, const struct flow *flow, uint16_t vlan, } n_outs = 0; - if (out_port) { - /* Unicast. */ - set_output(&outs[n_outs++], flow, in_port, out_port); - } else { + if (out_port == FLOOD_PORT) { /* Flood. */ size_t i; @@ -800,6 +826,9 @@ put_actions(const struct bridge *br, const struct flow *flow, uint16_t vlan, set_output(&outs[n_outs++], flow, in_port, op); } } + } else if (out_port) { + /* Unicast. */ + set_output(&outs[n_outs++], flow, in_port, out_port); } actions_ofs = buf->size; @@ -834,12 +863,12 @@ send_packets(struct bridge *br, const struct flow *flow, uint32_t buffer_id, { struct ofpbuf *fbuf = NULL; struct ofpbuf *pbuf = NULL; + void *actions = NULL; - size_t actions_len = 0; + size_t actions_len = sizeof(struct ofp_action_header) * 4; /* Estimated. 
*/ if (setup_flow) { - fbuf = make_add_flow(flow, buffer_id, br->flow_idle_time, - sizeof(struct ofp_action_header) * 4); + fbuf = make_add_flow(flow, buffer_id, br->flow_idle_time, actions_len); put_actions(br, flow, vlan, in_port, out_port, fbuf, &actions, &actions_len); update_openflow_length(fbuf); @@ -856,9 +885,10 @@ send_packets(struct bridge *br, const struct flow *flow, uint32_t buffer_id, opo = put_openflow(sizeof *opo, OFPT_PACKET_OUT, pbuf); opo->buffer_id = htonl(buffer_id); opo->in_port = htons(in_ifidx); - opo->actions_len = htons(actions_len); put_actions(br, flow, vlan, in_port, out_port, pbuf, &actions, &actions_len); + opo = pbuf->data; /* Re-read opo from pbuf->data after put_actions(); presumably the buffer may have been reallocated -- TODO confirm. */ + opo->actions_len = htons(actions_len); if (buffer_id == UINT32_MAX) { ofpbuf_put(pbuf, pkt_data, pkt_len); } @@ -873,39 +903,52 @@ send_packets(struct bridge *br, const struct flow *flow, uint32_t buffer_id, } } +static bool +is_bcast_arp_reply(const struct flow *flow, const struct ofpbuf *pkt) +{ /* True when flow has Ethernet type ARP and a broadcast destination, pkt holds at least sizeof(struct arp_eth_header) bytes, and the ar_op field read at pkt->data equals ARP_OP_REQUEST. NOTE(review): the name says reply but the opcode tested is ARP_OP_REQUEST; the ARP header is also read at pkt->data with no offset and ar_op is compared without byte-order conversion -- confirm against the arp_eth_header layout and what pkt->data points to. */ + return (flow->dl_type == htons(ETH_TYPE_ARP) + && eth_addr_is_broadcast(flow->dl_dst) + && pkt->size >= sizeof(struct arp_eth_header) + && ((struct arp_eth_header *) pkt->data)->ar_op == ARP_OP_REQUEST); +} + static void process_packet_in(struct bridge *br, void *opi_) { struct ofp_packet_in *opi = opi_; uint16_t in_ifidx = ntohs(opi->in_port); - uint16_t out_ifidx; struct ofpbuf pkt; struct flow flow; - struct iface *ifa; - struct port *in_port, *out_port; + struct iface *in_iface; + struct port *in_port; + struct port *out_port = NULL; /* By default, drop the packet/flow. */ int vlan; + /* Validate OpenFlow message. */ if (check_ofp_message_array(&opi->header, OFPT_PACKET_IN, offsetof(struct ofp_packet_in, data), 1, &pkt.size)) { return; } + /* Extract flow data from 'opi' into 'flow'. */ + pkt.data = opi->data; + flow_extract(&pkt, in_ifidx, &flow); + /* Find the interface and port structure for the received packet. 
*/ - if (in_ifidx < 0 || in_ifidx >= ARRAY_SIZE(br->ifaces) + if (in_ifidx < 0 + || in_ifidx >= ARRAY_SIZE(br->ifaces) || !br->ifaces[in_ifidx]) { static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); VLOG_WARN_RL(&rl, "bridge %s: received packet on unknown " "interface %"PRIu16, br->name, in_ifidx); - goto drop; + queue_tx(br, make_add_flow(&flow, ntohl(opi->buffer_id), + br->flow_idle_time, 0)); + return; } - ifa = br->ifaces[in_ifidx]; - in_port = ifa->port; - - /* Extract flow data from 'opi' into 'flow'. */ - pkt.data = opi->data; - flow_extract(&pkt, in_ifidx, &flow); + in_iface = br->ifaces[in_ifidx]; + in_port = in_iface->port; /* Figure out what VLAN this packet belongs to. * @@ -927,18 +970,50 @@ process_packet_in(struct bridge *br, void *opi_) "VLAN %"PRIu16, br->name, ntohs(flow.dl_vlan), in_port->name, in_port->vlan); - goto drop; + goto done; } vlan = in_port->vlan; } + /* Drop multicast and broadcast packets on inactive bonded interfaces, to + * avoid receiving duplicates. */ + if (in_port->n_ifaces > 0 + && in_port->active_iface != in_iface->port_ifidx + && eth_addr_is_multicast(flow.dl_dst)) { + goto done; + } + /* MAC learning. */ - out_port = NULL; + out_port = FLOOD_PORT; if (br->ml) { uint16_t out_port_idx; + bool may_learn; + /* XXX flush learning table entries when port indexes change due to * reconfiguration */ - if (mac_learning_learn(br->ml, flow.dl_src, vlan, in_port->port_idx)) { + + /* If the packet arrived on a bonded port, don't learn from it unless + * we haven't learned any port at all for that address (because we + * probably sent the packet on one bonded interface and got it back on + * the other). */ + if (in_port->n_ifaces > 1) { + uint16_t src_idx = mac_learning_lookup(br->ml, flow.dl_src, vlan); + may_learn = src_idx == OFPP_FLOOD || src_idx == in_port->port_idx; + + /* Broadcast ARP replies are an exception to this rule: the host + * has moved to another switch. 
*/ + if (!may_learn && is_bcast_arp_reply(&flow, &pkt)) { + /* OpenFlow can't tell ARP requests from replies so we can't + * set up a flow. Fortunately these packets should be rare. */ + may_learn = true; + } + } else { + may_learn = true; + } + + /* Learn source MAC. */ + if (may_learn && + mac_learning_learn(br->ml, flow.dl_src, vlan, in_port->port_idx)) { /* The log messages here could actually be useful in debugging, so * keep the rate limit relatively high. */ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(30, 300); @@ -947,35 +1022,38 @@ process_packet_in(struct bridge *br, void *opi_) br->name, ETH_ADDR_ARGS(flow.dl_src), in_port->name, vlan); } + + /* Determine output port. */ out_port_idx = mac_learning_lookup(br->ml, flow.dl_dst, vlan); if (out_port_idx < br->n_ports) { out_port = br->ports[out_port_idx]; } } - /* Send it out. */ - out_ifidx = out_port ? out_port->ifaces[0]->dp_ifidx : OFPP_FLOOD; - if (in_port != out_port) { - /* Add a new flow. */ - send_packets(br, &flow, ntohl(opi->buffer_id), vlan, - in_ifidx, pkt.data, pkt.size, in_port, out_port, - br->flow_idle_time >= 0); - } else { - /* Don't send out packets on their input ports. */ - goto drop; + /* Don't send packets out their input ports. */ + if (in_port == out_port) { + out_port = NULL; } - return; -drop: - if (br->flow_idle_time >= 0) { - /* Set up a flow to drop packets. */ - queue_tx(br, make_add_flow(&flow, ntohl(opi->buffer_id), - br->flow_idle_time, 0)); - } else { - /* Just drop the packet, since we don't set up flows at all. - * XXX we should send a packet_out with no actions if buffer_id != - * UINT32_MAX, to avoid clogging the kernel buffers. */ - } + /* + * Add a new flow. + * + * We send out only a single packet, instead of setting up a flow, if: + * + * - Flows are disabled entirely; or + * + * - The packet is an ARP directed to broadcast that arrived on a bonded + * interface. 
In such a situation ARP request and replies must be + * handled differently, but OpenFlow unfortunately can't distinguish + * them. + */ +done: + send_packets(br, &flow, ntohl(opi->buffer_id), vlan, + in_ifidx, pkt.data, pkt.size, in_port, out_port, + (br->flow_idle_time >= 0 + && (in_port->n_ifaces < 2 + || flow.dl_type != htons(ETH_TYPE_ARP) + || !eth_addr_is_broadcast(flow.dl_dst)))); } static void @@ -1083,6 +1161,27 @@ port_destroy(struct port *port) free(port); } } + +static void +port_update_bonding(struct port *port) +{ /* Syncs bonding state with the interface count. Fewer than 2 interfaces: frees bond_hash and sets it to NULL. Otherwise, if bond_hash is still NULL, allocates BOND_MASK + 1 entries, marks each unassigned (-1), and sets active_iface to 0; an existing table is left as it is. NOTE(review): active_iface is not re-checked here when an interface is removed from a port that keeps 2 or more -- confirm it cannot end up out of range. */ + if (port->n_ifaces < 2) { + /* Not a bonded port. */ + free(port->bond_hash); + port->bond_hash = NULL; + } else { + if (!port->bond_hash) { + size_t i; + + port->bond_hash = xcalloc(BOND_MASK + 1, sizeof *port->bond_hash); + for (i = 0; i <= BOND_MASK; i++) { + struct bond_entry *e = &port->bond_hash[i]; + e->iface_idx = -1; + } + port->active_iface = 0; + } + } +} /* Interface functions. */ @@ -1102,6 +1201,8 @@ iface_create(struct port *port, const char *name) port->ifaces[port->n_ifaces++] = iface; VLOG_DBG("attached network device %s to port %s", iface->name, port->name); + + port_update_bonding(port); } static void @@ -1112,6 +1213,8 @@ iface_destroy(struct iface *iface) port->ifaces[iface->port_ifidx] = port->ifaces[--port->n_ifaces]; free(iface->name); free(iface); + + port_update_bonding(port); } }