X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=vswitchd%2Fbridge.c;h=ad9b46ef3b4e4439e62f87c4d4c66e2afa2e4c1e;hb=2a577bd807023009afab02d975787d67389c4a27;hp=ec8a1997e44419a8f18783251e0971bb32f54e54;hpb=ba09980aaff81d5ba31aa221179740bc04680787;p=openvswitch diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c index ec8a1997..ad9b46ef 100644 --- a/vswitchd/bridge.c +++ b/vswitchd/bridge.c @@ -48,6 +48,7 @@ #include "port-array.h" #include "proc-net-compat.h" #include "process.h" +#include "secchan/netflow.h" #include "secchan/ofproto.h" #include "socket-util.h" #include "stp.h" @@ -242,6 +243,8 @@ static void iface_destroy(struct iface *); static struct iface *iface_lookup(const struct bridge *, const char *name); static struct iface *iface_from_dp_ifidx(const struct bridge *, uint16_t dp_ifidx); +static bool iface_is_internal(const struct bridge *, const char *name); +static void iface_set_mac(struct iface *); /* Hooks into ofproto processing. */ static struct ofhooks bridge_ofhooks; @@ -461,9 +464,13 @@ bridge_reconfigure(void) for (i = 0; i < add_ifaces.n; i++) { const char *if_name = add_ifaces.names[i]; for (;;) { - int internal = cfg_get_bool(0, "iface.%s.internal", if_name); - int error = dpif_port_add(&br->dpif, if_name, next_port_no++, - internal ? ODP_PORT_INTERNAL : 0); + bool internal; + int error; + + /* Add to datapath. */ + internal = iface_is_internal(br, if_name); + error = dpif_port_add(&br->dpif, if_name, next_port_no++, + internal ? ODP_PORT_INTERNAL : 0); if (error != EEXIST) { if (next_port_no >= 256) { VLOG_ERR("ran out of valid port numbers on dp%u", @@ -488,10 +495,7 @@ bridge_reconfigure(void) uint64_t dpid; struct iface *local_iface = NULL; const char *devname; - uint8_t engine_type = br->dpif.minor; - uint8_t engine_id = br->dpif.minor; - bool add_id_to_iface = false; - struct svec nf_hosts; + struct netflow_options nf_options; bridge_fetch_dp_ifaces(br); for (i = 0; i < br->n_ports; ) { @@ -536,34 +540,46 @@ bridge_reconfigure(void) ofproto_set_datapath_id(br->ofproto, dpid); /* Set NetFlow configuration on this bridge. */ + memset(&nf_options, 0, sizeof nf_options); + nf_options.engine_type = br->dpif.minor; + nf_options.engine_id = br->dpif.minor; + nf_options.active_timeout = -1; + if (cfg_has("netflow.%s.engine-type", br->name)) { - engine_type = cfg_get_int(0, "netflow.%s.engine-type", + nf_options.engine_type = cfg_get_int(0, "netflow.%s.engine-type", br->name); } if (cfg_has("netflow.%s.engine-id", br->name)) { - engine_id = cfg_get_int(0, "netflow.%s.engine-id", br->name); + nf_options.engine_id = cfg_get_int(0, "netflow.%s.engine-id", + br->name); + } + if (cfg_has("netflow.%s.active-timeout", br->name)) { + nf_options.active_timeout = cfg_get_int(0, + "netflow.%s.active-timeout", + br->name); } if (cfg_has("netflow.%s.add-id-to-iface", br->name)) { - add_id_to_iface = cfg_get_bool(0, "netflow.%s.add-id-to-iface", - br->name); + nf_options.add_id_to_iface = cfg_get_bool(0, + "netflow.%s.add-id-to-iface", + br->name); } - if (add_id_to_iface && engine_id > 0x7f) { + if (nf_options.add_id_to_iface && nf_options.engine_id > 0x7f) { VLOG_WARN("bridge %s: netflow port mangling may conflict with " "another vswitch, choose an engine id less than 128", br->name); } - if (add_id_to_iface && br->n_ports > 0x1ff) { + if (nf_options.add_id_to_iface && br->n_ports > 508) { VLOG_WARN("bridge %s: netflow port mangling will conflict with " - "another port when 512 or more ports are used", + "another port when more than 508 ports are used", br->name); } - svec_init(&nf_hosts); - cfg_get_all_keys(&nf_hosts, "netflow.%s.host", br->name); - if (ofproto_set_netflow(br->ofproto, &nf_hosts, engine_type, - engine_id, add_id_to_iface)) { + svec_init(&nf_options.collectors); + cfg_get_all_keys(&nf_options.collectors, "netflow.%s.host", br->name); + if (ofproto_set_netflow(br->ofproto, &nf_options)) { VLOG_ERR("bridge %s: problem setting netflow collectors", br->name); } + svec_destroy(&nf_options.collectors); /* Update the controller and related settings. It would be more * straightforward to call this from bridge_reconfigure_one(), but we @@ -579,7 +595,16 @@ bridge_reconfigure(void) LIST_FOR_EACH (br, struct bridge, node, &all_bridges) { for (i = 0; i < br->n_ports; i++) { struct port *port = br->ports[i]; + port_update_vlan_compat(port); + + for (j = 0; j < port->n_ifaces; j++) { + struct iface *iface = port->ifaces[j]; + if (iface->dp_ifidx != ODPP_LOCAL + && iface_is_internal(br, iface->name)) { + iface_set_mac(iface); + } + } } } LIST_FOR_EACH (br, struct bridge, node, &all_bridges) { @@ -1321,9 +1346,12 @@ bridge_get_all_ifaces(const struct bridge *br, struct svec *ifaces) struct iface *iface = port->ifaces[j]; svec_add(ifaces, iface->name); } + if (port->n_ifaces > 1 + && cfg_get_bool(0, "bonding.%s.fake-iface", port->name)) { + svec_add(ifaces, port->name); + } } - svec_sort(ifaces); - assert(svec_is_unique(ifaces)); + svec_sort_unique(ifaces); } /* For robustness, in case the administrator moves around datapath ports behind @@ -1508,6 +1536,7 @@ bond_enable_slave(struct iface *iface, bool enable) } iface->tag = tag_create_random(); } + port_update_bond_compat(port); } static void @@ -1657,7 +1686,7 @@ port_includes_vlan(const struct port *port, uint16_t vlan) static size_t compose_dsts(const struct bridge *br, const flow_t *flow, uint16_t vlan, const struct port *in_port, const struct port *out_port, - struct dst dsts[], tag_type *tags) + struct dst dsts[], tag_type *tags, uint16_t *nf_output_iface) { mirror_mask_t mirrors = in_port->src_mirrors; struct dst *dst = dsts; @@ -1676,7 +1705,9 @@ compose_dsts(const struct bridge *br, const flow_t *flow, uint16_t vlan, dst++; } } + *nf_output_iface = NF_OUT_FLOOD; } else if (out_port && set_dst(dst, flow, in_port, out_port, tags)) { + *nf_output_iface = dst->dp_ifidx; mirrors |= out_port->dst_mirrors; dst++; } @@ -1693,14 +1724,28 @@ compose_dsts(const struct bridge *br, const flow_t *flow, uint16_t vlan, for (i = 0; i < br->n_ports; i++) { struct port *port = br->ports[i]; if (port_includes_vlan(port, m->out_vlan) - && set_dst(dst, flow, in_port, port, tags) - && !dst_is_duplicate(dsts, dst - dsts, dst)) + && set_dst(dst, flow, in_port, port, tags)) { + int flow_vlan; + if (port->vlan < 0) { dst->vlan = m->out_vlan; } - if (dst->dp_ifidx == flow->in_port - && dst->vlan == vlan) { + if (dst_is_duplicate(dsts, dst - dsts, dst)) { + continue; + } + + /* Use the vlan tag on the original flow instead of + * the one passed in the vlan parameter. This ensures + * that we compare the vlan from before any implicit + * tagging tags place. This is necessary because + * dst->vlan is the final vlan, after removing implicit + * tags. */ + flow_vlan = ntohs(flow->dl_vlan); + if (flow_vlan == 0) { + flow_vlan = OFP_VLAN_NONE; + } + if (port == in_port && dst->vlan == flow_vlan) { /* Don't send out input port on same VLAN. */ continue; } @@ -1730,14 +1775,16 @@ print_dsts(const struct dst *dsts, size_t n) static void compose_actions(struct bridge *br, const flow_t *flow, uint16_t vlan, const struct port *in_port, const struct port *out_port, - tag_type *tags, struct odp_actions *actions) + tag_type *tags, struct odp_actions *actions, + uint16_t *nf_output_iface) { struct dst dsts[DP_MAX_PORTS * (MAX_MIRRORS + 1)]; size_t n_dsts; const struct dst *p; uint16_t cur_vlan; - n_dsts = compose_dsts(br, flow, vlan, in_port, out_port, dsts, tags); + n_dsts = compose_dsts(br, flow, vlan, in_port, out_port, dsts, tags, + nf_output_iface); cur_vlan = ntohs(flow->dl_vlan); for (p = dsts; p < &dsts[n_dsts]; p++) { @@ -1757,13 +1804,11 @@ compose_actions(struct bridge *br, const flow_t *flow, uint16_t vlan, } static bool -is_bcast_arp_reply(const flow_t *flow, const struct ofpbuf *packet) +is_bcast_arp_reply(const flow_t *flow) { - struct arp_eth_header *arp = (struct arp_eth_header *) packet->data; return (flow->dl_type == htons(ETH_TYPE_ARP) - && eth_addr_is_broadcast(flow->dl_dst) - && packet->size >= sizeof(struct arp_eth_header) - && arp->ar_op == ARP_OP_REQUEST); + && flow->nw_proto == ARP_OP_REPLY + && eth_addr_is_broadcast(flow->dl_dst)); } /* If the composed actions may be applied to any packet in the given 'flow', @@ -1772,7 +1817,7 @@ is_bcast_arp_reply(const flow_t *flow, const struct ofpbuf *packet) static bool process_flow(struct bridge *br, const flow_t *flow, const struct ofpbuf *packet, struct odp_actions *actions, - tag_type *tags) + tag_type *tags, uint16_t *nf_output_iface) { struct iface *in_iface; struct port *in_port; @@ -1863,33 +1908,27 @@ process_flow(struct bridge *br, const flow_t *flow, goto done; } - /* Multicast (and broadcast) packets on bonds need special attention, to - * avoid receiving duplicates. */ - if (in_port->n_ifaces > 1 && eth_addr_is_multicast(flow->dl_dst)) { - *tags |= in_port->active_iface_tag; - if (in_port->active_iface != in_iface->port_ifidx) { - /* Drop all multicast packets on inactive slaves. */ - goto done; - } else { - /* Drop all multicast packets for which we have learned a different - * input port, because we probably sent the packet on one slaves - * and got it back on the active slave. Broadcast ARP replies are - * an exception to this rule: the host has moved to another - * switch. */ - int src_idx = mac_learning_lookup(br->ml, flow->dl_src, vlan); - if (src_idx != -1 && src_idx != in_port->port_idx) { - if (packet) { - if (!is_bcast_arp_reply(flow, packet)) { - goto done; - } - } else { - /* No way to know whether it's an ARP reply, because the - * flow entry doesn't include enough information and we - * don't have a packet. Punt. */ - return false; - } + /* Packets received on bonds need special attention to avoid duplicates. */ + if (in_port->n_ifaces > 1) { + int src_idx; + + if (eth_addr_is_multicast(flow->dl_dst)) { + *tags |= in_port->active_iface_tag; + if (in_port->active_iface != in_iface->port_ifidx) { + /* Drop all multicast packets on inactive slaves. */ + goto done; } } + + /* Drop all packets for which we have learned a different input + * port, because we probably sent the packet on one slave and got + * it back on the other. Broadcast ARP replies are an exception + * to this rule: the host has moved to another switch. */ + src_idx = mac_learning_lookup(br->ml, flow->dl_src, vlan); + if (src_idx != -1 && src_idx != in_port->port_idx && + !is_bcast_arp_reply(flow)) { + goto done; + } } /* MAC learning. */ @@ -1919,6 +1958,11 @@ process_flow(struct bridge *br, const flow_t *flow, tags); if (out_port_idx >= 0 && out_port_idx < br->n_ports) { out_port = br->ports[out_port_idx]; + } else if (!packet) { + /* If we are revalidating but don't have a learning entry then + * eject the flow. Installing a flow that floods packets will + * prevent us from seeing future packets and learning properly. */ + return false; } } @@ -1929,17 +1973,10 @@ process_flow(struct bridge *br, const flow_t *flow, } done: - compose_actions(br, flow, vlan, in_port, out_port, tags, actions); + compose_actions(br, flow, vlan, in_port, out_port, tags, actions, + nf_output_iface); - /* - * We send out only a single packet, instead of setting up a flow, if the - * packet is an ARP directed to broadcast that arrived on a bonded - * interface. In such a situation ARP requests and replies must be handled - * differently, but OpenFlow unfortunately can't distinguish them. - */ - return (in_port->n_ifaces < 2 - || flow->dl_type != htons(ETH_TYPE_ARP) - || !eth_addr_is_broadcast(flow->dl_dst)); + return true; } /* Careful: 'opp' is in host byte order and opp->port_no is an OFP port @@ -1982,7 +2019,8 @@ bridge_port_changed_ofhook_cb(enum ofp_port_reason reason, static bool bridge_normal_ofhook_cb(const flow_t *flow, const struct ofpbuf *packet, - struct odp_actions *actions, tag_type *tags, void *br_) + struct odp_actions *actions, tag_type *tags, + uint16_t *nf_output_iface, void *br_) { struct bridge *br = br_; @@ -1995,7 +2033,7 @@ bridge_normal_ofhook_cb(const flow_t *flow, const struct ofpbuf *packet, #endif COVERAGE_INC(bridge_process_flow); - return process_flow(br, flow, packet, actions, tags); + return process_flow(br, flow, packet, actions, tags, nf_output_iface); } static void @@ -2170,8 +2208,9 @@ log_bals(const struct slave_balance *bals, size_t n_bals, struct port *port) /* Shifts 'hash' from 'from' to 'to' within 'port'. */ static void bond_shift_load(struct slave_balance *from, struct slave_balance *to, - struct bond_entry *hash) + int hash_idx) { + struct bond_entry *hash = from->hashes[hash_idx]; struct port *port = from->iface->port; uint64_t delta = hash->tx_bytes; @@ -2189,12 +2228,11 @@ bond_shift_load(struct slave_balance *from, struct slave_balance *to, * it require more work, the only purpose it would be to allow that hash to * be migrated to another slave in this rebalancing run, and there is no * point in doing that. */ - if (from->hashes[0] == hash) { + if (hash_idx == 0) { from->hashes++; } else { - int i = hash - from->hashes[0]; - memmove(from->hashes + i, from->hashes + i + 1, - (from->n_hashes - (i + 1)) * sizeof *from->hashes); + memmove(from->hashes + hash_idx, from->hashes + hash_idx + 1, + (from->n_hashes - (hash_idx + 1)) * sizeof *from->hashes); } from->n_hashes--; @@ -2279,22 +2317,60 @@ bond_rebalance_port(struct port *port) /* 'from' is carrying significantly more load than 'to', and that * load is split across at least two different hashes. Pick a hash * to migrate to 'to' (the least-loaded slave), given that doing so - * must not cause 'to''s load to exceed 'from''s load. + * must decrease the ratio of the load on the two slaves by at + * least 0.1. * * The sort order we use means that we prefer to shift away the * smallest hashes instead of the biggest ones. There is little * reason behind this decision; we could use the opposite sort * order to shift away big hashes ahead of small ones. */ size_t i; + bool order_swapped; for (i = 0; i < from->n_hashes; i++) { + double old_ratio, new_ratio; uint64_t delta = from->hashes[i]->tx_bytes; - if (to->tx_bytes + delta < from->tx_bytes - delta) { + + if (delta == 0 || from->tx_bytes - delta == 0) { + /* Pointless move. */ + continue; + } + + order_swapped = from->tx_bytes - delta < to->tx_bytes + delta; + + if (to->tx_bytes == 0) { + /* Nothing on the new slave, move it. */ + break; + } + + old_ratio = (double)from->tx_bytes / to->tx_bytes; + new_ratio = (double)(from->tx_bytes - delta) / + (to->tx_bytes + delta); + + if (new_ratio == 0) { + /* Should already be covered but check to prevent division + * by zero. */ + continue; + } + + if (new_ratio < 1) { + new_ratio = 1 / new_ratio; + } + + if (old_ratio - new_ratio > 0.1) { + /* Would decrease the ratio, move it. */ break; } } if (i < from->n_hashes) { - bond_shift_load(from, to, from->hashes[i]); + bond_shift_load(from, to, i); + port->bond_compat_is_stale = true; + + /* If the result of the migration changed the relative order of + * 'from' and 'to' swap them back to maintain invariants. */ + if (order_swapped) { + swap_bals(from, to); + } /* Re-sort 'bals'. Note that this may make 'from' and 'to' * point to different slave_balance structures. It is only @@ -2305,7 +2381,6 @@ bond_rebalance_port(struct port *port) } else { from++; } - port->bond_compat_is_stale = true; } } @@ -2331,10 +2406,7 @@ bond_send_learning_packets(struct port *port) ofpbuf_init(&packet, 128); error = n_packets = n_errors = 0; LIST_FOR_EACH (e, struct mac_entry, lru_node, &br->ml->lrus) { - static const char s[] = "Open vSwitch Bond Failover"; union ofp_action actions[2], *a; - struct eth_header *eth; - struct llc_snap_header *llc_snap; uint16_t dp_ifidx; tag_type tags = 0; flow_t flow; @@ -2345,23 +2417,6 @@ bond_send_learning_packets(struct port *port) continue; } - /* Compose packet to send. */ - ofpbuf_clear(&packet); - eth = ofpbuf_put_zeros(&packet, ETH_HEADER_LEN); - llc_snap = ofpbuf_put_zeros(&packet, LLC_SNAP_HEADER_LEN); - ofpbuf_put(&packet, s, sizeof s); /* Includes null byte. */ - ofpbuf_put(&packet, e->mac, ETH_ADDR_LEN); - - memcpy(eth->eth_dst, eth_addr_broadcast, ETH_ADDR_LEN); - memcpy(eth->eth_src, e->mac, ETH_ADDR_LEN); - eth->eth_type = htons(packet.size - ETH_HEADER_LEN); - - llc_snap->llc.llc_dsap = LLC_DSAP_SNAP; - llc_snap->llc.llc_ssap = LLC_SSAP_SNAP; - llc_snap->llc.llc_cntl = LLC_CNTL_SNAP; - memcpy(llc_snap->snap.snap_org, "\x00\x23\x20", 3); - llc_snap->snap.snap_type = htons(0xf177); /* Random number. */ - /* Compose actions. */ memset(actions, 0, sizeof actions); a = actions; @@ -2378,6 +2433,8 @@ bond_send_learning_packets(struct port *port) /* Send packet. */ n_packets++; + compose_benign_packet(&packet, "Open vSwitch Bond Failover", 0xf177, + e->mac); flow_extract(&packet, ODPP_NONE, &flow); retval = ofproto_send_packet(br->ofproto, &flow, actions, a - actions, &packet); @@ -2958,14 +3015,40 @@ port_update_bond_compat(struct port *port) struct iface *iface = port->ifaces[i]; struct compat_bond_slave *slave = &bond.slaves[i]; slave->name = iface->name; - slave->up = ((iface->enabled && iface->delay_expires == LLONG_MAX) || - (!iface->enabled && iface->delay_expires != LLONG_MAX)); + + /* We need to make the same determination as the Linux bonding + * code to determine whether a slave should be consider "up". + * The Linux function bond_miimon_inspect() supports four + * BOND_LINK_* states: + * + * - BOND_LINK_UP: carrier detected, updelay has passed. + * - BOND_LINK_FAIL: carrier lost, downdelay in progress. + * - BOND_LINK_DOWN: carrier lost, downdelay has passed. + * - BOND_LINK_BACK: carrier detected, updelay in progress. + * + * The function bond_info_show_slave() only considers BOND_LINK_UP + * to be "up" and anything else to be "down". + */ + slave->up = iface->enabled && iface->delay_expires == LLONG_MAX; if (slave->up) { bond.up = true; } memcpy(slave->mac, iface->mac, ETH_ADDR_LEN); } + if (cfg_get_bool(0, "bonding.%s.fake-iface", port->name)) { + struct netdev *bond_netdev; + + if (!netdev_open(port->name, NETDEV_ETH_TYPE_NONE, &bond_netdev)) { + if (bond.up) { + netdev_turn_flags_on(bond_netdev, NETDEV_UP, true); + } else { + netdev_turn_flags_off(bond_netdev, NETDEV_UP, true); + } + netdev_close(bond_netdev); + } + } + proc_net_compat_update_bond(port->name, &bond); free(bond.slaves); } @@ -3100,6 +3183,60 @@ iface_from_dp_ifidx(const struct bridge *br, uint16_t dp_ifidx) { return port_array_get(&br->ifaces, dp_ifidx); } + +/* Returns true if 'iface' is the name of an "internal" interface on bridge + * 'br', that is, an interface that is entirely simulated within the datapath. + * The local port (ODPP_LOCAL) is always an internal interface. Other local + * interfaces are created by setting "iface..internal = true". + * + * In addition, we have a kluge-y feature that creates an internal port with + * the name of a bonded port if "bonding..fake-iface = true" is set. + * This feature needs to go away in the long term. Until then, this is one + * reason why this function takes a name instead of a struct iface: the fake + * interfaces created this way do not have a struct iface. */ +static bool +iface_is_internal(const struct bridge *br, const char *iface) +{ + if (!strcmp(iface, br->name) + || cfg_get_bool(0, "iface.%s.internal", iface)) { + return true; + } + + if (cfg_get_bool(0, "bonding.%s.fake-iface", iface)) { + struct port *port = port_lookup(br, iface); + if (port && port->n_ifaces > 1) { + return true; + } + } + + return false; +} + +/* Set Ethernet address of 'iface', if one is specified in the configuration + * file. */ +static void +iface_set_mac(struct iface *iface) +{ + uint64_t mac = cfg_get_mac(0, "iface.%s.mac", iface->name); + if (mac) { + static uint8_t ea[ETH_ADDR_LEN]; + + eth_addr_from_uint64(mac, ea); + if (eth_addr_is_multicast(ea)) { + VLOG_ERR("interface %s: cannot set MAC to multicast address", + iface->name); + } else if (iface->dp_ifidx == ODPP_LOCAL) { + VLOG_ERR("ignoring iface.%s.mac; use bridge.%s.mac instead", + iface->name, iface->name); + } else { + int error = netdev_nodev_set_etheraddr(iface->name, ea); + if (error) { + VLOG_ERR("interface %s: setting MAC failed (%s)", + iface->name, strerror(error)); + } + } + } +} /* Port mirroring. */ @@ -3300,6 +3437,7 @@ mirror_reconfigure_one(struct mirror *m) int *vlans; size_t i; bool mirror_all_ports; + bool any_ports_specified; /* Get output port. */ out_port_name = cfg_get_key(0, "mirror.%s.%s.output.port", @@ -3338,11 +3476,18 @@ mirror_reconfigure_one(struct mirror *m) cfg_get_all_keys(&src_ports, "%s.select.src-port", pfx); cfg_get_all_keys(&dst_ports, "%s.select.dst-port", pfx); cfg_get_all_keys(&ports, "%s.select.port", pfx); + any_ports_specified = src_ports.n || dst_ports.n || ports.n; svec_append(&src_ports, &ports); svec_append(&dst_ports, &ports); svec_destroy(&ports); prune_ports(m, &src_ports); prune_ports(m, &dst_ports); + if (any_ports_specified && !src_ports.n && !dst_ports.n) { + VLOG_ERR("%s: none of the specified ports exist; " + "disabling port mirror %s", pfx, pfx); + mirror_destroy(m); + goto exit; + } /* Get all the vlans, and drop duplicate and invalid vlans. */ svec_init(&vlan_strings); @@ -3394,6 +3539,7 @@ mirror_reconfigure_one(struct mirror *m) } /* Clean up. */ +exit: svec_destroy(&src_ports); svec_destroy(&dst_ports); free(pfx);