-/* Bridge packet processing functions. */
-
-/* Returns true only when 'port->bond_mode' is BM_TCP and the LACP_NEGOTIATED
- * bit is set in 'port->lacp'. */
-static bool
-bond_is_tcp_hash(const struct port *port)
-{
-    if (port->bond_mode != BM_TCP) {
-        return false;
-    }
-    return (port->lacp & LACP_NEGOTIATED) != 0;
-}
-
-/* Hashes the ETH_ADDR_LEN bytes of 'mac' with hash_bytes(), passing 'vlan'
- * as its third argument, and reduces the result with BOND_MASK. */
-static int
-bond_hash_src(const uint8_t mac[ETH_ADDR_LEN], uint16_t vlan)
-{
-    const int bucket = hash_bytes(mac, ETH_ADDR_LEN, vlan) & BOND_MASK;
-
-    return bucket;
-}
-
-/* Hashes a copy of 'flow' whose 'vlan_tci' has been zeroed, passing 'vlan'
- * as the second argument to flow_hash_symmetric_l4(), and reduces the result
- * with BOND_MASK. */
-static int
-bond_hash_tcp(const struct flow *flow, uint16_t vlan)
-{
-    struct flow l4_flow;
-
-    /* Work on a private copy so that the caller's flow is left untouched. */
-    memcpy(&l4_flow, flow, sizeof l4_flow);
-    l4_flow.vlan_tci = 0;
-
-    /* Symmetry is not needed here; flow_hash_symmetric_l4() is simply an
-     * existing hash that is good enough for this purpose. */
-    return flow_hash_symmetric_l4(&l4_flow, vlan) & BOND_MASK;
-}
-
-/* Returns the entry of 'port->bond_hash' that 'flow' (with VLAN 'vlan')
- * hashes to.  The hash is bond_hash_tcp() when bond_is_tcp_hash() holds for
- * 'port' and bond_hash_src() on the flow's source MAC otherwise.  Must not
- * be called for a BM_AB port. */
-static struct bond_entry *
-lookup_bond_entry(const struct port *port, const struct flow *flow,
-                  uint16_t vlan)
-{
-    int bucket;
-
-    assert(port->bond_mode != BM_AB);
-
-    bucket = (bond_is_tcp_hash(port)
-              ? bond_hash_tcp(flow, vlan)
-              : bond_hash_src(flow->dl_src, vlan));
-    return &port->bond_hash[bucket];
-}
-
-/* Chooses the interface on 'port' that should carry traffic and returns its
- * index within 'port->ifaces', or -1 if no interface is eligible.
- *
- * The first enabled interface wins.  If none is enabled, the disabled
- * interface with the earliest pending 'delay_expires' is picked, provided it
- * is LACP_ATTACHED or the port has not negotiated LACP; that interface is
- * then enabled immediately, cutting its remaining delay short. */
-static int
-bond_choose_iface(const struct port *port)
-{
-    static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
-    long long next_delay_expiration = LLONG_MAX;
-    /* Kept as 'int' (not size_t) so that the -1 "none" sentinel and the
-     * function's 'int' return value never depend on unsigned wraparound. */
-    int best_down_slave = -1;
-    size_t i;
-
-    for (i = 0; i < port->n_ifaces; i++) {
-        struct iface *iface = port->ifaces[i];
-
-        if (iface->enabled) {
-            return i;
-        } else if (iface->delay_expires < next_delay_expiration
-                   && (iface->lacp_status & LACP_ATTACHED
-                       || !(port->lacp & LACP_NEGOTIATED))) {
-            best_down_slave = i;
-            next_delay_expiration = iface->delay_expires;
-        }
-    }
-
-    if (best_down_slave != -1) {
-        struct iface *iface = port->ifaces[best_down_slave];
-
-        VLOG_INFO_RL(&rl, "interface %s: skipping remaining %lli ms updelay "
-                     "since no other interface is up", iface->name,
-                     iface->delay_expires - time_msec());
-        bond_enable_slave(iface, true);
-    }
-
-    return best_down_slave;
-}
-
-/* Selects the interface of 'port' on which 'flow' (with VLAN 'vlan') should
- * be output.  On success stores the interface's datapath index in
- * '*dp_ifidx', ORs the relevant tags into '*tags' and returns true.  If no
- * interface is usable, ORs 'port->no_ifaces_tag' into '*tags' and returns
- * false. */
-static bool
-choose_output_iface(const struct port *port, const struct flow *flow,
-                    uint16_t vlan, uint16_t *dp_ifidx, tag_type *tags)
-{
-    struct iface *chosen;
-
-    assert(port->n_ifaces);
-    if (port->n_ifaces == 1) {
-        chosen = port->ifaces[0];
-    } else if (port->bond_mode == BM_AB) {
-        if (port->active_iface < 0) {
-            goto no_iface;
-        }
-        chosen = port->ifaces[port->active_iface];
-    } else {
-        struct bond_entry *entry = lookup_bond_entry(port, flow, vlan);
-        bool stale = (entry->iface_idx < 0
-                      || entry->iface_idx >= port->n_ifaces
-                      || !port->ifaces[entry->iface_idx]->enabled);
-
-        if (stale) {
-            /* XXX select interface properly.  The current interface
-             * selection is only good for testing the rebalancing code. */
-            entry->iface_idx = bond_choose_iface(port);
-            if (entry->iface_idx < 0) {
-                goto no_iface;
-            }
-            entry->iface_tag = tag_create_random();
-        }
-        *tags |= entry->iface_tag;
-        chosen = port->ifaces[entry->iface_idx];
-    }
-
-    *dp_ifidx = chosen->dp_ifidx;
-    *tags |= chosen->tag;       /* Currently only used for bonding. */
-    return true;
-
-no_iface:
-    *tags |= port->no_ifaces_tag;
-    return false;
-}
-
-static void
-bond_link_status_update(struct iface *iface)
-{ /* Reconciles 'iface->enabled' with the interface's current link state.
-   * Computes the effective "up" state, then either cancels a pending
-   * transition, enables the interface at once, or records in
-   * 'iface->delay_expires' when the transition should take effect
-   * (bond_run() applies it once that time has passed). */
- static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
- struct port *port = iface->port;
- bool up = iface->up;
- int updelay, downdelay;
-
- updelay = port->updelay;
- downdelay = port->downdelay;
-
- if (iface->port->lacp & LACP_NEGOTIATED) { /* Negotiated LACP: no delays. */
- downdelay = 0;
- updelay = 0;
- }
-
- if (iface->port->lacp && up) {
- /* The interface is up if it's attached to an aggregator and its
- * partner is synchronized. The only exception is defaulted links.
- * They are not required to have synchronized partners because they
- * have no partners at all. However, they will only be attached if
- * negotiations failed on all interfaces in the bond. */
- up = iface->lacp_status & LACP_ATTACHED
- && (iface->lacp_partner.state & LACP_STATE_SYNC
- || iface->lacp_status & LACP_DEFAULTED);
- }
-
-
- if ((up == iface->enabled) == (iface->delay_expires == LLONG_MAX)) {
- /* Nothing to do: either the link state already matches 'enabled' and
- * no transition is pending, or it differs and a transition is already
- * pending. */
- return;
- }
- VLOG_INFO_RL(&rl, "interface %s: link state %s",
- iface->name, up ? "up" : "down");
- if (up == iface->enabled) { /* Pending transition became moot: cancel. */
- iface->delay_expires = LLONG_MAX;
- VLOG_INFO_RL(&rl, "interface %s: will not be %s",
- iface->name, up ? "disabled" : "enabled");
- } else if (up && port->active_iface < 0) { /* No active iface: skip delay. */
- bond_enable_slave(iface, true);
- if (updelay) {
- VLOG_INFO_RL(&rl, "interface %s: skipping %d ms updelay since no "
- "other interface is up", iface->name, updelay);
- }
- } else { /* Otherwise schedule the transition after the configured delay. */
- int delay = up ? updelay : downdelay;
- iface->delay_expires = time_msec() + delay;
- if (delay) {
- VLOG_INFO_RL(&rl,
- "interface %s: will be %s if it stays %s for %d ms",
- iface->name,
- up ? "enabled" : "disabled",
- up ? "up" : "down",
- delay);
- }
- }
-}
-
-/* Re-elects the active interface of 'port' with bond_choose_iface(), gives
- * the port a fresh 'active_iface_tag', and logs the outcome. */
-static void
-bond_choose_active_iface(struct port *port)
-{
-    static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
-
-    port->active_iface = bond_choose_iface(port);
-    port->active_iface_tag = tag_create_random();
-
-    if (port->active_iface < 0) {
-        VLOG_WARN_RL(&rl, "port %s: all ports disabled, no active interface",
-                     port->name);
-        return;
-    }
-
-    VLOG_INFO_RL(&rl, "port %s: active interface is now %s",
-                 port->name, port->ifaces[port->active_iface]->name);
-}
-
-static void
-bond_enable_slave(struct iface *iface, bool enable)
-{ /* Sets 'iface->enabled' to 'enable' and clears any pending delay.  When
-   * the state actually changes, revalidates the affected tags, re-elects
-   * the port's active interface where needed and sends learning packets. */
- struct port *port = iface->port;
- struct bridge *br = port->bridge;
-
- /* This acts as a recursion check. If the act of disabling a slave
- * causes a different slave to be enabled, the flag will allow us to
- * skip redundant work when we reenter this function. It must be
- * cleared on exit to keep things safe with multiple bonds. */
- static bool moving_active_iface = false;
-
- iface->delay_expires = LLONG_MAX; /* Cancel any pending transition. */
- if (enable == iface->enabled) { /* Already in the requested state. */
- return;
- }
-
- iface->enabled = enable;
- if (!iface->enabled) {
- VLOG_WARN("interface %s: disabled", iface->name);
- ofproto_revalidate(br->ofproto, iface->tag);
- if (iface->port_ifidx == port->active_iface) {
- ofproto_revalidate(br->ofproto,
- port->active_iface_tag);
-
- /* Disabling a slave can lead to another slave being immediately
- * enabled if there will be no active slaves but one is waiting
- * on an updelay. In this case we do not need to run most of the
- * code for the newly enabled slave since there was no period
- * without an active slave and it is redundant with the disabling
- * path. */
- moving_active_iface = true;
- bond_choose_active_iface(port);
- }
- bond_send_learning_packets(port);
- } else {
- VLOG_WARN("interface %s: enabled", iface->name);
- if (port->active_iface < 0 && !moving_active_iface) {
- ofproto_revalidate(br->ofproto, port->no_ifaces_tag);
- bond_choose_active_iface(port);
- bond_send_learning_packets(port);
- }
- iface->tag = tag_create_random(); /* Fresh tag for the enabled iface. */
- }
-
- moving_active_iface = false;
-}
-
-/* Adds up the packet and byte counters of every slave of 'port' whose
- * statistics can be read and, if the netdev named 'port->name' can be
- * opened, stores the totals on it with rx and tx exchanged. */
-static void
-bond_update_fake_iface_stats(struct port *port)
-{
-    struct netdev_stats totals;
-    struct netdev *fake_dev;
-    size_t k;
-
-    memset(&totals, 0, sizeof totals);
-
-    for (k = 0; k < port->n_ifaces; k++) {
-        struct netdev_stats stats;
-
-        if (netdev_get_stats(port->ifaces[k]->netdev, &stats)) {
-            /* Statistics unavailable for this slave: leave it out. */
-            continue;
-        }
-
-        /* XXX: The counters are swapped here because the internal device
-         * swaps them back when it reports them.  Internal devices normally
-         * represent packets going into the system, but as a fake bond
-         * device they represent packets leaving it.  The swap really
-         * belongs in the internal device itself, since doing it here
-         * reverses the counts from the switch's point of view, but the
-         * internal device does not know what kind of device it represents,
-         * so for now it has to happen here. */
-        totals.tx_packets += stats.rx_packets;
-        totals.tx_bytes += stats.rx_bytes;
-        totals.rx_packets += stats.tx_packets;
-        totals.rx_bytes += stats.tx_bytes;
-    }
-
-    if (netdev_open_default(port->name, &fake_dev)) {
-        return;
-    }
-    netdev_set_stats(fake_dev, &totals);
-    netdev_close(fake_dev);
-}
-
-/* Periodic bonding work for 'port'.  Does nothing for ports with fewer than
- * two interfaces.  Otherwise refreshes every interface's link status, flips
- * the enabled state of each interface whose delay has expired, and, for
- * ports with a fake bond interface, refreshes its statistics when due. */
-static void
-bond_run(struct port *port)
-{
-    size_t k;
-
-    if (port->n_ifaces < 2) {
-        return;
-    }
-
-    for (k = 0; k < port->n_ifaces; k++) {
-        bond_link_status_update(port->ifaces[k]);
-    }
-
-    for (k = 0; k < port->n_ifaces; k++) {
-        struct iface *slave = port->ifaces[k];
-
-        if (time_msec() >= slave->delay_expires) {
-            bond_enable_slave(slave, !slave->enabled);
-        }
-    }
-
-    if (port->bond_fake_iface) {
-        if (time_msec() >= port->bond_next_fake_iface_update) {
-            bond_update_fake_iface_stats(port);
-            port->bond_next_fake_iface_update = time_msec() + 1000;
-        }
-    }
-}
-
-/* Registers poll-loop wakeups for 'port': one for every interface with a
- * pending delay and, if the port has a fake bond interface, one for its
- * next statistics update.  Does nothing for ports with fewer than two
- * interfaces. */
-static void
-bond_wait(struct port *port)
-{
-    size_t k;
-
-    if (port->n_ifaces < 2) {
-        return;
-    }
-
-    for (k = 0; k < port->n_ifaces; k++) {
-        const struct iface *slave = port->ifaces[k];
-
-        if (slave->delay_expires == LLONG_MAX) {
-            continue;
-        }
-        poll_timer_wait_until(slave->delay_expires);
-    }
-
-    if (port->bond_fake_iface) {
-        poll_timer_wait_until(port->bond_next_fake_iface_update);
-    }
-}
-
-/* Fills in 'dst' for sending 'flow', received on 'in_port', out 'out_port'.
- *
- * 'dst->vlan' becomes OFP_VLAN_NONE when 'out_port' has an implicit VLAN;
- * otherwise the implicit VLAN of 'in_port' if it has one; otherwise the VID
- * from the flow's TCI, or OFP_VLAN_NONE when the TCI is zero.
- *
- * Returns the result of choose_output_iface(), which also fills in
- * 'dst->dp_ifidx' and updates '*tags'. */
-static bool
-set_dst(struct dst *dst, const struct flow *flow,
-        const struct port *in_port, const struct port *out_port,
-        tag_type *tags)
-{
-    if (out_port->vlan >= 0) {
-        dst->vlan = OFP_VLAN_NONE;
-    } else if (in_port->vlan >= 0) {
-        dst->vlan = in_port->vlan;
-    } else if (flow->vlan_tci == 0) {
-        dst->vlan = OFP_VLAN_NONE;
-    } else {
-        dst->vlan = vlan_tci_to_vid(flow->vlan_tci);
-    }
-
-    return choose_output_iface(out_port, flow, dst->vlan,
-                               &dst->dp_ifidx, tags);
-}
-
-/* Exchanges the contents of '*p' and '*q'. */
-static void
-swap_dst(struct dst *p, struct dst *q)
-{
-    const struct dst saved_q = *q;
-
-    *q = *p;
-    *p = saved_q;
-}
-
-/* Moves all the dsts in 'set' with vlan == 'vlan' to the front of
- * 'set->dsts', working in place on its first 'set->n' elements. The relative
- * order of elements within each of the two groups is not necessarily kept,
- * because elements are exchanged from opposite ends with swap_dst().
- *
- * (This may help performance by reducing the number of VLAN changes
- * that we push to the datapath. We could in fact fully sort the array by
- * vlan, but in most cases there are at most two different vlan tags so that's
- * possibly overkill.) */
-static void
-partition_dsts(struct dst_set *set, int vlan)
-{
- struct dst *first = set->dsts;
- struct dst *last = set->dsts + set->n; /* One past the final element. */
-
- while (first != last) {
- /* Invariants:
- * - All dsts < first have vlan == 'vlan'.
- * - All dsts >= last have vlan != 'vlan'.
- * - first < last. */
- while (first->vlan == vlan) {
- if (++first == last) {
- return;
- }
- }
-
- /* Same invariants, plus one additional:
- * - first->vlan != vlan.
- */
- while (last[-1].vlan != vlan) {
- if (--last == first) {
- return;
- }
- }
-
- /* Same invariants, plus one additional:
- * - last[-1].vlan == vlan.*/
- swap_dst(first++, --last);
- }
-}
-
-/* Returns ffs() of 'mask'.  The build-time assertion guarantees that a
- * mirror_mask_t is no wider than 'unsigned int', so no bits are lost when
- * 'mask' is passed to ffs(). */
-static int
-mirror_mask_ffs(mirror_mask_t mask)
-{
-    BUILD_ASSERT_DECL(sizeof(mask) <= sizeof(unsigned int));
-    return ffs(mask);
-}
-
-/* Initializes 'set' as empty, backed by its built-in array. */
-static void
-dst_set_init(struct dst_set *set)
-{
-    set->n = 0;
-    set->allocated = ARRAY_SIZE(set->builtin);
-    set->dsts = set->builtin;
-}
-
-/* Appends a copy of '*dst' to 'set'.  When the set is full its storage is
- * first replaced by a heap array of twice the capacity. */
-static void
-dst_set_add(struct dst_set *set, const struct dst *dst)
-{
-    if (set->n >= set->allocated) {
-        const size_t grown = set->allocated * 2;
-        struct dst *bigger = xmalloc(grown * sizeof *bigger);
-
-        memcpy(bigger, set->dsts, set->n * sizeof *bigger);
-
-        /* Releases the old array unless it is the built-in one. */
-        dst_set_free(set);
-
-        set->dsts = bigger;
-        set->allocated = grown;
-    }
-
-    set->dsts[set->n] = *dst;
-    set->n++;
-}
-
-/* Frees the storage of 'set' if it was heap-allocated; the built-in array
- * needs no freeing. */
-static void
-dst_set_free(struct dst_set *set)
-{
-    if (set->dsts == set->builtin) {
-        return;
-    }
-    free(set->dsts);
-}
-
-/* Returns true if 'set' already holds a dst with the same 'vlan' and
- * 'dp_ifidx' as 'test', false otherwise. */
-static bool
-dst_is_duplicate(const struct dst_set *set, const struct dst *test)
-{
-    const struct dst *end = set->dsts + set->n;
-    const struct dst *d;
-
-    for (d = set->dsts; d < end; d++) {
-        if (d->vlan != test->vlan) {
-            continue;
-        }
-        if (d->dp_ifidx == test->dp_ifidx) {
-            return true;
-        }
-    }
-    return false;
-}
-
-/* Returns true if 'port' has no implicit VLAN ('port->vlan' is negative) and
- * either has no 'trunks' bitmap or has 'vlan' set in it. */
-static bool
-port_trunks_vlan(const struct port *port, uint16_t vlan)
-{
-    if (port->vlan >= 0) {
-        return false;
-    }
-    return !port->trunks || bitmap_is_set(port->trunks, vlan);
-}
-
-/* Returns true if 'vlan' equals 'port->vlan' or port_trunks_vlan() holds for
- * it. */
-static bool
-port_includes_vlan(const struct port *port, uint16_t vlan)
-{
-    if (vlan == port->vlan) {
-        return true;
-    }
-    return port_trunks_vlan(port, vlan);
-}
-
-/* Returns true if ofproto_port_is_floodable() holds for every interface of
- * 'port' (and therefore also for a port with no interfaces), false as soon
- * as one interface is not floodable. */
-static bool
-port_is_floodable(const struct port *port)
-{
-    /* 'size_t', like every other loop over 'port->n_ifaces' in this file,
-     * so the loop condition does not mix signed and unsigned operands. */
-    size_t i;
-
-    for (i = 0; i < port->n_ifaces; i++) {
-        if (!ofproto_port_is_floodable(port->bridge->ofproto,
-                                       port->ifaces[i]->dp_ifidx)) {
-            return false;
-        }
-    }
-    return true;
-}
-
-static void
-compose_dsts(const struct bridge *br, const struct flow *flow, uint16_t vlan,
- const struct port *in_port, const struct port *out_port,
- struct dst_set *set, tag_type *tags, uint16_t *nf_output_iface)
-{ /* Adds to 'set' the destinations for 'flow' received on 'in_port': first
-   * the regular output(s) selected by 'out_port' (every eligible port when
-   * it is FLOOD_PORT, none when it is NULL), then the outputs of every
-   * mirror selected by the input port's source mirrors and the chosen
-   * ports' destination mirrors.  Tags are accumulated into '*tags'.
-   * '*nf_output_iface' is set to NF_OUT_FLOOD or to the single output's
-   * dp_ifidx, and is left untouched when there is no regular output.
-   * Finally the dsts whose VLAN equals the flow's original VLAN are moved
-   * to the front of 'set'. */
- mirror_mask_t mirrors = in_port->src_mirrors;
- struct dst dst;
- int flow_vlan;
- size_t i;
-
- flow_vlan = vlan_tci_to_vid(flow->vlan_tci);
- if (flow_vlan == 0) {
- flow_vlan = OFP_VLAN_NONE;
- }
-
- if (out_port == FLOOD_PORT) {
- for (i = 0; i < br->n_ports; i++) {
- struct port *port = br->ports[i];
- if (port != in_port
- && port_is_floodable(port)
- && port_includes_vlan(port, vlan)
- && !port->is_mirror_output_port
- && set_dst(&dst, flow, in_port, port, tags)) {
- mirrors |= port->dst_mirrors;
- dst_set_add(set, &dst);
- }
- }
- *nf_output_iface = NF_OUT_FLOOD;
- } else if (out_port && set_dst(&dst, flow, in_port, out_port, tags)) {
- dst_set_add(set, &dst);
- *nf_output_iface = dst.dp_ifidx;
- mirrors |= out_port->dst_mirrors;
- }
-
- while (mirrors) {
- struct mirror *m = br->mirrors[mirror_mask_ffs(mirrors) - 1];
- if (!m->n_vlans || vlan_is_mirrored(m, vlan)) {
- if (m->out_port) { /* Mirror to a single output port. */
- if (set_dst(&dst, flow, in_port, m->out_port, tags)
- && !dst_is_duplicate(set, &dst)) {
- dst_set_add(set, &dst);
- }
- } else { /* Mirror to every port that carries 'm->out_vlan'. */
- for (i = 0; i < br->n_ports; i++) {
- struct port *port = br->ports[i];
- if (port_includes_vlan(port, m->out_vlan)
- && set_dst(&dst, flow, in_port, port, tags))
- {
- if (port->vlan < 0) {
- dst.vlan = m->out_vlan;
- }
- if (dst_is_duplicate(set, &dst)) {
- continue;
- }
-
- /* Use the vlan tag on the original flow instead of
- * the one passed in the vlan parameter. This ensures
- * that we compare the vlan from before any implicit
- * tagging takes place. This is necessary because
- * dst->vlan is the final vlan, after removing implicit
- * tags. */
- if (port == in_port && dst.vlan == flow_vlan) {
- /* Don't send out input port on same VLAN. */
- continue;
- }
- dst_set_add(set, &dst);
- }
- }
- }
- }
- mirrors &= mirrors - 1; /* Clear the lowest set bit, just handled. */
- }
-
- partition_dsts(set, flow_vlan);
-}
-
-/* Debugging aid: prints each dst in 'set' to stdout as ">p<dp_ifidx>",
- * followed by "v<vlan>" unless its vlan is OFP_VLAN_NONE. */
-static void OVS_UNUSED
-print_dsts(const struct dst_set *set)
-{
-    const struct dst *end = set->dsts + set->n;
-    const struct dst *d;
-
-    for (d = set->dsts; d < end; d++) {
-        printf(">p%"PRIu16, d->dp_ifidx);
-        if (d->vlan == OFP_VLAN_NONE) {
-            continue;
-        }
-        printf("v%"PRIu16, d->vlan);
-    }
-}
-
-/* Appends to 'actions' the datapath actions that deliver 'flow' to every
- * destination computed by compose_dsts().  A VLAN action (strip, or set TCI
- * keeping the flow's PCP bits) is emitted only when a destination's VLAN
- * differs from the VLAN currently in effect, which starts out as the flow's
- * own VID (OFP_VLAN_NONE if that is 0). */
-static void
-compose_actions(struct bridge *br, const struct flow *flow, uint16_t vlan,
-                const struct port *in_port, const struct port *out_port,
-                tag_type *tags, struct ofpbuf *actions,
-                uint16_t *nf_output_iface)
-{
-    struct dst_set dsts;
-    uint16_t active_vlan;
-    size_t k;
-
-    dst_set_init(&dsts);
-    compose_dsts(br, flow, vlan, in_port, out_port, &dsts, tags,
-                 nf_output_iface);
-
-    active_vlan = vlan_tci_to_vid(flow->vlan_tci);
-    if (!active_vlan) {
-        active_vlan = OFP_VLAN_NONE;
-    }
-
-    for (k = 0; k < dsts.n; k++) {
-        const struct dst *d = &dsts.dsts[k];
-
-        if (d->vlan != active_vlan) {
-            if (d->vlan != OFP_VLAN_NONE) {
-                ovs_be16 tci = htons(d->vlan & VLAN_VID_MASK);
-
-                tci |= flow->vlan_tci & htons(VLAN_PCP_MASK);
-                nl_msg_put_be16(actions, ODP_ACTION_ATTR_SET_DL_TCI, tci);
-            } else {
-                nl_msg_put_flag(actions, ODP_ACTION_ATTR_STRIP_VLAN);
-            }
-            active_vlan = d->vlan;
-        }
-        nl_msg_put_u32(actions, ODP_ACTION_ATTR_OUTPUT, d->dp_ifidx);
-    }
-
-    dst_set_free(&dsts);
-}
-
-/* Returns the effective VLAN of a packet in 'flow' received on 'in_port',
- * combining the 802.1Q header with the port's implicit VLAN, if any.  0
- * means that the packet is untagged; -1 means that the packet is not
- * acceptable on 'in_port' and should be dropped.  A warning is logged for a
- * dropped packet only when 'have_packet' is true. */
-static int
-flow_get_vlan(struct bridge *br, const struct flow *flow,
-              struct port *in_port, bool have_packet)
-{
-    int vid = vlan_tci_to_vid(flow->vlan_tci);
-
-    if (in_port->vlan < 0) {
-        /* No implicit VLAN: the port must carry the packet's VLAN. */
-        if (port_includes_vlan(in_port, vid)) {
-            return vid;
-        }
-        if (have_packet) {
-            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
-            VLOG_WARN_RL(&rl, "bridge %s: dropping VLAN %d tagged "
-                         "packet received on port %s not configured for "
-                         "trunking VLAN %d",
-                         br->name, vid, in_port->name, vid);
-        }
-        return -1;
-    }
-
-    /* Implicit VLAN: only untagged packets are accepted. */
-    if (!vid) {
-        return in_port->vlan;
-    }
-
-    /* XXX support double tagging? */
-    if (have_packet) {
-        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
-        VLOG_WARN_RL(&rl, "bridge %s: dropping VLAN %d tagged "
-                     "packet received on port %s configured with "
-                     "implicit VLAN %"PRIu16,
-                     br->name, vid, in_port->name, in_port->vlan);
-    }
-    return -1;
-}
-
-/* Returns true if 'flow' is a gratuitous ARP: a broadcast ARP that is either
- * a reply, or a request whose source and destination network addresses are
- * equal.
- *
- * A VM broadcasts a gratuitous ARP to announce that it has resumed after
- * migration.  Older Citrix-patched Linux DomU used gratuitous ARP replies
- * for this; newer upstream kernels use gratuitous ARP requests. */
-static bool
-is_gratuitous_arp(const struct flow *flow)
-{
-    if (flow->dl_type != htons(ETH_TYPE_ARP)) {
-        return false;
-    }
-    if (!eth_addr_is_broadcast(flow->dl_dst)) {
-        return false;
-    }
-    if (flow->nw_proto == ARP_OP_REPLY) {
-        return true;
-    }
-    return flow->nw_proto == ARP_OP_REQUEST && flow->nw_src == flow->nw_dst;
-}
-
-/* Teaches the MAC learning table of 'br' that the source MAC of 'flow' lives
- * on 'in_port' in 'vlan'.  If mac_learning_learn() returns a nonzero tag,
- * logs the event and revalidates that tag. */
-static void
-update_learning_table(struct bridge *br, const struct flow *flow, int vlan,
-                      struct port *in_port)
-{
-    /* The log messages here could actually be useful in debugging, so keep
-     * the rate limit relatively high. */
-    static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(30, 300);
-    enum grat_arp_lock_type lock;
-    tag_type changed_tag;
-
-    /* Gratuitous ARPs reflected back over bond slaves must not be learned
-     * from, so the learning table is locked for them. */
-    if (!is_gratuitous_arp(flow)) {
-        lock = GRAT_ARP_LOCK_NONE;
-    } else if (in_port->n_ifaces == 1) {
-        lock = GRAT_ARP_LOCK_SET;
-    } else {
-        lock = GRAT_ARP_LOCK_CHECK;
-    }
-
-    changed_tag = mac_learning_learn(br->ml, flow->dl_src, vlan,
-                                     in_port->port_idx, lock);
-    if (!changed_tag) {
-        return;
-    }
-
-    VLOG_DBG_RL(&rl, "bridge %s: learned that "ETH_ADDR_FMT" is "
-                "on port %s in VLAN %d",
-                br->name, ETH_ADDR_ARGS(flow->dl_src),
-                in_port->name, vlan);
-    ofproto_revalidate(br->ofproto, changed_tag);
-}
-
-/* Determines whether packets in 'flow' within 'br' should be forwarded or
- * dropped. Returns true if they may be forwarded, false if they should be
- * dropped.
- *
- * If 'have_packet' is true, it indicates that the caller is processing a
- * received packet. If 'have_packet' is false, then the caller is just
- * revalidating an existing flow because configuration has changed. Either
- * way, 'have_packet' only affects logging (there is no point in logging errors
- * during revalidation).
- *
- * Sets '*in_portp' to the input port. This will be a null pointer if
- * flow->in_port does not designate a known input port (in which case
- * is_admissible() returns false).
- *
- * When returning true, sets '*vlanp' to the effective VLAN of the input
- * packet, as returned by flow_get_vlan(). '*vlanp' is also written (possibly
- * with a negative value) whenever the input port is known, even if the
- * function then returns false; it is left untouched when the input port is
- * unknown.
- *
- * May also add tags to '*tags', although the current implementation only does
- * so in one special case.
- */
-static bool
-is_admissible(struct bridge *br, const struct flow *flow, bool have_packet,
- tag_type *tags, int *vlanp, struct port **in_portp)
-{
- struct iface *in_iface;
- struct port *in_port;
- int vlan;
-
- /* Find the interface and port structure for the received packet. */
- in_iface = iface_from_dp_ifidx(br, flow->in_port);
- if (!in_iface) {
- /* No interface? Something fishy... */
- if (have_packet) {
- /* Odd. A few possible reasons here:
- *
- * - We deleted an interface but there are still a few packets
- * queued up from it.
- *
- * - Someone externally added an interface (e.g. with "ovs-dpctl
- * add-if") that we don't know about.
- *
- * - Packet arrived on the local port but the local port is not
- * one of our bridge ports.
- */
- static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
-
- VLOG_WARN_RL(&rl, "bridge %s: received packet on unknown "
- "interface %"PRIu16, br->name, flow->in_port);
- }
-
- *in_portp = NULL;
- return false;
- }
- *in_portp = in_port = in_iface->port;
- *vlanp = vlan = flow_get_vlan(br, flow, in_port, have_packet);
- if (vlan < 0) { /* flow_get_vlan() rejected the packet. */
- return false;
- }
-
- /* Drop frames for reserved multicast addresses. */
- if (eth_addr_is_reserved(flow->dl_dst)) {
- return false;
- }
-
- /* Drop frames on ports reserved for mirroring. */
- if (in_port->is_mirror_output_port) {
- if (have_packet) {
- static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
- VLOG_WARN_RL(&rl, "bridge %s: dropping packet received on port "
- "%s, which is reserved exclusively for mirroring",
- br->name, in_port->name);
- }
- return false;
- }
-
- /* When using LACP, do not accept packets from disabled interfaces. */
- if (in_port->lacp & LACP_NEGOTIATED && !in_iface->enabled) {
- return false;
- }
-
- /* Packets received on non-LACP bonds need special attention to avoid
- * duplicates. */
- if (in_port->n_ifaces > 1 && !(in_port->lacp & LACP_NEGOTIATED)) {
- int src_idx;
- bool is_grat_arp_locked;
-
- if (eth_addr_is_multicast(flow->dl_dst)) {
- *tags |= in_port->active_iface_tag; /* The one case that adds a tag. */
- if (in_port->active_iface != in_iface->port_ifidx) {
- /* Drop all multicast packets on inactive slaves. */
- return false;
- }
- }
-
- /* Drop all packets for which we have learned a different input
- * port, because we probably sent the packet on one slave and got
- * it back on the other. Gratuitous ARP packets are an exception
- * to this rule: the host has moved to another switch. The exception
- * to the exception is if we locked the learning table to avoid
- * reflections on bond slaves. If this is the case, just drop the
- * packet now. */
- src_idx = mac_learning_lookup(br->ml, flow->dl_src, vlan,
- &is_grat_arp_locked);
- if (src_idx != -1 && src_idx != in_port->port_idx &&
- (!is_gratuitous_arp(flow) || is_grat_arp_locked)) {
- return false;
- }
- }
-
- return true;
-}
-
-/* Composes into 'actions' the datapath actions for 'flow' on 'br'.
- *
- * If the composed actions may be applied to any packet in the given 'flow',
- * returns true. Otherwise, the actions should only be applied to 'packet', or
- * not at all, if 'packet' was NULL.
- *
- * 'packet' is NULL when the caller is revalidating a flow rather than
- * handling a received packet; in that case nothing is learned from the flow.
- * The output port comes from a MAC learning lookup of the flow's destination;
- * without a usable entry the flow is flooded, except that a non-multicast
- * flow under revalidation is rejected by returning false. A flow that is not
- * admissible, or whose output port equals its input port, gets no regular
- * output, but compose_actions() still runs whenever the input port is known.
- * Tags are accumulated into '*tags'; '*nf_output_iface' is passed on to
- * compose_actions(). */
-static bool
-process_flow(struct bridge *br, const struct flow *flow,
- const struct ofpbuf *packet, struct ofpbuf *actions,
- tag_type *tags, uint16_t *nf_output_iface)
-{
- struct port *in_port;
- struct port *out_port;
- int vlan;
- int out_port_idx;
-
- /* Check whether we should drop packets in this flow. */
- if (!is_admissible(br, flow, packet != NULL, tags, &vlan, &in_port)) {
- out_port = NULL;
- goto done;
- }
-
- /* Learn source MAC (but don't try to learn from revalidation). */
- if (packet) {
- update_learning_table(br, flow, vlan, in_port);
- }
-
- /* Determine output port. */
- out_port_idx = mac_learning_lookup_tag(br->ml, flow->dl_dst, vlan, tags,
- NULL);
- if (out_port_idx >= 0 && out_port_idx < br->n_ports) {
- out_port = br->ports[out_port_idx];
- } else if (!packet && !eth_addr_is_multicast(flow->dl_dst)) {
- /* If we are revalidating but don't have a learning entry then
- * eject the flow. Installing a flow that floods packets opens
- * up a window of time where we could learn from a packet reflected
- * on a bond and blackhole packets before the learning table is
- * updated to reflect the correct port. */
- return false;
- } else {
- out_port = FLOOD_PORT;
- }
-
- /* Don't send packets out their input ports. */
- if (in_port == out_port) {
- out_port = NULL;
- }
-
-done:
- if (in_port) { /* NULL only when the input interface was unknown. */
- compose_actions(br, flow, vlan, in_port, out_port, tags, actions,
- nf_output_iface);
- }
-
- return true;
-}
-
-/* ofhooks callback: bumps the 'bridge_process_flow' coverage counter and
- * hands the flow to process_flow() for the bridge passed in 'br_'. */
-static bool
-bridge_normal_ofhook_cb(const struct flow *flow, const struct ofpbuf *packet,
-                        struct ofpbuf *actions, tag_type *tags,
-                        uint16_t *nf_output_iface, void *br_)
-{
-    struct bridge *bridge = br_;
-    bool any_packet;
-
-    COVERAGE_INC(bridge_process_flow);
-    any_packet = process_flow(bridge, flow, packet, actions, tags,
-                              nf_output_iface);
-    return any_packet;
-}
-
-/* ofhooks callback for CFM and LACP traffic.  Returns false for flows that
- * cfm_should_process_flow() accepts and for flows whose Ethernet type is
- * ETH_TYPE_LACP, true for everything else.  When a packet is supplied and
- * its input interface is known, the packet is also passed to the CFM (if
- * the interface has one) or LACP handler. */
-static bool
-bridge_special_ofhook_cb(const struct flow *flow,
-                         const struct ofpbuf *packet, void *br_)
-{
-    struct bridge *br = br_;
-    struct iface *in_iface = iface_from_dp_ifidx(br, flow->in_port);
-
-    if (cfm_should_process_flow(flow)) {
-        if (in_iface && packet && in_iface->cfm) {
-            COVERAGE_INC(bridge_process_cfm);
-            cfm_process_heartbeat(in_iface->cfm, packet);
-        }
-        return false;
-    }
-
-    if (flow->dl_type == htons(ETH_TYPE_LACP)) {
-        if (in_iface && packet) {
-            COVERAGE_INC(bridge_process_lacp);
-            lacp_process_packet(packet, in_iface);
-        }
-        return false;
-    }
-
-    return true;
-}
-
-/* ofhooks callback that accounts 'n_bytes' of traffic for 'flow', whose
- * datapath actions are the 'actions_len' bytes at 'actions'.  Refreshes the
- * learning table from the flow and, on bridges with bonded ports, charges
- * the bytes to the bond hash entry of every non-active-backup bond that the
- * flow is output to. */
-static void
-bridge_account_flow_ofhook_cb(const struct flow *flow, tag_type tags,
-                              const struct nlattr *actions,
-                              size_t actions_len,
-                              uint64_t n_bytes, void *br_)
-{
-    struct bridge *br = br_;
-    const struct nlattr *attr;
-    struct port *in_port;
-    tag_type scratch_tags = 0;
-    unsigned int left;
-    int vlan;
-
-    /* Feed information from the active flows back into the learning table
-     * so that the table stays in sync with what is actually flowing through
-     * the datapath.
-     *
-     * Testing that 'tags' is nonzero restricts learning to flows that
-     * include an OFPP_NORMAL action.  This works because
-     * bridge_normal_ofhook_cb() always sets a nonzero tag value. */
-    if (tags
-        && is_admissible(br, flow, false, &scratch_tags, &vlan, &in_port)) {
-        update_learning_table(br, flow, vlan, in_port);
-    }
-
-    /* Account for bond slave utilization. */
-    if (br->has_bonded_ports) {
-        NL_ATTR_FOR_EACH_UNSAFE (attr, left, actions, actions_len) {
-            struct port *out_port;
-            struct bond_entry *entry;
-            uint16_t hash_vlan;
-
-            if (nl_attr_type(attr) == ODP_ACTION_ATTR_OUTPUT) {
-                out_port = port_from_dp_ifidx(br, nl_attr_get_u32(attr));
-                if (out_port && out_port->n_ifaces >= 2
-                    && out_port->bond_mode != BM_AB) {
-                    if (flow->vlan_tci) {
-                        hash_vlan = vlan_tci_to_vid(flow->vlan_tci);
-                    } else {
-                        hash_vlan = OFP_VLAN_NONE;
-                    }
-                    entry = lookup_bond_entry(out_port, flow, hash_vlan);
-                    entry->tx_bytes += n_bytes;
-                }
-            }
-        }
-    }
-}
-
-/* ofhooks callback run at accounting checkpoints.  On bridges with bonded
- * ports, rebalances every bond that has more than one interface, is not in
- * active-backup mode, and whose next rebalance time has arrived. */
-static void
-bridge_account_checkpoint_ofhook_cb(void *br_)
-{
-    struct bridge *br = br_;
-    long long int now;
-    size_t k;
-
-    if (!br->has_bonded_ports) {
-        return;
-    }
-
-    now = time_msec();
-    for (k = 0; k < br->n_ports; k++) {
-        struct port *bond = br->ports[k];
-
-        if (bond->n_ifaces <= 1 || bond->bond_mode == BM_AB) {
-            continue;
-        }
-        if (now < bond->bond_next_rebalance) {
-            continue;
-        }
-        bond->bond_next_rebalance = now + bond->bond_rebalance_interval;
-        bond_rebalance_port(bond);
-    }
-}
-
-static struct ofhooks bridge_ofhooks = { /* Bridge callbacks, in positional order. */
- bridge_normal_ofhook_cb, /* Composes actions via process_flow(). */
- bridge_special_ofhook_cb, /* Intercepts CFM and LACP traffic. */
- bridge_account_flow_ofhook_cb, /* Learning refresh and bond byte accounting. */
- bridge_account_checkpoint_ofhook_cb, /* Periodic bond rebalancing. */
-};
-\f
-/* LACP functions. */
-
-/* Handles a LACP PDU in 'packet' received on 'iface'.  Ignored when LACP is
- * off on the port or the packet does not parse.  Otherwise marks the
- * interface as current, re-arms its receive timer, requests an immediate
- * transmission if the partner's view of us is out of date, and records the
- * partner's information (flagging the port for a LACP update) if it
- * changed. */
-static void
-lacp_process_packet(const struct ofpbuf *packet, struct iface *iface)
-{
-    const struct lacp_pdu *rx;
-
-    rx = iface->port->lacp ? parse_lacp_packet(packet) : NULL;
-    if (!rx) {
-        return;
-    }
-
-    iface->lacp_rx = time_msec() + LACP_SLOW_TIME_RX;
-    iface->lacp_status |= LACP_CURRENT;
-    iface->lacp_status &= ~(LACP_EXPIRED | LACP_DEFAULTED);
-
-    /* The partner echoes what it believes about us; a mismatch means that
-     * it needs a fresh PDU right away. */
-    iface->lacp_actor.state = iface_get_lacp_state(iface);
-    if (memcmp(&iface->lacp_actor, &rx->partner, sizeof rx->partner) != 0) {
-        iface->lacp_tx = 0;
-    }
-
-    if (memcmp(&iface->lacp_partner, &rx->actor, sizeof rx->actor) != 0) {
-        iface->port->lacp_need_update = true;
-        iface->lacp_partner = rx->actor;
-    }
-}
-
-static void
-lacp_update_ifaces(struct port *port)
-{ /* Re-evaluates which interfaces of 'port' are attached.  Every interface
-   * starts out LACP_ATTACHED; interfaces whose partner system ID equals
-   * their own are detached.  Among the remaining non-defaulted interfaces,
-   * the one whose iface_get_lacp_priority() result compares lowest under
-   * memcmp() becomes the lead.  Without a lead the port is marked as not
-   * negotiated and nothing else is detached.  With a lead the port is
-   * marked negotiated and every interface that is defaulted, or whose
-   * partner key or system ID differs from the lead's, is detached. */
- size_t i;
- struct iface *lead;
- struct lacp_info lead_pri;
- static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 10);
-
- port->lacp_need_update = false;
- COVERAGE_INC(bridge_lacp_update);
-
- if (!port->lacp) {
- return;
- }
-
- VLOG_DBG_RL(&rl, "port %s: re-evaluating LACP link status", port->name);
-
- lead = NULL;
- for (i = 0; i < port->n_ifaces; i++) {
- struct iface *iface = port->ifaces[i];
- struct lacp_info pri;
-
- iface->lacp_status |= LACP_ATTACHED;
- ofproto_revalidate(port->bridge->ofproto, iface->tag);
-
- /* Don't allow loopback interfaces to send traffic or lead. */
- if (eth_addr_equals(iface->lacp_partner.sysid,
- iface->lacp_actor.sysid)) {
- VLOG_WARN_RL(&rl, "iface %s: Loopback detected. Interface is "
- "connected to its own bridge", iface->name);
- iface->lacp_status &= ~LACP_ATTACHED;
- continue;
- }
-
- if (iface->lacp_status & LACP_DEFAULTED) { /* Defaulted: cannot lead. */
- continue;
- }
-
- iface_get_lacp_priority(iface, &pri);
-
- if (!lead || memcmp(&pri, &lead_pri, sizeof pri) < 0) {
- lead = iface;
- lead_pri = pri;
- }
- }
-
- if (!lead) { /* No interface has a live partner: negotiation failed. */
- port->lacp &= ~LACP_NEGOTIATED;
- return;
- }
-
- port->lacp |= LACP_NEGOTIATED;
-
- for (i = 0; i < port->n_ifaces; i++) {
- struct iface *iface = port->ifaces[i];
-
- if (iface->lacp_status & LACP_DEFAULTED
- || lead->lacp_partner.key != iface->lacp_partner.key
- || !eth_addr_equals(lead->lacp_partner.sysid,
- iface->lacp_partner.sysid)) {
- iface->lacp_status &= ~LACP_ATTACHED;
- }
- }
-}
-
-/* Returns true if 'iface' may transmit LACP PDUs: either its port has
- * LACP_ACTIVE set, or the interface's status is LACP_CURRENT or
- * LACP_EXPIRED. */
-static bool
-lacp_iface_may_tx(const struct iface *iface)
-{
-    if (iface->port->lacp & LACP_ACTIVE) {
-        return true;
-    }
-    return (iface->lacp_status & (LACP_CURRENT | LACP_EXPIRED)) != 0;
-}
-
-static void
-lacp_run(struct port *port)
-{ /* Periodic LACP work for 'port'; does nothing when LACP is off.  Ages the
-   * receive state of each interface whose 'lacp_rx' time has passed
-   * (current -> expired -> defaulted), re-evaluates attachment when
-   * 'port->lacp_need_update' is set, then sends a PDU on every interface
-   * that may transmit and whose 'lacp_tx' time has arrived. */
- size_t i;
- struct ofpbuf packet;
-
- if (!port->lacp) {
- return;
- }
-
- ofpbuf_init(&packet, ETH_HEADER_LEN + LACP_PDU_LEN);
-
- for (i = 0; i < port->n_ifaces; i++) {
- struct iface *iface = port->ifaces[i];
-
- if (time_msec() > iface->lacp_rx) {
- if (iface->lacp_status & LACP_CURRENT) {
- iface_set_lacp_expired(iface);
- } else if (iface->lacp_status & LACP_EXPIRED) {
- iface_set_lacp_defaulted(iface);
- }
- }
- }
-
- if (port->lacp_need_update) {
- lacp_update_ifaces(port);
- }
-
- for (i = 0; i < port->n_ifaces; i++) {
- struct iface *iface = port->ifaces[i];
- uint8_t ea[ETH_ADDR_LEN];
- int error;
-
- if (time_msec() < iface->lacp_tx || !lacp_iface_may_tx(iface)) {
- continue;
- }
-
- error = netdev_get_etheraddr(iface->netdev, ea);
- if (!error) {
- struct lacp_pdu pdu;
-
- iface->lacp_actor.state = iface_get_lacp_state(iface);
- compose_lacp_pdu(&iface->lacp_actor, &iface->lacp_partner, &pdu);
- compose_lacp_packet(&packet, ea, &pdu);
- iface_send_packet(iface, &packet);
- } else {
- static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 10);
- VLOG_ERR_RL(&rl, "iface %s: failed to obtain Ethernet address "
- "(%s)", iface->name, strerror(error));
- }
-
- iface->lacp_tx = time_msec() + /* Rescheduled even if sending failed. */
- (iface->lacp_partner.state & LACP_STATE_TIME
- ? LACP_FAST_TIME_TX
- : LACP_SLOW_TIME_TX);
- }
- ofpbuf_uninit(&packet);
-}
-
-/* Registers poll-loop wakeups for the LACP timers of 'port': the transmit
- * time of every interface that may transmit and the receive timeout of
- * every interface whose status is current or expired.  Does nothing when
- * LACP is off on the port. */
-static void
-lacp_wait(struct port *port)
-{
-    size_t k;
-
-    if (!port->lacp) {
-        return;
-    }
-
-    for (k = 0; k < port->n_ifaces; k++) {
-        const struct iface *slave = port->ifaces[k];
-        const bool rx_armed
-            = (slave->lacp_status & (LACP_CURRENT | LACP_EXPIRED)) != 0;
-
-        if (lacp_iface_may_tx(slave)) {
-            poll_timer_wait_until(slave->lacp_tx);
-        }
-        if (rx_armed) {
-            poll_timer_wait_until(slave->lacp_rx);
-        }
-    }
-}
-\f
-/* Bonding functions. */
-
-/* Statistics for a single interface on a bonded port, used for load-based
- * bond rebalancing. Arrays of these are ordered by compare_slave_balance()
- * and kept in order by resort_bals(). */
-struct slave_balance {
- struct iface *iface; /* The interface. */
- uint64_t tx_bytes; /* Sum of hashes[*]->tx_bytes. */
-
- /* All the "bond_entry"s that are assigned to this interface, in order of
- * increasing tx_bytes. */
- struct bond_entry **hashes;
- size_t n_hashes; /* Number of elements in 'hashes'. */
-};
-
-/* Returns the configuration-style name of bond mode 'bm': "balance-slb",
- * "active-backup" or "balance-tcp".  The result is a string literal that
- * the caller must neither modify nor free.  Any other value of 'bm' hits
- * NOT_REACHED(). */
-static const char *
-bond_mode_to_string(enum bond_mode bm)
-{
-    /* The literals are returned directly rather than through writable
-     * 'static char *' objects, so nothing non-const ever points at them. */
-    switch (bm) {
-    case BM_SLB:
-        return "balance-slb";
-    case BM_AB:
-        return "active-backup";
-    case BM_TCP:
-        return "balance-tcp";
-    }
-
-    NOT_REACHED();
-    return NULL;
-}
-
-/* qsort()-style comparison for an array of pointers to bond_entries.
- * Orders them ascending by the index of the interface they are assigned to
- * and, within one interface, ascending by bytes transmitted. */
-static int
-compare_bond_entries(const void *a_, const void *b_)
-{
-    const struct bond_entry *lhs = *(const struct bond_entry *const *) a_;
-    const struct bond_entry *rhs = *(const struct bond_entry *const *) b_;
-
-    if (lhs->iface_idx > rhs->iface_idx) {
-        return 1;
-    }
-    if (lhs->iface_idx < rhs->iface_idx) {
-        return -1;
-    }
-    if (lhs->tx_bytes > rhs->tx_bytes) {
-        return 1;
-    }
-    if (lhs->tx_bytes < rhs->tx_bytes) {
-        return -1;
-    }
-    return 0;
-}
-
-/* qsort()-style comparison for slave_balances.  Enabled interfaces sort
- * before disabled ones; interfaces with the same enabled state sort in
- * *descending* order of bytes transmitted. */
-static int
-compare_slave_balance(const void *a_, const void *b_)
-{
-    const struct slave_balance *lhs = a_;
-    const struct slave_balance *rhs = b_;
-
-    if (lhs->iface->enabled != rhs->iface->enabled) {
-        return lhs->iface->enabled ? -1 : 1;
-    }
-    if (lhs->tx_bytes > rhs->tx_bytes) {
-        return -1;
-    }
-    if (lhs->tx_bytes < rhs->tx_bytes) {
-        return 1;
-    }
-    return 0;
-}
-
-/* Exchanges the contents of '*a' and '*b'. */
-static void
-swap_bals(struct slave_balance *a, struct slave_balance *b)
-{
-    struct slave_balance saved = *b;
-
-    *b = *a;
-    *a = saved;
-}
-
-/* Puts the 'n_bals' elements of 'bals' back into descending 'tx_bytes' order,
- * on the assumption that 'p' is the only element that may be out of place.
- *
- * 'p' is moved by swapping it with its neighbors, first toward the front and
- * then toward the back, so after the call the caller's 'p' may refer to a
- * different slave_balance and must be treated as invalid. */
-static void
-resort_bals(struct slave_balance *p,
-            struct slave_balance bals[], size_t n_bals)
-{
-    struct slave_balance *last;
-
-    if (n_bals <= 1) {
-        return;
-    }
-
-    /* Bubble toward the front while heavier than the predecessor. */
-    while (p > bals && p->tx_bytes > p[-1].tx_bytes) {
-        swap_bals(p, p - 1);
-        p--;
-    }
-
-    /* Bubble toward the back while lighter than the successor. */
-    last = &bals[n_bals - 1];
-    while (p < last && p->tx_bytes < p[1].tx_bytes) {
-        swap_bals(p, p + 1);
-        p++;
-    }
-}
-
-/* Logs, at debug level, one line that describes the 'n_bals' elements of
- * 'bals': for each slave its interface name, its load in kB, whether it is
- * disabled, and the index and load of every hash assigned to it.  Hash
- * indexes are computed relative to 'port->bond_hash'.
- *
- * Does nothing unless debug logging is enabled. */
-static void
-log_bals(const struct slave_balance *bals, size_t n_bals, struct port *port)
-{
-    struct ds line = DS_EMPTY_INITIALIZER;
-    size_t slave_no;
-
-    if (!VLOG_IS_DBG_ENABLED()) {
-        return;
-    }
-
-    for (slave_no = 0; slave_no < n_bals; slave_no++) {
-        const struct slave_balance *bal = &bals[slave_no];
-        size_t k;
-
-        if (slave_no > 0) {
-            ds_put_char(&line, ',');
-        }
-        ds_put_format(&line, " %s %"PRIu64"kB",
-                      bal->iface->name, bal->tx_bytes / 1024);
-
-        if (!bal->iface->enabled) {
-            ds_put_cstr(&line, " (disabled)");
-        }
-        if (!bal->n_hashes) {
-            continue;
-        }
-
-        ds_put_cstr(&line, " (");
-        for (k = 0; k < bal->n_hashes; k++) {
-            const struct bond_entry *entry = bal->hashes[k];
-
-            if (k > 0) {
-                ds_put_cstr(&line, " + ");
-            }
-            ds_put_format(&line, "h%td: %"PRIu64"kB",
-                          entry - port->bond_hash, entry->tx_bytes / 1024);
-        }
-        ds_put_cstr(&line, ")");
-    }
-
-    VLOG_DBG("bond %s:%s", port->name, ds_cstr(&line));
-    ds_destroy(&line);
-}
-
-/* Reassigns the bond entry at 'from->hashes[hash_idx]' from the interface of
- * 'from' to the interface of 'to'.
- *
- * The entry is removed from 'from->hashes', its 'tx_bytes' is subtracted from
- * 'from->tx_bytes' and added to 'to->tx_bytes', the flows tagged with the
- * entry's old 'iface_tag' are revalidated, and the entry receives a fresh
- * random tag.  Must not be used on an active-backup bond. */
-static void
-bond_shift_load(struct slave_balance *from, struct slave_balance *to,
-                int hash_idx)
-{
-    struct bond_entry *entry = from->hashes[hash_idx];
-    struct port *port = from->iface->port;
-    uint64_t moved = entry->tx_bytes;
-
-    assert(port->bond_mode != BM_AB);
-
-    VLOG_INFO("bond %s: shift %"PRIu64"kB of load (with hash %td) "
-              "from %s to %s (now carrying %"PRIu64"kB and "
-              "%"PRIu64"kB load, respectively)",
-              port->name, moved / 1024, entry - port->bond_hash,
-              from->iface->name, to->iface->name,
-              (from->tx_bytes - moved) / 1024,
-              (to->tx_bytes + moved) / 1024);
-
-    /* Drop the entry from from->hashes.
-     *
-     * The entry is deliberately not appended to to->hashes: that would take
-     * more work, and its only effect would be to let the same hash migrate
-     * again during this rebalancing run, which would be pointless. */
-    if (hash_idx != 0) {
-        memmove(&from->hashes[hash_idx], &from->hashes[hash_idx + 1],
-                (from->n_hashes - (hash_idx + 1)) * sizeof *from->hashes);
-    } else {
-        /* Removing the first element only needs the start to advance. */
-        from->hashes++;
-    }
-    from->n_hashes--;
-
-    /* Account for the moved load on both slaves. */
-    from->tx_bytes -= moved;
-    to->tx_bytes += moved;
-
-    /* Revalidate flows that used the old assignment, then retarget. */
-    ofproto_revalidate(port->bridge->ofproto, entry->iface_tag);
-    entry->iface_idx = to->iface->port_ifidx;
-    entry->iface_tag = tag_create_random();
-}
-
-/* Rebalances the load on bonded 'port' by moving bond hash entries from its
- * most heavily loaded enabled interfaces to its least loaded enabled
- * interface, and then halves the 'tx_bytes' of every hash entry so that older
- * traffic weighs less in the next run.
- *
- * Must not be called for an active-backup bond.  Returns immediately if 'port'
- * has no interfaces.  If none of the interfaces is enabled, nothing is moved
- * and the byte counters are left as they are. */
-static void
-bond_rebalance_port(struct port *port)
-{
-    struct slave_balance *bals;
-    size_t n_bals;
-    struct bond_entry *hashes[BOND_MASK + 1];
-    struct slave_balance *b, *from, *to;
-    struct bond_entry *e;
-    size_t i;
-
-    assert(port->bond_mode != BM_AB);
-
-    if (!port->n_ifaces) {
-        /* Nothing to balance.  Bailing out here also keeps the "discard
-         * disabled slaves" loop below from evaluating bals[n_bals - 1] with
-         * n_bals == 0, which would read outside the array. */
-        return;
-    }
-
-    /* Sets up 'bals' to describe each of the port's interfaces, sorted in
-     * descending order of tx_bytes, so that bals[0] represents the most
-     * heavily loaded slave and bals[n_bals - 1] represents the least heavily
-     * loaded slave.
-     *
-     * The code is a bit tricky: to avoid dynamically allocating a 'hashes'
-     * array for each slave_balance structure, we sort our local array of
-     * hashes in order by slave, so that all of the hashes for a given slave
-     * become contiguous in memory, and then we point the 'hashes' member of
-     * each slave_balance structure to the start of a contiguous group. */
-    n_bals = port->n_ifaces;
-    bals = xmalloc(n_bals * sizeof *bals);
-    for (b = bals; b < &bals[n_bals]; b++) {
-        b->iface = port->ifaces[b - bals];
-        b->tx_bytes = 0;
-        b->hashes = NULL;
-        b->n_hashes = 0;
-    }
-    for (i = 0; i <= BOND_MASK; i++) {
-        hashes[i] = &port->bond_hash[i];
-    }
-    qsort(hashes, BOND_MASK + 1, sizeof *hashes, compare_bond_entries);
-    for (i = 0; i <= BOND_MASK; i++) {
-        e = hashes[i];
-        /* Entries whose iface_idx is out of range are not counted. */
-        if (e->iface_idx >= 0 && e->iface_idx < port->n_ifaces) {
-            b = &bals[e->iface_idx];
-            b->tx_bytes += e->tx_bytes;
-            if (!b->hashes) {
-                b->hashes = &hashes[i];
-            }
-            b->n_hashes++;
-        }
-    }
-    qsort(bals, n_bals, sizeof *bals, compare_slave_balance);
-    log_bals(bals, n_bals, port);
-
-    /* Discard slaves that aren't enabled (which were sorted to the back of the
-     * array earlier). */
-    while (!bals[n_bals - 1].iface->enabled) {
-        n_bals--;
-        if (!n_bals) {
-            goto exit;
-        }
-    }
-
-    /* Shift load from the most-loaded slaves to the least-loaded slaves. */
-    to = &bals[n_bals - 1];
-    for (from = bals; from < to; ) {
-        uint64_t overload = from->tx_bytes - to->tx_bytes;
-        if (overload < to->tx_bytes >> 5 || overload < 100000) {
-            /* The extra load on 'from' (and all less-loaded slaves), compared
-             * to that of 'to' (the least-loaded slave), is less than ~3%, or
-             * it is less than ~1Mbps.  No point in rebalancing. */
-            break;
-        } else if (from->n_hashes == 1) {
-            /* 'from' only carries a single MAC hash, so we can't shift any
-             * load away from it, even though we want to. */
-            from++;
-        } else {
-            /* 'from' is carrying significantly more load than 'to', and that
-             * load is split across at least two different hashes.  Pick a hash
-             * to migrate to 'to' (the least-loaded slave), given that doing so
-             * must decrease the ratio of the load on the two slaves by at
-             * least 0.1.
-             *
-             * The sort order we use means that we prefer to shift away the
-             * smallest hashes instead of the biggest ones.  There is little
-             * reason behind this decision; we could use the opposite sort
-             * order to shift away big hashes ahead of small ones. */
-            bool order_swapped = false;
-
-            for (i = 0; i < from->n_hashes; i++) {
-                double old_ratio, new_ratio;
-                uint64_t delta = from->hashes[i]->tx_bytes;
-
-                if (delta == 0 || from->tx_bytes - delta == 0) {
-                    /* Pointless move. */
-                    continue;
-                }
-
-                /* Always assigned before either 'break' below, so it
-                 * describes the hash that is actually chosen. */
-                order_swapped = from->tx_bytes - delta < to->tx_bytes + delta;
-
-                if (to->tx_bytes == 0) {
-                    /* Nothing on the new slave, move it. */
-                    break;
-                }
-
-                old_ratio = (double)from->tx_bytes / to->tx_bytes;
-                new_ratio = (double)(from->tx_bytes - delta) /
-                            (to->tx_bytes + delta);
-
-                if (new_ratio == 0) {
-                    /* Should already be covered but check to prevent division
-                     * by zero. */
-                    continue;
-                }
-
-                if (new_ratio < 1) {
-                    new_ratio = 1 / new_ratio;
-                }
-
-                if (old_ratio - new_ratio > 0.1) {
-                    /* Would decrease the ratio, move it. */
-                    break;
-                }
-            }
-            if (i < from->n_hashes) {
-                bond_shift_load(from, to, i);
-
-                /* If the result of the migration changed the relative order of
-                 * 'from' and 'to' swap them back to maintain invariants. */
-                if (order_swapped) {
-                    swap_bals(from, to);
-                }
-
-                /* Re-sort 'bals'.  Note that this may make 'from' and 'to'
-                 * point to different slave_balance structures.  It is only
-                 * valid to do these two operations in a row at all because we
-                 * know that 'from' will not move past 'to' and vice versa. */
-                resort_bals(from, bals, n_bals);
-                resort_bals(to, bals, n_bals);
-            } else {
-                from++;
-            }
-        }
-    }
-
-    /* Implement exponentially weighted moving average.  A weight of 1/2 causes
-     * historical data to decay to <1% in 7 rebalancing runs. */
-    for (e = &port->bond_hash[0]; e <= &port->bond_hash[BOND_MASK]; e++) {
-        e->tx_bytes /= 2;
-    }
-
-exit:
-    free(bals);
-}
-
-/* Sends one "Open vSwitch Bond Failover" packet through bonded 'port' for
- * each entry in the bridge's MAC learning table that was learned on some
- * other port, using the entry's MAC as the packet's source address and its
- * VLAN (if nonzero) as the VLAN to set on output.
- *
- * Does nothing if 'port' has no interfaces, has no active interface, or is
- * currently hashing on TCP fields.  Entries for which choose_output_iface()
- * finds no interface are skipped.  Send failures are counted and summarized
- * in a single rate-limited warning. */
-static void
-bond_send_learning_packets(struct port *port)
-{
-    struct bridge *br = port->bridge;
-    int last_error = 0;
-    int n_sent = 0;
-    int n_failed = 0;
-    struct mac_entry *mac;
-    struct ofpbuf pkt;
-
-    if (!port->n_ifaces || port->active_iface < 0 || bond_is_tcp_hash(port)) {
-        return;
-    }
-
-    ofpbuf_init(&pkt, 128);
-    LIST_FOR_EACH (mac, lru_node, &br->ml->lrus) {
-        union ofp_action actions[2];
-        size_t n_actions;
-        uint16_t dp_ifidx;
-        tag_type tags = 0;
-        struct flow flow;
-        int retval;
-
-        /* MACs learned on this very bond need no announcement. */
-        if (mac->port == port->port_idx) {
-            continue;
-        }
-
-        compose_benign_packet(&pkt, "Open vSwitch Bond Failover", 0xf177,
-                              mac->mac);
-        flow_extract(&pkt, 0, ODPP_NONE, &flow);
-
-        if (!choose_output_iface(port, &flow, mac->vlan, &dp_ifidx, &tags)) {
-            continue;
-        }
-
-        /* Build the action list: optional VLAN tag, then output. */
-        memset(actions, 0, sizeof actions);
-        n_actions = 0;
-        if (mac->vlan) {
-            actions[n_actions].vlan_vid.type = htons(OFPAT_SET_VLAN_VID);
-            actions[n_actions].vlan_vid.len = htons(sizeof actions[0]);
-            actions[n_actions].vlan_vid.vlan_vid = htons(mac->vlan);
-            n_actions++;
-        }
-        actions[n_actions].output.type = htons(OFPAT_OUTPUT);
-        actions[n_actions].output.len = htons(sizeof actions[0]);
-        actions[n_actions].output.port = htons(odp_port_to_ofp_port(dp_ifidx));
-        n_actions++;
-
-        /* Transmit, remembering the most recent failure. */
-        n_sent++;
-        retval = ofproto_send_packet(br->ofproto, &flow, actions, n_actions,
-                                     &pkt);
-        if (retval) {
-            last_error = retval;
-            n_failed++;
-        }
-    }
-    ofpbuf_uninit(&pkt);
-
-    if (!n_failed) {
-        VLOG_DBG("bond %s: sent %d gratuitous learning packets",
-                 port->name, n_sent);
-    } else {
-        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
-        VLOG_WARN_RL(&rl, "bond %s: %d errors sending %d gratuitous learning "
-                     "packets, last error was: %s",
-                     port->name, n_failed, n_sent, strerror(last_error));
-    }
-}
-\f
-/* Bonding unixctl user interface functions. */
-
-/* unixctl handler registered as "bond/list".
- *
- * Replies with a tab-separated table, one row per port that has more than one
- * interface, giving the bridge name, port name, bond mode, and a
- * comma-separated list of the port's interface names. */
-static void
-bond_unixctl_list(struct unixctl_conn *conn,
-                  const char *args OVS_UNUSED, void *aux OVS_UNUSED)
-{
-    struct ds reply = DS_EMPTY_INITIALIZER;
-    const struct bridge *br;
-
-    ds_put_cstr(&reply, "bridge\tbond\ttype\tslaves\n");
-
-    LIST_FOR_EACH (br, node, &all_bridges) {
-        size_t port_no;
-
-        for (port_no = 0; port_no < br->n_ports; port_no++) {
-            const struct port *port = br->ports[port_no];
-            size_t slave_no;
-
-            /* Only ports with at least two interfaces are bonds. */
-            if (port->n_ifaces <= 1) {
-                continue;
-            }
-
-            ds_put_format(&reply, "%s\t%s\t%s\t", br->name, port->name,
-                          bond_mode_to_string(port->bond_mode));
-            for (slave_no = 0; slave_no < port->n_ifaces; slave_no++) {
-                if (slave_no > 0) {
-                    ds_put_cstr(&reply, ", ");
-                }
-                ds_put_cstr(&reply, port->ifaces[slave_no]->name);
-            }
-            ds_put_char(&reply, '\n');
-        }
-    }
-
-    unixctl_command_reply(conn, 200, ds_cstr(&reply));
-    ds_destroy(&reply);
-}
-
-/* Searches every bridge for a port named 'name' that has more than one
- * interface.  Returns the first such port, or NULL if there is none. */
-static struct port *
-bond_find(const char *name)
-{
-    const struct bridge *br;
-
-    LIST_FOR_EACH (br, node, &all_bridges) {
-        size_t port_no;
-
-        for (port_no = 0; port_no < br->n_ports; port_no++) {
-            struct port *candidate = br->ports[port_no];
-
-            if (strcmp(candidate->name, name) != 0) {
-                continue;
-            }
-            if (candidate->n_ifaces > 1) {
-                return candidate;
-            }
-        }
-    }
-
-    return NULL;
-}
-
-/* Appends to 'ds' one word, followed by a space, for each LACP_STATE_* bit
- * that is set in 'state'.  Words are emitted in a fixed order: activity,
- * timeout, aggregation, synchronized, collecting, distributing, defaulted,
- * expired. */
-static void
-ds_put_lacp_state(struct ds *ds, uint8_t state)
-{
-    static const struct {
-        unsigned int bit;
-        const char *word;
-    } flags[] = {
-        { LACP_STATE_ACT,  "activity " },
-        { LACP_STATE_TIME, "timeout " },
-        { LACP_STATE_AGG,  "aggregation " },
-        { LACP_STATE_SYNC, "synchronized " },
-        { LACP_STATE_COL,  "collecting " },
-        { LACP_STATE_DIST, "distributing " },
-        { LACP_STATE_DEF,  "defaulted " },
-        { LACP_STATE_EXP,  "expired " },
-    };
-    size_t k;
-
-    for (k = 0; k < sizeof flags / sizeof flags[0]; k++) {
-        if (state & flags[k].bit) {
-            ds_put_cstr(ds, flags[k].word);
-        }
-    }
-}
-
-/* unixctl handler registered as "bond/show".  'args' is the bond's name.
- *
- * Replies with 501 "no such bond" if bond_find() does not find 'args'.
- * Otherwise replies with 200 and a report that contains the bond-wide
- * settings followed by one section per slave.  A slave's section shows its
- * enabled state and pending up/down delay, its LACP actor and partner
- * details when LACP is configured, and, except for active-backup bonds, the
- * hashes assigned to it.  For SLB bonds each hash is followed by the learned
- * MACs that hash to it and would be output on this slave. */
-static void
-bond_unixctl_show(struct unixctl_conn *conn,
-                  const char *args, void *aux OVS_UNUSED)
-{
-    struct ds reply = DS_EMPTY_INITIALIZER;
-    const struct port *port = bond_find(args);
-    size_t slave_no;
-
-    if (!port) {
-        unixctl_command_reply(conn, 501, "no such bond");
-        return;
-    }
-
-    /* Bond-wide settings. */
-    ds_put_format(&reply, "bond_mode: %s\n",
-                  bond_mode_to_string(port->bond_mode));
-
-    if (!port->lacp) {
-        ds_put_cstr(&reply, "lacp: off\n");
-    } else {
-        ds_put_format(&reply, "lacp: %s\n",
-                      port->lacp & LACP_ACTIVE ? "active" : "passive");
-    }
-
-    if (port->bond_mode != BM_AB) {
-        ds_put_format(&reply, "bond-hash-algorithm: %s\n",
-                      bond_is_tcp_hash(port) ? "balance-tcp" : "balance-slb");
-    }
-
-    ds_put_format(&reply, "bond-detect-mode: %s\n",
-                  port->monitor ? "carrier" : "miimon");
-
-    if (!port->monitor) {
-        ds_put_format(&reply, "bond-miimon-interval: %lld\n",
-                      port->miimon_interval);
-    }
-
-    ds_put_format(&reply, "updelay: %d ms\n", port->updelay);
-    ds_put_format(&reply, "downdelay: %d ms\n", port->downdelay);
-
-    if (port->bond_mode != BM_AB) {
-        ds_put_format(&reply, "next rebalance: %lld ms\n",
-                      port->bond_next_rebalance - time_msec());
-    }
-
-    for (slave_no = 0; slave_no < port->n_ifaces; slave_no++) {
-        const struct iface *iface = port->ifaces[slave_no];
-        struct flow flow;
-        int hash;
-
-        /* Slave headline and delay state. */
-        ds_put_format(&reply, "\nslave %s: %s\n",
-                      iface->name, iface->enabled ? "enabled" : "disabled");
-        if (slave_no == port->active_iface) {
-            ds_put_cstr(&reply, "\tactive slave\n");
-        }
-        if (iface->delay_expires != LLONG_MAX) {
-            ds_put_format(&reply, "\t%s expires in %lld ms\n",
-                          iface->enabled ? "downdelay" : "updelay",
-                          iface->delay_expires - time_msec());
-        }
-
-        if (port->lacp) {
-            /* LACP status flags of this slave. */
-            ds_put_cstr(&reply, "\tstatus: ");
-
-            if (iface->lacp_status & LACP_CURRENT) {
-                ds_put_cstr(&reply, "current ");
-            }
-
-            if (iface->lacp_status & LACP_EXPIRED) {
-                ds_put_cstr(&reply, "expired ");
-            }
-
-            if (iface->lacp_status & LACP_DEFAULTED) {
-                ds_put_cstr(&reply, "defaulted ");
-            }
-
-            if (iface->lacp_status & LACP_ATTACHED) {
-                ds_put_cstr(&reply, "attached ");
-            }
-
-            ds_put_cstr(&reply, "\n");
-
-            /* Actor (local) side. */
-            ds_put_cstr(&reply, "\n\tactor sysid: ");
-            ds_put_format(&reply, ETH_ADDR_FMT,
-                          ETH_ADDR_ARGS(iface->lacp_actor.sysid));
-            ds_put_cstr(&reply, "\n");
-
-            ds_put_format(&reply, "\tactor sys_priority: %u\n",
-                          ntohs(iface->lacp_actor.sys_priority));
-
-            ds_put_format(&reply, "\tactor portid: %u\n",
-                          ntohs(iface->lacp_actor.portid));
-
-            ds_put_format(&reply, "\tactor port_priority: %u\n",
-                          ntohs(iface->lacp_actor.port_priority));
-
-            ds_put_format(&reply, "\tactor key: %u\n",
-                          ntohs(iface->lacp_actor.key));
-
-            ds_put_cstr(&reply, "\tactor state: ");
-            ds_put_lacp_state(&reply, iface_get_lacp_state(iface));
-            ds_put_cstr(&reply, "\n\n");
-
-            /* Partner (remote) side. */
-            ds_put_cstr(&reply, "\tpartner sysid: ");
-            ds_put_format(&reply, ETH_ADDR_FMT,
-                          ETH_ADDR_ARGS(iface->lacp_partner.sysid));
-            ds_put_cstr(&reply, "\n");
-
-            ds_put_format(&reply, "\tpartner sys_priority: %u\n",
-                          ntohs(iface->lacp_partner.sys_priority));
-
-            ds_put_format(&reply, "\tpartner portid: %u\n",
-                          ntohs(iface->lacp_partner.portid));
-
-            ds_put_format(&reply, "\tpartner port_priority: %u\n",
-                          ntohs(iface->lacp_partner.port_priority));
-
-            ds_put_format(&reply, "\tpartner key: %u\n",
-                          ntohs(iface->lacp_partner.key));
-
-            ds_put_cstr(&reply, "\tpartner state: ");
-            ds_put_lacp_state(&reply, iface->lacp_partner.state);
-            ds_put_cstr(&reply, "\n");
-        }
-
-        /* Active-backup bonds have no hash table to report. */
-        if (port->bond_mode == BM_AB) {
-            continue;
-        }
-
-        /* Hashes assigned to this slave. */
-        memset(&flow, 0, sizeof flow);
-        for (hash = 0; hash <= BOND_MASK; hash++) {
-            const struct bond_entry *be = &port->bond_hash[hash];
-            struct mac_entry *me;
-
-            if (be->iface_idx != slave_no) {
-                continue;
-            }
-
-            ds_put_format(&reply, "\thash %d: %"PRIu64" kB load\n",
-                          hash, be->tx_bytes / 1024);
-
-            /* MACs are listed only for SLB bonds. */
-            if (port->bond_mode != BM_SLB) {
-                continue;
-            }
-
-            /* Learned MACs that map to this hash and leave via this slave. */
-            LIST_FOR_EACH (me, lru_node, &port->bridge->ml->lrus) {
-                uint16_t dp_ifidx;
-                tag_type tags = 0;
-
-                memcpy(flow.dl_src, me->mac, ETH_ADDR_LEN);
-                if (bond_hash_src(me->mac, me->vlan) != hash) {
-                    continue;
-                }
-                if (me->port == port->port_idx) {
-                    continue;
-                }
-                if (!choose_output_iface(port, &flow, me->vlan,
-                                         &dp_ifidx, &tags)) {
-                    continue;
-                }
-                if (dp_ifidx == iface->dp_ifidx) {
-                    ds_put_format(&reply, "\t\t"ETH_ADDR_FMT"\n",
-                                  ETH_ADDR_ARGS(me->mac));
-                }
-            }
-        }
-    }
-
-    unixctl_command_reply(conn, 200, ds_cstr(&reply));
-    ds_destroy(&reply);
-}
-
-/* unixctl handler registered as "bond/migrate".  'args_' has the form
- * "BOND HASH SLAVE".
- *
- * Reassigns bond hash HASH (masked with BOND_MASK) of SLB bond BOND to
- * interface SLAVE, revalidating the flows that carried the entry's old tag.
- * Replies 501 with a reason if an argument is missing, BOND is unknown or is
- * not an SLB bond, HASH is not a decimal number, or SLAVE is unknown or
- * disabled.
- *
- * Note that 'args_' is tokenized in place with strtok_r(), so the caller's
- * buffer is modified even though the parameter is const-qualified. */
-static void
-bond_unixctl_migrate(struct unixctl_conn *conn, const char *args_,
-                     void *aux OVS_UNUSED)
-{
-    char *args = (char *) args_;
-    char *save_ptr = NULL;
-    char *bond_s = strtok_r(args, " ", &save_ptr);
-    char *hash_s = strtok_r(NULL, " ", &save_ptr);
-    char *slave_s = strtok_r(NULL, " ", &save_ptr);
-    struct bond_entry *entry;
-    struct iface *iface;
-    struct port *port;
-    int hash;
-
-    /* A third token implies that the first two are present as well. */
-    if (!slave_s) {
-        unixctl_command_reply(conn, 501,
-                              "usage: bond/migrate BOND HASH SLAVE");
-        return;
-    }
-
-    port = bond_find(bond_s);
-    if (!port) {
-        unixctl_command_reply(conn, 501, "no such bond");
-        return;
-    }
-
-    if (port->bond_mode != BM_SLB) {
-        unixctl_command_reply(conn, 501, "not an SLB bond");
-        return;
-    }
-
-    /* HASH must consist of decimal digits only. */
-    if (hash_s[strspn(hash_s, "0123456789")] != '\0') {
-        unixctl_command_reply(conn, 501, "bad hash");
-        return;
-    }
-    hash = atoi(hash_s) & BOND_MASK;
-
-    iface = port_lookup_iface(port, slave_s);
-    if (!iface) {
-        unixctl_command_reply(conn, 501, "no such slave");
-        return;
-    }
-
-    if (!iface->enabled) {
-        unixctl_command_reply(conn, 501, "cannot migrate to disabled slave");
-        return;
-    }
-
-    entry = &port->bond_hash[hash];
-    ofproto_revalidate(port->bridge->ofproto, entry->iface_tag);
-    entry->iface_idx = iface->port_ifidx;
-    entry->iface_tag = tag_create_random();
-    unixctl_command_reply(conn, 200, "migrated");
-}
-
-/* unixctl handler registered as "bond/set-active-slave".  'args_' has the
- * form "BOND SLAVE".
- *
- * Makes SLAVE the active interface of BOND.  If that changes the active
- * interface, flows carrying the old active-interface tag are revalidated, a
- * new tag is generated, and learning packets are sent; the reply is then
- * 200 "done".  If SLAVE is already active the reply is 200 "no change".
- * Replies 501 with a reason if an argument is missing, BOND or SLAVE is
- * unknown, or SLAVE is disabled.
- *
- * Note that 'args_' is tokenized in place with strtok_r(). */
-static void
-bond_unixctl_set_active_slave(struct unixctl_conn *conn, const char *args_,
-                              void *aux OVS_UNUSED)
-{
-    char *args = (char *) args_;
-    char *save_ptr = NULL;
-    char *bond_s = strtok_r(args, " ", &save_ptr);
-    char *slave_s = strtok_r(NULL, " ", &save_ptr);
-    struct iface *iface;
-    struct port *port;
-
-    if (!slave_s) {
-        unixctl_command_reply(conn, 501,
-                              "usage: bond/set-active-slave BOND SLAVE");
-        return;
-    }
-
-    port = bond_find(bond_s);
-    if (!port) {
-        unixctl_command_reply(conn, 501, "no such bond");
-        return;
-    }
-
-    iface = port_lookup_iface(port, slave_s);
-    if (!iface) {
-        unixctl_command_reply(conn, 501, "no such slave");
-        return;
-    }
-
-    if (!iface->enabled) {
-        unixctl_command_reply(conn, 501, "cannot make disabled slave active");
-        return;
-    }
-
-    if (port->active_iface == iface->port_ifidx) {
-        unixctl_command_reply(conn, 200, "no change");
-        return;
-    }
-
-    ofproto_revalidate(port->bridge->ofproto, port->active_iface_tag);
-    port->active_iface = iface->port_ifidx;
-    port->active_iface_tag = tag_create_random();
-    VLOG_INFO("port %s: active interface is now %s",
-              port->name, iface->name);
-    bond_send_learning_packets(port);
-    unixctl_command_reply(conn, 200, "done");
-}
-
-/* Shared implementation of the "bond/enable-slave" and "bond/disable-slave"
- * unixctl commands.  'args_' has the form "BOND SLAVE".
- *
- * Looks up SLAVE on BOND and passes it to bond_enable_slave() with 'enable'.
- * Replies 200 with "enabled" or "disabled" on success, or 501 with a reason
- * if an argument is missing or BOND or SLAVE is unknown.
- *
- * Note that 'args_' is tokenized in place with strtok_r(), so the caller's
- * buffer is modified even though the parameter is const-qualified. */
-static void
-enable_slave(struct unixctl_conn *conn, const char *args_, bool enable)
-{
-    char *args = (char *) args_;
-    char *save_ptr = NULL;
-    char *bond_s, *slave_s;
-    struct port *port;
-    struct iface *iface;
-
-    bond_s = strtok_r(args, " ", &save_ptr);
-    slave_s = strtok_r(NULL, " ", &save_ptr);
-    if (!slave_s) {
-        unixctl_command_reply(conn, 501,
-                              "usage: bond/enable/disable-slave BOND SLAVE");
-        return;
-    }
-
-    port = bond_find(bond_s);
-    if (!port) {
-        unixctl_command_reply(conn, 501, "no such bond");
-        return;
-    }
-
-    iface = port_lookup_iface(port, slave_s);
-    if (!iface) {
-        unixctl_command_reply(conn, 501, "no such slave");
-        return;
-    }
-
-    bond_enable_slave(iface, enable);
-
-    /* Success is reported with 200, like every other bond/ command; 501 is
-     * what the failure paths above use. */
-    unixctl_command_reply(conn, 200, enable ? "enabled" : "disabled");
-}
-
-/* unixctl handler registered as "bond/enable-slave": enables the slave named
- * in 'args' ("BOND SLAVE"). */
-static void
-bond_unixctl_enable_slave(struct unixctl_conn *conn, const char *args,
-                          void *aux OVS_UNUSED)
-{
-    enable_slave(conn, args, true);
-}
-
-/* unixctl handler registered as "bond/disable-slave": disables the slave
- * named in 'args' ("BOND SLAVE"). */
-static void
-bond_unixctl_disable_slave(struct unixctl_conn *conn, const char *args,
-                           void *aux OVS_UNUSED)
-{
-    enable_slave(conn, args, false);
-}
-
-/* unixctl handler registered as "bond/hash".  'args_' has the form
- * "MAC [VLAN]".
- *
- * Replies 200 with the decimal value of bond_hash_src() for MAC and VLAN;
- * VLAN defaults to OFP_VLAN_NONE when omitted.  Replies 501 "invalid mac" if
- * MAC is missing or cannot be parsed, and 501 "invalid vlan" if VLAN is
- * present but is not an unsigned number.
- *
- * Note that 'args_' is tokenized in place with strtok_r(), so the caller's
- * buffer is modified even though the parameter is const-qualified. */
-static void
-bond_unixctl_hash(struct unixctl_conn *conn, const char *args_,
-                  void *aux OVS_UNUSED)
-{
-    char *args = (char *) args_;
-    uint8_t mac[ETH_ADDR_LEN];
-    uint8_t hash;
-    char *hash_cstr;
-    unsigned int vlan;
-    char *mac_s, *vlan_s;
-    char *save_ptr = NULL;
-
-    mac_s = strtok_r(args, " ", &save_ptr);
-    if (!mac_s) {
-        /* No arguments at all.  Passing a null pointer to sscanf() below
-         * would be undefined behavior, so reject the request here. */
-        unixctl_command_reply(conn, 501, "invalid mac");
-        return;
-    }
-    vlan_s = strtok_r(NULL, " ", &save_ptr);
-
-    if (vlan_s) {
-        if (sscanf(vlan_s, "%u", &vlan) != 1) {
-            unixctl_command_reply(conn, 501, "invalid vlan");
-            return;
-        }
-    } else {
-        vlan = OFP_VLAN_NONE;
-    }
-
-    if (sscanf(mac_s, ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac))
-        == ETH_ADDR_SCAN_COUNT) {
-        hash = bond_hash_src(mac, vlan);
-
-        hash_cstr = xasprintf("%u", hash);
-        unixctl_command_reply(conn, 200, hash_cstr);
-        free(hash_cstr);
-    } else {
-        unixctl_command_reply(conn, 501, "invalid mac");
-    }
-}
-
-/* Registers the "bond/..." unixctl commands, each with a null 'aux'
- * argument. */
-static void
-bond_init(void)
-{
-    static const struct {
-        const char *name;
-        void (*handler)(struct unixctl_conn *, const char *, void *);
-    } commands[] = {
-        { "bond/list", bond_unixctl_list },
-        { "bond/show", bond_unixctl_show },
-        { "bond/migrate", bond_unixctl_migrate },
-        { "bond/set-active-slave", bond_unixctl_set_active_slave },
-        { "bond/enable-slave", bond_unixctl_enable_slave },
-        { "bond/disable-slave", bond_unixctl_disable_slave },
-        { "bond/hash", bond_unixctl_hash },
-    };
-    size_t k;
-
-    for (k = 0; k < sizeof commands / sizeof commands[0]; k++) {
-        unixctl_command_register(commands[k].name, commands[k].handler, NULL);
-    }
-}
-\f
-/* Port functions. */
-
-/* Performs periodic processing for 'port'.
- *
- * First refreshes interface carrier state: when the port has a netdev
- * monitor, for every device name the monitor reports; otherwise from
- * netdev_get_miimon() for every interface, once 'miimon_next_update' has been
- * reached.  Then runs LACP and bonding, and finally sends any packet that
- * cfm_run() returns for an interface that has CFM configured. */
-static void
-port_run(struct port *port)
-{
-    size_t idx;
-
-    if (!port->monitor) {
-        if (time_msec() >= port->miimon_next_update) {
-            for (idx = 0; idx < port->n_ifaces; idx++) {
-                struct iface *iface = port->ifaces[idx];
-
-                iface_update_carrier(iface, netdev_get_miimon(iface->netdev));
-            }
-            port->miimon_next_update = time_msec() + port->miimon_interval;
-        }
-    } else {
-        char *devname;
-
-        /* Track carrier going up and down on interfaces. */
-        while (netdev_monitor_poll(port->monitor, &devname) == 0) {
-            struct iface *iface = port_lookup_iface(port, devname);
-
-            if (iface) {
-                iface_update_carrier(iface, netdev_get_carrier(iface->netdev));
-            }
-            free(devname);
-        }
-    }
-
-    lacp_run(port);
-    bond_run(port);
-
-    for (idx = 0; idx < port->n_ifaces; idx++) {
-        struct iface *iface = port->ifaces[idx];
-        struct ofpbuf *packet;
-
-        if (!iface->cfm) {
-            continue;
-        }
-
-        packet = cfm_run(iface->cfm);
-        if (packet) {
-            /* The packet is released here once it has been sent. */
-            iface_send_packet(iface, packet);
-            ofpbuf_uninit(packet);
-            free(packet);
-        }
-    }
-}
-
-/* Registers the events that port_run() needs to be called for: the netdev
- * monitor if 'port' has one, otherwise the 'miimon_next_update' timer; then
- * the LACP and bonding waits; and the CFM wait of every interface that has
- * CFM configured. */
-static void
-port_wait(struct port *port)
-{
-    size_t idx;
-
-    if (!port->monitor) {
-        poll_timer_wait_until(port->miimon_next_update);
-    } else {
-        netdev_monitor_poll_wait(port->monitor);
-    }
-
-    lacp_wait(port);
-    bond_wait(port);
-
-    for (idx = 0; idx < port->n_ifaces; idx++) {
-        struct iface *iface = port->ifaces[idx];
-
-        if (!iface->cfm) {
-            continue;
-        }
-        cfm_wait(iface->cfm);
-    }
-}