port-array: Add port_array_delete() function.
[openvswitch] / vswitchd / bridge.c
index e20d407387785f5f6cb999ef0e7b2d092cbc893e..f6079928f4a7502a6ab2e62b9ad603bb3671834a 100644 (file)
@@ -20,6 +20,7 @@
 #include <arpa/inet.h>
 #include <ctype.h>
 #include <inttypes.h>
+#include <sys/socket.h>
 #include <net/if.h>
 #include <openflow/openflow.h>
 #include <signal.h>
@@ -157,19 +158,11 @@ struct bridge {
     struct list node;           /* Node in global list of bridges. */
     char *name;                 /* User-specified arbitrary name. */
     struct mac_learning *ml;    /* MAC learning table. */
-    bool sent_config_request;   /* Successfully sent config request? */
     uint8_t default_ea[ETH_ADDR_LEN]; /* Default MAC. */
 
     /* OpenFlow switch processing. */
     struct ofproto *ofproto;    /* OpenFlow switch. */
 
-    /* Description strings. */
-    char *mfr_desc;             /* Manufacturer. */
-    char *hw_desc;              /* Hardware. */
-    char *sw_desc;              /* Software version. */
-    char *serial_desc;          /* Serial number. */
-    char *dp_desc;              /* Datapath description. */
-
     /* Kernel datapath information. */
     struct dpif *dpif;          /* Datapath. */
     struct port_array ifaces;   /* Indexed by kernel datapath port number. */
@@ -186,9 +179,6 @@ struct bridge {
     /* Flow tracking. */
     bool flush;
 
-    /* Flow statistics gathering. */
-    time_t next_stats_request;
-
     /* Port mirroring. */
     struct mirror *mirrors[MAX_MIRRORS];
 
@@ -395,10 +385,6 @@ set_up_iface(const struct ovsrec_interface *iface_cfg, struct iface *iface,
         }
         netdev_options.args = &options;
         netdev_options.ethertype = NETDEV_ETH_TYPE_NONE;
-        netdev_options.may_create = true;
-        if (iface_is_internal(iface->port->bridge, iface_cfg->name)) {
-            netdev_options.may_open = true;
-        }
 
         error = netdev_open(&netdev_options, &iface->netdev);
 
@@ -733,7 +719,7 @@ bridge_reconfigure(const struct ovsrec_open_vswitch *ovs_cfg)
         dpid = bridge_pick_datapath_id(br, ea, hw_addr_iface);
         ofproto_set_datapath_id(br->ofproto, dpid);
 
-        dpid_string = xasprintf("%012"PRIx64, dpid);
+        dpid_string = xasprintf("%016"PRIx64, dpid);
         ovsrec_bridge_set_datapath_id(br->cfg, dpid_string);
         free(dpid_string);
 
@@ -1205,7 +1191,6 @@ bridge_create(const struct ovsrec_bridge *br_cfg)
     br->name = xstrdup(br_cfg->name);
     br->cfg = br_cfg;
     br->ml = mac_learning_create();
-    br->sent_config_request = false;
     eth_addr_nicira_random(br->default_ea);
 
     port_array_init(&br->ifaces);
@@ -1343,75 +1328,6 @@ bridge_get_controllers(const struct ovsrec_open_vswitch *ovs_cfg,
     return n_controllers;
 }
 
-static void
-bridge_update_desc(struct bridge *br OVS_UNUSED)
-{
-#if 0
-    bool changed = false;
-    const char *desc;
-
-    desc = cfg_get_string(0, "bridge.%s.mfr-desc", br->name);
-    if (desc != br->mfr_desc) {
-        free(br->mfr_desc);
-        if (desc) {
-            br->mfr_desc = xstrdup(desc);
-        } else {
-            br->mfr_desc = xstrdup(DEFAULT_MFR_DESC);
-        }
-        changed = true;
-    }
-
-    desc = cfg_get_string(0, "bridge.%s.hw-desc", br->name);
-    if (desc != br->hw_desc) {
-        free(br->hw_desc);
-        if (desc) {
-            br->hw_desc = xstrdup(desc);
-        } else {
-            br->hw_desc = xstrdup(DEFAULT_HW_DESC);
-        }
-        changed = true;
-    }
-
-    desc = cfg_get_string(0, "bridge.%s.sw-desc", br->name);
-    if (desc != br->sw_desc) {
-        free(br->sw_desc);
-        if (desc) {
-            br->sw_desc = xstrdup(desc);
-        } else {
-            br->sw_desc = xstrdup(DEFAULT_SW_DESC);
-        }
-        changed = true;
-    }
-
-    desc = cfg_get_string(0, "bridge.%s.serial-desc", br->name);
-    if (desc != br->serial_desc) {
-        free(br->serial_desc);
-        if (desc) {
-            br->serial_desc = xstrdup(desc);
-        } else {
-            br->serial_desc = xstrdup(DEFAULT_SERIAL_DESC);
-        }
-        changed = true;
-    }
-
-    desc = cfg_get_string(0, "bridge.%s.dp-desc", br->name);
-    if (desc != br->dp_desc) {
-        free(br->dp_desc);
-        if (desc) {
-            br->dp_desc = xstrdup(desc);
-        } else {
-            br->dp_desc = xstrdup(DEFAULT_DP_DESC);
-        }
-        changed = true;
-    }
-
-    if (changed) {
-        ofproto_set_desc(br->ofproto, br->mfr_desc, br->hw_desc,
-                br->sw_desc, br->serial_desc, br->dp_desc);
-    }
-#endif
-}
-
 static void
 bridge_reconfigure_one(const struct ovsrec_open_vswitch *ovs_cfg,
                        struct bridge *br)
@@ -1492,50 +1408,7 @@ bridge_reconfigure_one(const struct ovsrec_open_vswitch *ovs_cfg,
      * versa.  (XXX Should we delete all flows if we are switching from one
      * controller to another?) */
 
-#if 0
-    /* Configure OpenFlow management listeners. */
-    svec_init(&listeners);
-    cfg_get_all_strings(&listeners, "bridge.%s.openflow.listeners", br->name);
-    if (!listeners.n) {
-        svec_add_nocopy(&listeners, xasprintf("punix:%s/%s.mgmt",
-                                              ovs_rundir, br->name));
-    } else if (listeners.n == 1 && !strcmp(listeners.names[0], "none")) {
-        svec_clear(&listeners);
-    }
-    svec_sort_unique(&listeners);
-
-    svec_init(&old_listeners);
-    ofproto_get_listeners(br->ofproto, &old_listeners);
-    svec_sort_unique(&old_listeners);
-
-    if (!svec_equal(&listeners, &old_listeners)) {
-        ofproto_set_listeners(br->ofproto, &listeners);
-    }
-    svec_destroy(&listeners);
-    svec_destroy(&old_listeners);
-
-    /* Configure OpenFlow controller connection snooping. */
-    svec_init(&snoops);
-    cfg_get_all_strings(&snoops, "bridge.%s.openflow.snoops", br->name);
-    if (!snoops.n) {
-        svec_add_nocopy(&snoops, xasprintf("punix:%s/%s.snoop",
-                                           ovs_rundir, br->name));
-    } else if (snoops.n == 1 && !strcmp(snoops.names[0], "none")) {
-        svec_clear(&snoops);
-    }
-    svec_sort_unique(&snoops);
-
-    svec_init(&old_snoops);
-    ofproto_get_snoops(br->ofproto, &old_snoops);
-    svec_sort_unique(&old_snoops);
-
-    if (!svec_equal(&snoops, &old_snoops)) {
-        ofproto_set_snoops(br->ofproto, &snoops);
-    }
-    svec_destroy(&snoops);
-    svec_destroy(&old_snoops);
-#else
-    /* Default listener. */
+    /* Configure OpenFlow management listener. */
     svec_init(&listeners);
     svec_add_nocopy(&listeners, xasprintf("punix:%s/%s.mgmt",
                                           ovs_rundir, br->name));
@@ -1547,7 +1420,7 @@ bridge_reconfigure_one(const struct ovsrec_open_vswitch *ovs_cfg,
     svec_destroy(&listeners);
     svec_destroy(&old_listeners);
 
-    /* Default snoop. */
+    /* Configure OpenFlow controller connection snooping. */
     svec_init(&snoops);
     svec_add_nocopy(&snoops, xasprintf("punix:%s/%s.snoop",
                                        ovs_rundir, br->name));
@@ -1558,11 +1431,8 @@ bridge_reconfigure_one(const struct ovsrec_open_vswitch *ovs_cfg,
     }
     svec_destroy(&snoops);
     svec_destroy(&old_snoops);
-#endif
 
     mirror_reconfigure(br);
-
-    bridge_update_desc(br);
 }
 
 static void
@@ -1930,10 +1800,19 @@ bond_update_fake_iface_stats(struct port *port)
         struct netdev_stats slave_stats;
 
         if (!netdev_get_stats(port->ifaces[i]->netdev, &slave_stats)) {
-            bond_stats.rx_packets += slave_stats.rx_packets;
-            bond_stats.rx_bytes += slave_stats.rx_bytes;
-            bond_stats.tx_packets += slave_stats.tx_packets;
-            bond_stats.tx_bytes += slave_stats.tx_bytes;
+            /* XXX: We swap the stats here because they are swapped back when
+             * reported by the internal device.  The reason is that internal
+             * devices normally represent packets going into the system, but
+             * when used as a fake bond device they represent packets leaving
+             * the system.  We really should do this in the internal device
+             * itself, because changing it here reverses the counts from the
+             * perspective of the switch.  However, the internal device doesn't
+             * know what type of device it represents, so we have to do it
+             * here for now. */
+            bond_stats.tx_packets += slave_stats.rx_packets;
+            bond_stats.tx_bytes += slave_stats.rx_bytes;
+            bond_stats.rx_packets += slave_stats.tx_packets;
+            bond_stats.rx_bytes += slave_stats.tx_bytes;
         }
     }
 
@@ -1986,11 +1865,11 @@ bond_wait(struct bridge *br)
         for (j = 0; j < port->n_ifaces; j++) {
             struct iface *iface = port->ifaces[j];
             if (iface->delay_expires != LLONG_MAX) {
-                poll_timer_wait(iface->delay_expires - time_msec());
+                poll_timer_wait_until(iface->delay_expires);
             }
         }
         if (port->bond_fake_iface) {
-            poll_timer_wait(port->bond_next_fake_iface_update - time_msec());
+            poll_timer_wait_until(port->bond_next_fake_iface_update);
         }
     }
 }
@@ -2249,12 +2128,34 @@ static int flow_get_vlan(struct bridge *br, const flow_t *flow,
     return vlan;
 }
 
+/* Returns true if 'flow' is a gratuitous ARP: a broadcast ARP reply (sent by
+ * older Citrix-patched Linux DomU) or a broadcast ARP request with nw_src ==
+ * nw_dst (newer upstream kernels).  A VM sends one after migrating. */
+static bool
+is_gratuitous_arp(const flow_t *flow)
+{
+    return (flow->dl_type == htons(ETH_TYPE_ARP)
+            && eth_addr_is_broadcast(flow->dl_dst)
+            && (flow->nw_proto == ARP_OP_REPLY
+                || (flow->nw_proto == ARP_OP_REQUEST
+                    && flow->nw_src == flow->nw_dst)));
+}
+
 static void
 update_learning_table(struct bridge *br, const flow_t *flow, int vlan,
                       struct port *in_port)
 {
-    tag_type rev_tag = mac_learning_learn(br->ml, flow->dl_src,
-                                          vlan, in_port->port_idx);
+    enum grat_arp_lock_type lock_type;
+    tag_type rev_tag;
+
+    /* We don't want to learn from gratuitous ARP packets that are reflected
+     * back over bond slaves so we lock the learning table. */
+    lock_type = !is_gratuitous_arp(flow) ? GRAT_ARP_LOCK_NONE :
+                    (in_port->n_ifaces == 1) ? GRAT_ARP_LOCK_SET :
+                                               GRAT_ARP_LOCK_CHECK;
+
+    rev_tag = mac_learning_learn(br->ml, flow->dl_src, vlan, in_port->port_idx,
+                                 lock_type);
     if (rev_tag) {
         /* The log messages here could actually be useful in debugging,
          * so keep the rate limit relatively high. */
@@ -2268,14 +2169,6 @@ update_learning_table(struct bridge *br, const flow_t *flow, int vlan,
     }
 }
 
-static bool
-is_bcast_arp_reply(const flow_t *flow)
-{
-    return (flow->dl_type == htons(ETH_TYPE_ARP)
-            && flow->nw_proto == ARP_OP_REPLY
-            && eth_addr_is_broadcast(flow->dl_dst));
-}
-
 /* Determines whether packets in 'flow' within 'br' should be forwarded or
  * dropped.  Returns true if they may be forwarded, false if they should be
  * dropped.
@@ -2354,6 +2247,7 @@ is_admissible(struct bridge *br, const flow_t *flow, bool have_packet,
     /* Packets received on bonds need special attention to avoid duplicates. */
     if (in_port->n_ifaces > 1) {
         int src_idx;
+        bool is_grat_arp_locked;
 
         if (eth_addr_is_multicast(flow->dl_dst)) {
             *tags |= in_port->active_iface_tag;
@@ -2365,11 +2259,15 @@ is_admissible(struct bridge *br, const flow_t *flow, bool have_packet,
 
         /* Drop all packets for which we have learned a different input
          * port, because we probably sent the packet on one slave and got
-         * it back on the other.  Broadcast ARP replies are an exception
-         * to this rule: the host has moved to another switch. */
-        src_idx = mac_learning_lookup(br->ml, flow->dl_src, vlan);
+         * it back on the other.  Gratuitous ARP packets are an exception
+         * to this rule: the host has moved to another switch.  The exception
+         * to the exception is if we locked the learning table to avoid
+         * reflections on bond slaves.  If this is the case, just drop the
+         * packet now. */
+        src_idx = mac_learning_lookup(br->ml, flow->dl_src, vlan,
+                                      &is_grat_arp_locked);
         if (src_idx != -1 && src_idx != in_port->port_idx &&
-            !is_bcast_arp_reply(flow)) {
+            (!is_gratuitous_arp(flow) || is_grat_arp_locked)) {
                 return false;
         }
     }
@@ -2402,7 +2300,8 @@ process_flow(struct bridge *br, const flow_t *flow,
     }
 
     /* Determine output port. */
-    out_port_idx = mac_learning_lookup_tag(br->ml, flow->dl_dst, vlan, tags);
+    out_port_idx = mac_learning_lookup_tag(br->ml, flow->dl_dst, vlan, tags,
+                                           NULL);
     if (out_port_idx >= 0 && out_port_idx < br->n_ports) {
         out_port = br->ports[out_port_idx];
     } else if (!packet && !eth_addr_is_multicast(flow->dl_dst)) {
@@ -3391,8 +3290,6 @@ port_reconfigure(struct port *port, const struct ovsrec_port *cfg)
     }
     bitmap_free(port->trunks);
     port->trunks = trunks;
-
-    shash_destroy(&new_ifaces);
 }
 
 static void
@@ -3615,6 +3512,8 @@ iface_create(struct port *port, const struct ovsrec_interface *if_cfg)
     iface->netdev = NULL;
     iface->cfg = if_cfg;
 
+    shash_add_assert(&br->iface_by_name, iface->name, iface);
+
     /* Attempt to create the network interface in case it doesn't exist yet. */
     if (!iface_is_internal(br, iface->name)) {
         error = set_up_iface(if_cfg, iface, true);
@@ -3622,14 +3521,13 @@ iface_create(struct port *port, const struct ovsrec_interface *if_cfg)
             VLOG_WARN("could not create iface %s: %s", iface->name,
                       strerror(error));
 
+            shash_find_and_delete_assert(&br->iface_by_name, iface->name);
             free(iface->name);
             free(iface);
             return NULL;
         }
     }
 
-    shash_add_assert(&br->iface_by_name, iface->name, iface);
-
     if (port->n_ifaces >= port->allocated_ifaces) {
         port->ifaces = x2nrealloc(port->ifaces, &port->allocated_ifaces,
                                   sizeof *port->ifaces);