static struct bridge *bridge_create(const char *name);
static void bridge_destroy(struct bridge *);
static struct bridge *bridge_lookup(const char *name);
-static void bridge_unixctl_dump_flows(struct unixctl_conn *, const char *);
+static unixctl_cb_func bridge_unixctl_dump_flows;
static int bridge_run_one(struct bridge *);
static void bridge_reconfigure_one(struct bridge *);
static void bridge_reconfigure_controller(struct bridge *);
static struct iface *bridge_get_local_iface(struct bridge *);
static uint64_t dpid_from_hash(const void *, size_t nbytes);
-static void bridge_unixctl_fdb_show(struct unixctl_conn *, const char *args);
+static unixctl_cb_func bridge_unixctl_fdb_show;
static void bond_init(void);
static void bond_run(struct bridge *);
static struct iface *iface_lookup(const struct bridge *, const char *name);
static struct iface *iface_from_dp_ifidx(const struct bridge *,
uint16_t dp_ifidx);
+static bool iface_is_internal(const struct bridge *, const char *name);
+static void iface_set_mac(struct iface *);
/* Hooks into ofproto processing. */
static struct ofhooks bridge_ofhooks;
struct svec dpif_names;
size_t i;
- unixctl_command_register("fdb/show", bridge_unixctl_fdb_show);
+ unixctl_command_register("fdb/show", bridge_unixctl_fdb_show, NULL);
svec_init(&dpif_names);
dp_enumerate(&dpif_names);
}
svec_destroy(&dpif_names);
- unixctl_command_register("bridge/dump-flows", bridge_unixctl_dump_flows);
+ unixctl_command_register("bridge/dump-flows", bridge_unixctl_dump_flows,
+ NULL);
bond_init();
bridge_reconfigure();
}
static bool
-set_iface_policing(struct bridge *br UNUSED, struct iface *iface,
+set_iface_properties(struct bridge *br UNUSED, struct iface *iface,
void *aux UNUSED)
{
- int rate = cfg_get_int(0, "port.%s.ingress.policing-rate", iface->name);
- int burst = cfg_get_int(0, "port.%s.ingress.policing-burst", iface->name);
+ int rate, burst;
+
+ /* Set policing attributes. */
+ rate = cfg_get_int(0, "port.%s.ingress.policing-rate", iface->name);
+ burst = cfg_get_int(0, "port.%s.ingress.policing-burst", iface->name);
netdev_set_policing(iface->netdev, rate, burst);
+
+ /* Set MAC address of internal interfaces other than the local
+ * interface. */
+ if (iface->dp_ifidx != ODPP_LOCAL
+ && iface_is_internal(br, iface->name)) {
+ iface_set_mac(iface);
+ }
+
return true;
}
bool internal;
int error;
- /* It's an internal interface if it's marked that way, or if
- * it's a bonded interface for which we're faking up a network
- * device. */
- internal = cfg_get_bool(0, "iface.%s.internal", if_name);
- if (cfg_get_bool(0, "bonding.%s.fake-iface", if_name)) {
- struct port *port = port_lookup(br, if_name);
- if (port && port->n_ifaces > 1) {
- internal = true;
- }
- }
-
/* Add to datapath. */
+ internal = iface_is_internal(br, if_name);
error = dpif_port_add(br->dpif, if_name,
internal ? ODP_PORT_INTERNAL : 0, NULL);
if (error == EFBIG) {
LIST_FOR_EACH (br, struct bridge, node, &all_bridges) {
for (i = 0; i < br->n_ports; i++) {
struct port *port = br->ports[i];
+
port_update_vlan_compat(port);
port_update_bonding(port);
}
}
LIST_FOR_EACH (br, struct bridge, node, &all_bridges) {
brstp_reconfigure(br);
- iterate_and_prune_ifaces(br, set_iface_policing, NULL);
+ iterate_and_prune_ifaces(br, set_iface_properties, NULL);
}
}
\f
/* Bridge unixctl user interface functions. */
static void
-bridge_unixctl_fdb_show(struct unixctl_conn *conn, const char *args)
+bridge_unixctl_fdb_show(struct unixctl_conn *conn,
+ const char *args, void *aux UNUSED)
{
struct ds ds = DS_EMPTY_INITIALIZER;
const struct bridge *br;
int error;
assert(!bridge_lookup(name));
- br = xcalloc(1, sizeof *br);
+ br = xzalloc(sizeof *br);
error = dpif_create(name, &br->dpif);
if (error == EEXIST || error == EBUSY) {
/* Handle requests for a listing of all flows known by the OpenFlow
* stack, including those normally hidden. */
static void
-bridge_unixctl_dump_flows(struct unixctl_conn *conn, const char *args)
+bridge_unixctl_dump_flows(struct unixctl_conn *conn,
+ const char *args, void *aux UNUSED)
{
struct bridge *br;
struct ds results;
if (port_includes_vlan(port, m->out_vlan)
&& set_dst(dst, flow, in_port, port, tags))
{
+ int flow_vlan;
+
if (port->vlan < 0) {
dst->vlan = m->out_vlan;
}
if (dst_is_duplicate(dsts, dst - dsts, dst)) {
continue;
}
- if (dst->dp_ifidx == flow->in_port
- && dst->vlan == vlan) {
+
+ /* Use the vlan tag on the original flow instead of
+ * the one passed in the vlan parameter. This ensures
+ * that we compare the vlan from before any implicit
+ * tagging tags place. This is necessary because
+ * dst->vlan is the final vlan, after removing implicit
+ * tags. */
+ flow_vlan = ntohs(flow->dl_vlan);
+ if (flow_vlan == 0) {
+ flow_vlan = OFP_VLAN_NONE;
+ }
+ if (port == in_port && dst->vlan == flow_vlan) {
/* Don't send out input port on same VLAN. */
continue;
}
goto done;
}
- /* Multicast (and broadcast) packets on bonds need special attention, to
- * avoid receiving duplicates. */
- if (in_port->n_ifaces > 1 && eth_addr_is_multicast(flow->dl_dst)) {
- *tags |= in_port->active_iface_tag;
- if (in_port->active_iface != in_iface->port_ifidx) {
- /* Drop all multicast packets on inactive slaves. */
- goto done;
- } else {
- /* Drop all multicast packets for which we have learned a different
- * input port, because we probably sent the packet on one slave
- * and got it back on the active slave. Broadcast ARP replies are
- * an exception to this rule: the host has moved to another
- * switch. */
- int src_idx = mac_learning_lookup(br->ml, flow->dl_src, vlan);
- if (src_idx != -1 && src_idx != in_port->port_idx) {
- if (packet) {
- if (!is_bcast_arp_reply(flow, packet)) {
- goto done;
- }
- } else {
- /* No way to know whether it's an ARP reply, because the
- * flow entry doesn't include enough information and we
- * don't have a packet. Punt. */
- return false;
- }
+ /* Packets received on bonds need special attention to avoid duplicates. */
+ if (in_port->n_ifaces > 1) {
+ int src_idx;
+
+ if (eth_addr_is_multicast(flow->dl_dst)) {
+ *tags |= in_port->active_iface_tag;
+ if (in_port->active_iface != in_iface->port_ifidx) {
+ /* Drop all multicast packets on inactive slaves. */
+ goto done;
}
}
+
+ /* Drop all packets for which we have learned a different input
+ * port, because we probably sent the packet on one slave and got
+ * it back on the other. Broadcast ARP replies are an exception
+ * to this rule: the host has moved to another switch. */
+ src_idx = mac_learning_lookup(br->ml, flow->dl_src, vlan);
+ if (src_idx != -1 && src_idx != in_port->port_idx &&
+ (!packet || !is_bcast_arp_reply(flow, packet))) {
+ goto done;
+ }
}
/* MAC learning. */
tags);
if (out_port_idx >= 0 && out_port_idx < br->n_ports) {
out_port = br->ports[out_port_idx];
+ } else if (!packet) {
+ /* If we are revalidating but don't have a learning entry then
+ * eject the flow. Installing a flow that floods packets will
+ * prevent us from seeing future packets and learning properly. */
+ return false;
}
}
/* Shifts 'hash' from 'from' to 'to' within 'port'. */
static void
bond_shift_load(struct slave_balance *from, struct slave_balance *to,
- struct bond_entry *hash)
+ int hash_idx)
{
+ struct bond_entry *hash = from->hashes[hash_idx];
struct port *port = from->iface->port;
uint64_t delta = hash->tx_bytes;
* it require more work, the only purpose it would be to allow that hash to
* be migrated to another slave in this rebalancing run, and there is no
* point in doing that. */
- if (from->hashes[0] == hash) {
+ if (hash_idx == 0) {
from->hashes++;
} else {
- int i = hash - from->hashes[0];
- memmove(from->hashes + i, from->hashes + i + 1,
- (from->n_hashes - (i + 1)) * sizeof *from->hashes);
+ memmove(from->hashes + hash_idx, from->hashes + hash_idx + 1,
+ (from->n_hashes - (hash_idx + 1)) * sizeof *from->hashes);
}
from->n_hashes--;
/* 'from' is carrying significantly more load than 'to', and that
* load is split across at least two different hashes. Pick a hash
* to migrate to 'to' (the least-loaded slave), given that doing so
- * must not cause 'to''s load to exceed 'from''s load.
+ * must decrease the ratio of the load on the two slaves by at
+ * least 0.1.
*
* The sort order we use means that we prefer to shift away the
* smallest hashes instead of the biggest ones. There is little
* reason behind this decision; we could use the opposite sort
* order to shift away big hashes ahead of small ones. */
size_t i;
+ bool order_swapped;
for (i = 0; i < from->n_hashes; i++) {
+ double old_ratio, new_ratio;
uint64_t delta = from->hashes[i]->tx_bytes;
- if (to->tx_bytes + delta < from->tx_bytes - delta) {
+
+ if (delta == 0 || from->tx_bytes - delta == 0) {
+ /* Pointless move. */
+ continue;
+ }
+
+ order_swapped = from->tx_bytes - delta < to->tx_bytes + delta;
+
+ if (to->tx_bytes == 0) {
+ /* Nothing on the new slave, move it. */
+ break;
+ }
+
+ old_ratio = (double)from->tx_bytes / to->tx_bytes;
+ new_ratio = (double)(from->tx_bytes - delta) /
+ (to->tx_bytes + delta);
+
+ if (new_ratio == 0) {
+ /* Should already be covered but check to prevent division
+ * by zero. */
+ continue;
+ }
+
+ if (new_ratio < 1) {
+ new_ratio = 1 / new_ratio;
+ }
+
+ if (old_ratio - new_ratio > 0.1) {
+ /* Would decrease the ratio, move it. */
break;
}
}
if (i < from->n_hashes) {
- bond_shift_load(from, to, from->hashes[i]);
+ bond_shift_load(from, to, i);
+ port->bond_compat_is_stale = true;
+
+ /* If the result of the migration changed the relative order of
+ * 'from' and 'to' swap them back to maintain invariants. */
+ if (order_swapped) {
+ swap_bals(from, to);
+ }
/* Re-sort 'bals'. Note that this may make 'from' and 'to'
* point to different slave_balance structures. It is only
} else {
from++;
}
- port->bond_compat_is_stale = true;
}
}
/* Bonding unixctl user interface functions. */
static void
-bond_unixctl_list(struct unixctl_conn *conn, const char *args UNUSED)
+bond_unixctl_list(struct unixctl_conn *conn,
+ const char *args UNUSED, void *aux UNUSED)
{
struct ds ds = DS_EMPTY_INITIALIZER;
const struct bridge *br;
}
static void
-bond_unixctl_show(struct unixctl_conn *conn, const char *args)
+bond_unixctl_show(struct unixctl_conn *conn,
+ const char *args, void *aux UNUSED)
{
struct ds ds = DS_EMPTY_INITIALIZER;
const struct port *port;
}
static void
-bond_unixctl_migrate(struct unixctl_conn *conn, const char *args_)
+bond_unixctl_migrate(struct unixctl_conn *conn, const char *args_,
+ void *aux UNUSED)
{
char *args = (char *) args_;
char *save_ptr = NULL;
}
static void
-bond_unixctl_set_active_slave(struct unixctl_conn *conn, const char *args_)
+bond_unixctl_set_active_slave(struct unixctl_conn *conn, const char *args_,
+ void *aux UNUSED)
{
char *args = (char *) args_;
char *save_ptr = NULL;
}
static void
-bond_unixctl_enable_slave(struct unixctl_conn *conn, const char *args)
+bond_unixctl_enable_slave(struct unixctl_conn *conn, const char *args,
+ void *aux UNUSED)
{
enable_slave(conn, args, true);
}
static void
-bond_unixctl_disable_slave(struct unixctl_conn *conn, const char *args)
+bond_unixctl_disable_slave(struct unixctl_conn *conn, const char *args,
+ void *aux UNUSED)
{
enable_slave(conn, args, false);
}
static void
-bond_unixctl_hash(struct unixctl_conn *conn, const char *args)
+bond_unixctl_hash(struct unixctl_conn *conn, const char *args,
+ void *aux UNUSED)
{
uint8_t mac[ETH_ADDR_LEN];
uint8_t hash;
static void
bond_init(void)
{
- unixctl_command_register("bond/list", bond_unixctl_list);
- unixctl_command_register("bond/show", bond_unixctl_show);
- unixctl_command_register("bond/migrate", bond_unixctl_migrate);
+ unixctl_command_register("bond/list", bond_unixctl_list, NULL);
+ unixctl_command_register("bond/show", bond_unixctl_show, NULL);
+ unixctl_command_register("bond/migrate", bond_unixctl_migrate, NULL);
unixctl_command_register("bond/set-active-slave",
- bond_unixctl_set_active_slave);
- unixctl_command_register("bond/enable-slave", bond_unixctl_enable_slave);
- unixctl_command_register("bond/disable-slave", bond_unixctl_disable_slave);
- unixctl_command_register("bond/hash", bond_unixctl_hash);
+ bond_unixctl_set_active_slave, NULL);
+ unixctl_command_register("bond/enable-slave", bond_unixctl_enable_slave,
+ NULL);
+ unixctl_command_register("bond/disable-slave", bond_unixctl_disable_slave,
+ NULL);
+ unixctl_command_register("bond/hash", bond_unixctl_hash, NULL);
}
\f
/* Port functions. */
{
struct port *port;
- port = xcalloc(1, sizeof *port);
+ port = xzalloc(sizeof *port);
port->bridge = br;
port->port_idx = br->n_ports;
port->vlan = -1;
netdev_get_etheraddr(iface->netdev, slave->mac);
}
+ if (cfg_get_bool(0, "bonding.%s.fake-iface", port->name)) {
+ struct netdev *bond_netdev;
+
+ if (!netdev_open(port->name, NETDEV_ETH_TYPE_NONE, &bond_netdev)) {
+ if (bond.up) {
+ netdev_turn_flags_on(bond_netdev, NETDEV_UP, true);
+ } else {
+ netdev_turn_flags_off(bond_netdev, NETDEV_UP, true);
+ }
+ netdev_close(bond_netdev);
+ }
+ }
+
proc_net_compat_update_bond(port->name, &bond);
free(bond.slaves);
}
{
struct iface *iface;
- iface = xcalloc(1, sizeof *iface);
+ iface = xzalloc(sizeof *iface);
iface->port = port;
iface->port_ifidx = port->n_ifaces;
iface->name = xstrdup(name);
{
return port_array_get(&br->ifaces, dp_ifidx);
}
+
+/* Returns true if 'iface' is the name of an "internal" interface on bridge
+ * 'br', that is, an interface that is entirely simulated within the datapath.
+ * The local port (ODPP_LOCAL) is always an internal interface. Other local
+ * interfaces are created by setting "iface.<iface>.internal = true".
+ *
+ * In addition, we have a kluge-y feature that creates an internal port with
+ * the name of a bonded port if "bonding.<bondname>.fake-iface = true" is set.
+ * This feature needs to go away in the long term. Until then, this is one
+ * reason why this function takes a name instead of a struct iface: the fake
+ * interfaces created this way do not have a struct iface. */
+static bool
+iface_is_internal(const struct bridge *br, const char *iface)
+{
+ if (!strcmp(iface, br->name)
+ || cfg_get_bool(0, "iface.%s.internal", iface)) {
+ return true;
+ }
+
+ if (cfg_get_bool(0, "bonding.%s.fake-iface", iface)) {
+ struct port *port = port_lookup(br, iface);
+ if (port && port->n_ifaces > 1) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/* Set Ethernet address of 'iface', if one is specified in the configuration
+ * file. */
+static void
+iface_set_mac(struct iface *iface)
+{
+ uint64_t mac = cfg_get_mac(0, "iface.%s.mac", iface->name);
+ if (mac) {
+ static uint8_t ea[ETH_ADDR_LEN];
+
+ eth_addr_from_uint64(mac, ea);
+ if (eth_addr_is_multicast(ea)) {
+ VLOG_ERR("interface %s: cannot set MAC to multicast address",
+ iface->name);
+ } else if (iface->dp_ifidx == ODPP_LOCAL) {
+ VLOG_ERR("ignoring iface.%s.mac; use bridge.%s.mac instead",
+ iface->name, iface->name);
+ } else {
+ int error = netdev_set_etheraddr(iface->netdev, ea);
+ if (error) {
+ VLOG_ERR("interface %s: setting MAC failed (%s)",
+ iface->name, strerror(error));
+ }
+ }
+ }
+}
\f
/* Port mirroring. */
VLOG_INFO("created port mirror %s on bridge %s", name, br->name);
bridge_flush(br);
- br->mirrors[i] = m = xcalloc(1, sizeof *m);
+ br->mirrors[i] = m = xzalloc(sizeof *m);
m->bridge = br;
m->idx = i;
m->name = xstrdup(name);