From: Jesse Gross Date: Tue, 10 Nov 2009 23:12:01 +0000 (-0800) Subject: Merge citrix branch into master. X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=d65349ea28bb67a0062a9b4b60ff97538206373b;p=openvswitch Merge citrix branch into master. --- d65349ea28bb67a0062a9b4b60ff97538206373b diff --cc datapath/datapath.c index 6e97c345,1b5f57f8..178ff2d8 --- a/datapath/datapath.c +++ b/datapath/datapath.c @@@ -1011,18 -1016,22 +1011,18 @@@ static int del_flow(struct datapath *dp if (!flow) goto error; - if (cmd == ODP_FLOW_DEL) { - /* XXX redundant lookup */ - error = dp_table_delete(table, flow); - if (error) - goto error; + /* XXX redundant lookup */ + error = dp_table_delete(table, flow); + if (error) + goto error; - /* XXX These statistics might lose a few packets, since other - * CPUs can be using this flow. We used to synchronize_rcu() - * to make sure that we get completely accurate stats, but that - * blows our performance, badly. */ - dp->n_flows--; - error = answer_query(flow, 0, ufp); - flow_deferred_free(flow); - } else { - error = answer_query(flow, uf.flags, ufp); - } + /* XXX These statistics might lose a few packets, since other CPUs can + * be using this flow. We used to synchronize_rcu() to make sure that + * we get completely accurate stats, but that blows our performance, + * badly. 
*/ + dp->n_flows--; - error = answer_query(flow, uf.flags, ufp); ++ error = answer_query(flow, 0, ufp); + flow_deferred_free(flow); error: return error; @@@ -1044,9 -1054,9 +1044,9 @@@ static int query_flows(struct datapath flow = dp_table_lookup(table, &uf.key); if (!flow) - error = __clear_user(&ufp->stats, sizeof ufp->stats); + error = __put_user(ENOENT, &ufp->stats.error); else -- error = answer_query(flow, 0, ufp); ++ error = answer_query(flow, uf.flags, ufp); if (error) return -EFAULT; } diff --cc lib/dpif-linux.c index 8216d187,00000000..2bf329f4 mode 100644,000000..100644 --- a/lib/dpif-linux.c +++ b/lib/dpif-linux.c @@@ -1,762 -1,0 +1,762 @@@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include "dpif.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dpif-provider.h" +#include "ofpbuf.h" +#include "poll-loop.h" +#include "rtnetlink.h" +#include "svec.h" +#include "util.h" + +#include "vlog.h" +#define THIS_MODULE VLM_dpif_linux + +/* Datapath interface for the openvswitch Linux kernel module. */ +struct dpif_linux { + struct dpif dpif; + int fd; + + /* Used by dpif_linux_get_all_names(). */ + char *local_ifname; + int minor; + + /* Change notification. */ + int local_ifindex; /* Ifindex of local port. */ + struct svec changed_ports; /* Ports that have changed. 
*/ + struct rtnetlink_notifier port_notifier; + bool change_error; +}; + +static struct vlog_rate_limit error_rl = VLOG_RATE_LIMIT_INIT(9999, 5); + +static int do_ioctl(const struct dpif *, int cmd, const void *arg); +static int lookup_minor(const char *name, int *minor); +static int finish_open(struct dpif *, const char *local_ifname); +static int get_openvswitch_major(void); +static int create_minor(const char *name, int minor, struct dpif **dpifp); +static int open_minor(int minor, struct dpif **dpifp); +static int make_openvswitch_device(int minor, char **fnp); +static void dpif_linux_port_changed(const struct rtnetlink_change *, + void *dpif); + +static struct dpif_linux * +dpif_linux_cast(const struct dpif *dpif) +{ + dpif_assert_class(dpif, &dpif_linux_class); + return CONTAINER_OF(dpif, struct dpif_linux, dpif); +} + +static int +dpif_linux_enumerate(struct svec *all_dps) +{ + int major; + int error; + int i; + + /* Check that the Open vSwitch module is loaded. */ + major = get_openvswitch_major(); + if (major < 0) { + return -major; + } + + error = 0; + for (i = 0; i < ODP_MAX; i++) { + struct dpif *dpif; + char devname[16]; + int retval; + + sprintf(devname, "dp%d", i); + retval = dpif_open(devname, &dpif); + if (!retval) { + svec_add(all_dps, devname); + dpif_close(dpif); + } else if (retval != ENODEV && !error) { + error = retval; + } + } + return error; +} + +static int +dpif_linux_open(const char *name UNUSED, char *suffix, bool create, + struct dpif **dpifp) +{ + int minor; + + minor = !strncmp(name, "dp", 2) + && isdigit((unsigned char)name[2]) ? atoi(name + 2) : -1; + if (create) { + if (minor >= 0) { + return create_minor(suffix, minor, dpifp); + } else { + /* Scan for unused minor number. */ + for (minor = 0; minor < ODP_MAX; minor++) { + int error = create_minor(suffix, minor, dpifp); + if (error != EBUSY) { + return error; + } + } + + /* All datapath numbers in use. 
*/ + return ENOBUFS; + } + } else { + struct dpif_linux *dpif; + struct odp_port port; + int error; + + if (minor < 0) { + error = lookup_minor(suffix, &minor); + if (error) { + return error; + } + } + + error = open_minor(minor, dpifp); + if (error) { + return error; + } + dpif = dpif_linux_cast(*dpifp); + + /* We need the local port's ifindex for the poll function. Start by + * getting the local port's name. */ + memset(&port, 0, sizeof port); + port.port = ODPP_LOCAL; + if (ioctl(dpif->fd, ODP_PORT_QUERY, &port)) { + error = errno; + if (error != ENODEV) { + VLOG_WARN("%s: probe returned unexpected error: %s", + dpif_name(*dpifp), strerror(error)); + } + dpif_close(*dpifp); + return error; + } + + /* Then use that to finish up opening. */ + return finish_open(&dpif->dpif, port.devname); + } +} + +static void +dpif_linux_close(struct dpif *dpif_) +{ + struct dpif_linux *dpif = dpif_linux_cast(dpif_); + rtnetlink_notifier_unregister(&dpif->port_notifier); + svec_destroy(&dpif->changed_ports); + free(dpif->local_ifname); + close(dpif->fd); + free(dpif); +} + +static int +dpif_linux_get_all_names(const struct dpif *dpif_, struct svec *all_names) +{ + struct dpif_linux *dpif = dpif_linux_cast(dpif_); + + svec_add_nocopy(all_names, xasprintf("dp%d", dpif->minor)); + svec_add(all_names, dpif->local_ifname); + return 0; +} + +static int +dpif_linux_delete(struct dpif *dpif_) +{ + return do_ioctl(dpif_, ODP_DP_DESTROY, NULL); +} + +static int +dpif_linux_get_stats(const struct dpif *dpif_, struct odp_stats *stats) +{ + return do_ioctl(dpif_, ODP_DP_STATS, stats); +} + +static int +dpif_linux_get_drop_frags(const struct dpif *dpif_, bool *drop_fragsp) +{ + int drop_frags; + int error; + + error = do_ioctl(dpif_, ODP_GET_DROP_FRAGS, &drop_frags); + if (!error) { + *drop_fragsp = drop_frags & 1; + } + return error; +} + +static int +dpif_linux_set_drop_frags(struct dpif *dpif_, bool drop_frags) +{ + int drop_frags_int = drop_frags; + return do_ioctl(dpif_, 
ODP_SET_DROP_FRAGS, &drop_frags_int); +} + +static int +dpif_linux_port_add(struct dpif *dpif_, const char *devname, uint16_t flags, + uint16_t *port_no) +{ + struct odp_port port; + int error; + + memset(&port, 0, sizeof port); + strncpy(port.devname, devname, sizeof port.devname); + port.flags = flags; + error = do_ioctl(dpif_, ODP_PORT_ADD, &port); + if (!error) { + *port_no = port.port; + } + return error; +} + +static int +dpif_linux_port_del(struct dpif *dpif_, uint16_t port_no) +{ + int tmp = port_no; + return do_ioctl(dpif_, ODP_PORT_DEL, &tmp); +} + +static int +dpif_linux_port_query_by_number(const struct dpif *dpif_, uint16_t port_no, + struct odp_port *port) +{ + memset(port, 0, sizeof *port); + port->port = port_no; + return do_ioctl(dpif_, ODP_PORT_QUERY, port); +} + +static int +dpif_linux_port_query_by_name(const struct dpif *dpif_, const char *devname, + struct odp_port *port) +{ + memset(port, 0, sizeof *port); + strncpy(port->devname, devname, sizeof port->devname); + return do_ioctl(dpif_, ODP_PORT_QUERY, port); +} + +static int +dpif_linux_flow_flush(struct dpif *dpif_) +{ + return do_ioctl(dpif_, ODP_FLOW_FLUSH, NULL); +} + +static int +dpif_linux_port_list(const struct dpif *dpif_, struct odp_port *ports, int n) +{ + struct odp_portvec pv; + int error; + + pv.ports = ports; + pv.n_ports = n; + error = do_ioctl(dpif_, ODP_PORT_LIST, &pv); + return error ? 
-error : pv.n_ports; +} + +static int +dpif_linux_port_poll(const struct dpif *dpif_, char **devnamep) +{ + struct dpif_linux *dpif = dpif_linux_cast(dpif_); + + if (dpif->change_error) { + dpif->change_error = false; + svec_clear(&dpif->changed_ports); + return ENOBUFS; + } else if (dpif->changed_ports.n) { + *devnamep = dpif->changed_ports.names[--dpif->changed_ports.n]; + return 0; + } else { + return EAGAIN; + } +} + +static void +dpif_linux_port_poll_wait(const struct dpif *dpif_) +{ + struct dpif_linux *dpif = dpif_linux_cast(dpif_); + if (dpif->changed_ports.n || dpif->change_error) { + poll_immediate_wake(); + } else { + rtnetlink_notifier_wait(); + } +} + +static int +dpif_linux_port_group_get(const struct dpif *dpif_, int group, + uint16_t ports[], int n) +{ + struct odp_port_group pg; + int error; + + assert(n <= UINT16_MAX); + pg.group = group; + pg.ports = ports; + pg.n_ports = n; + error = do_ioctl(dpif_, ODP_PORT_GROUP_GET, &pg); + return error ? -error : pg.n_ports; +} + +static int +dpif_linux_port_group_set(struct dpif *dpif_, int group, + const uint16_t ports[], int n) +{ + struct odp_port_group pg; + + assert(n <= UINT16_MAX); + pg.group = group; + pg.ports = (uint16_t *) ports; + pg.n_ports = n; + return do_ioctl(dpif_, ODP_PORT_GROUP_SET, &pg); +} + +static int +dpif_linux_flow_get(const struct dpif *dpif_, struct odp_flow flows[], int n) +{ + struct odp_flowvec fv; + fv.flows = flows; + fv.n_flows = n; + return do_ioctl(dpif_, ODP_FLOW_GET, &fv); +} + +static int +dpif_linux_flow_put(struct dpif *dpif_, struct odp_flow_put *put) +{ + return do_ioctl(dpif_, ODP_FLOW_PUT, put); +} + +static int +dpif_linux_flow_del(struct dpif *dpif_, struct odp_flow *flow) +{ + return do_ioctl(dpif_, ODP_FLOW_DEL, flow); +} + +static int +dpif_linux_flow_list(const struct dpif *dpif_, struct odp_flow flows[], int n) +{ + struct odp_flowvec fv; + int error; + + fv.flows = flows; + fv.n_flows = n; + error = do_ioctl(dpif_, ODP_FLOW_LIST, &fv); + return error ? 
-error : fv.n_flows; +} + +static int +dpif_linux_execute(struct dpif *dpif_, uint16_t in_port, + const union odp_action actions[], int n_actions, + const struct ofpbuf *buf) +{ + struct odp_execute execute; + memset(&execute, 0, sizeof execute); + execute.in_port = in_port; + execute.actions = (union odp_action *) actions; + execute.n_actions = n_actions; + execute.data = buf->data; + execute.length = buf->size; + return do_ioctl(dpif_, ODP_EXECUTE, &execute); +} + +static int +dpif_linux_recv_get_mask(const struct dpif *dpif_, int *listen_mask) +{ + return do_ioctl(dpif_, ODP_GET_LISTEN_MASK, listen_mask); +} + +static int +dpif_linux_recv_set_mask(struct dpif *dpif_, int listen_mask) +{ + return do_ioctl(dpif_, ODP_SET_LISTEN_MASK, &listen_mask); +} + +static int +dpif_linux_recv(struct dpif *dpif_, struct ofpbuf **bufp) +{ + struct dpif_linux *dpif = dpif_linux_cast(dpif_); + struct ofpbuf *buf; + int retval; + int error; + + buf = ofpbuf_new(65536); + retval = read(dpif->fd, ofpbuf_tail(buf), ofpbuf_tailroom(buf)); + if (retval < 0) { + error = errno; + if (error != EAGAIN) { + VLOG_WARN_RL(&error_rl, "%s: read failed: %s", + dpif_name(dpif_), strerror(error)); + } + } else if (retval >= sizeof(struct odp_msg)) { + struct odp_msg *msg = buf->data; + if (msg->length <= retval) { + buf->size += retval; + *bufp = buf; + return 0; + } else { + VLOG_WARN_RL(&error_rl, "%s: discarding message truncated " - "from %zu bytes to %d", ++ "from %"PRIu32" bytes to %d", + dpif_name(dpif_), msg->length, retval); + error = ERANGE; + } + } else if (!retval) { + VLOG_WARN_RL(&error_rl, "%s: unexpected end of file", dpif_name(dpif_)); + error = EPROTO; + } else { + VLOG_WARN_RL(&error_rl, + "%s: discarding too-short message (%d bytes)", + dpif_name(dpif_), retval); + error = ERANGE; + } + + *bufp = NULL; + ofpbuf_delete(buf); + return error; +} + +static void +dpif_linux_recv_wait(struct dpif *dpif_) +{ + struct dpif_linux *dpif = dpif_linux_cast(dpif_); + poll_fd_wait(dpif->fd, 
POLLIN); +} + +const struct dpif_class dpif_linux_class = { + "", /* This is the default class. */ + "linux", + NULL, + NULL, + dpif_linux_enumerate, + dpif_linux_open, + dpif_linux_close, + dpif_linux_get_all_names, + dpif_linux_delete, + dpif_linux_get_stats, + dpif_linux_get_drop_frags, + dpif_linux_set_drop_frags, + dpif_linux_port_add, + dpif_linux_port_del, + dpif_linux_port_query_by_number, + dpif_linux_port_query_by_name, + dpif_linux_port_list, + dpif_linux_port_poll, + dpif_linux_port_poll_wait, + dpif_linux_port_group_get, + dpif_linux_port_group_set, + dpif_linux_flow_get, + dpif_linux_flow_put, + dpif_linux_flow_del, + dpif_linux_flow_flush, + dpif_linux_flow_list, + dpif_linux_execute, + dpif_linux_recv_get_mask, + dpif_linux_recv_set_mask, + dpif_linux_recv, + dpif_linux_recv_wait, +}; + +static int get_openvswitch_major(void); +static int get_major(const char *target); + +static int +do_ioctl(const struct dpif *dpif_, int cmd, const void *arg) +{ + struct dpif_linux *dpif = dpif_linux_cast(dpif_); + return ioctl(dpif->fd, cmd, arg) ? 
errno : 0; +} + +static int +lookup_minor(const char *name, int *minorp) +{ + struct ethtool_drvinfo drvinfo; + int minor, port_no; + struct ifreq ifr; + int error; + int sock; + + sock = socket(AF_INET, SOCK_DGRAM, 0); + if (sock < 0) { + VLOG_WARN("socket(AF_INET) failed: %s", strerror(errno)); + error = errno; + goto error; + } + + memset(&ifr, 0, sizeof ifr); + strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name); + ifr.ifr_data = (caddr_t) &drvinfo; + + memset(&drvinfo, 0, sizeof drvinfo); + drvinfo.cmd = ETHTOOL_GDRVINFO; + if (ioctl(sock, SIOCETHTOOL, &ifr)) { + VLOG_WARN("ioctl(SIOCETHTOOL) failed: %s", strerror(errno)); + error = errno; + goto error_close_sock; + } + + if (strcmp(drvinfo.driver, "openvswitch")) { + VLOG_WARN("%s is not an openvswitch device", name); + error = EOPNOTSUPP; + goto error_close_sock; + } + + if (sscanf(drvinfo.bus_info, "%d.%d", &minor, &port_no) != 2) { + VLOG_WARN("%s ethtool bus_info has unexpected format", name); + error = EPROTOTYPE; + goto error_close_sock; + } else if (port_no != ODPP_LOCAL) { + /* This is an Open vSwitch device but not the local port. We + * intentionally support only using the name of the local port as the + * name of a datapath; otherwise, it would be too difficult to + * enumerate all the names of a datapath. 
*/ + error = EOPNOTSUPP; + goto error_close_sock; + } + + *minorp = minor; + close(sock); + return 0; + +error_close_sock: + close(sock); +error: + return error; +} + +static int +make_openvswitch_device(int minor, char **fnp) +{ + const char dirname[] = "/dev/net"; + int major; + dev_t dev; + struct stat s; + char fn[128]; + + major = get_openvswitch_major(); + if (major < 0) { + return -major; + } + dev = makedev(major, minor); + + *fnp = NULL; + sprintf(fn, "%s/dp%d", dirname, minor); + if (!stat(fn, &s)) { + if (!S_ISCHR(s.st_mode)) { + VLOG_WARN_RL(&error_rl, "%s is not a character device, fixing", + fn); + } else if (s.st_rdev != dev) { + VLOG_WARN_RL(&error_rl, + "%s is device %u:%u but should be %u:%u, fixing", + fn, major(s.st_rdev), minor(s.st_rdev), + major(dev), minor(dev)); + } else { + goto success; + } + if (unlink(fn)) { + VLOG_WARN_RL(&error_rl, "%s: unlink failed (%s)", + fn, strerror(errno)); + return errno; + } + } else if (errno == ENOENT) { + if (stat(dirname, &s)) { + if (errno == ENOENT) { + if (mkdir(dirname, 0755)) { + VLOG_WARN_RL(&error_rl, "%s: mkdir failed (%s)", + dirname, strerror(errno)); + return errno; + } + } else { + VLOG_WARN_RL(&error_rl, "%s: stat failed (%s)", + dirname, strerror(errno)); + return errno; + } + } + } else { + VLOG_WARN_RL(&error_rl, "%s: stat failed (%s)", fn, strerror(errno)); + return errno; + } + + /* The device needs to be created. */ + if (mknod(fn, S_IFCHR | 0700, dev)) { + VLOG_WARN_RL(&error_rl, + "%s: creating character device %u:%u failed (%s)", + fn, major(dev), minor(dev), strerror(errno)); + return errno; + } + +success: + *fnp = xstrdup(fn); + return 0; +} + +/* Return the major device number of the Open vSwitch device. If it + * cannot be determined, a negative errno is returned. 
*/ +static int +get_openvswitch_major(void) +{ + static int openvswitch_major = -1; + if (openvswitch_major < 0) { + openvswitch_major = get_major("openvswitch"); + } + return openvswitch_major; +} + +static int +get_major(const char *target) +{ + const char fn[] = "/proc/devices"; + char line[128]; + FILE *file; + int ln; + + file = fopen(fn, "r"); + if (!file) { + VLOG_ERR("opening %s failed (%s)", fn, strerror(errno)); + return -errno; + } + + for (ln = 1; fgets(line, sizeof line, file); ln++) { + char name[64]; + int major; + + if (!strncmp(line, "Character", 9) || line[0] == '\0') { + /* Nothing to do. */ + } else if (!strncmp(line, "Block", 5)) { + /* We only want character devices, so skip the rest of the file. */ + break; + } else if (sscanf(line, "%d %63s", &major, name)) { + if (!strcmp(name, target)) { + fclose(file); + return major; + } + } else { + static bool warned; + if (!warned) { + VLOG_WARN("%s:%d: syntax error", fn, ln); + } + warned = true; + } + } + + VLOG_ERR("%s: %s major not found (is the module loaded?)", fn, target); + return -ENODEV; +} + +static int +finish_open(struct dpif *dpif_, const char *local_ifname) +{ + struct dpif_linux *dpif = dpif_linux_cast(dpif_); + dpif->local_ifname = strdup(local_ifname); + dpif->local_ifindex = if_nametoindex(local_ifname); + if (!dpif->local_ifindex) { + int error = errno; + dpif_close(dpif_); + VLOG_WARN("could not get ifindex of %s device: %s", + local_ifname, strerror(errno)); + return error; + } + return 0; +} + +static int +create_minor(const char *name, int minor, struct dpif **dpifp) +{ + int error = open_minor(minor, dpifp); + if (!error) { + error = do_ioctl(*dpifp, ODP_DP_CREATE, name); + if (!error) { + error = finish_open(*dpifp, name); + } else { + dpif_close(*dpifp); + } + } + return error; +} + +static int +open_minor(int minor, struct dpif **dpifp) +{ + int error; + char *fn; + int fd; + + error = make_openvswitch_device(minor, &fn); + if (error) { + return error; + } + + fd = open(fn, 
O_RDONLY | O_NONBLOCK); + if (fd >= 0) { + struct dpif_linux *dpif = xmalloc(sizeof *dpif); + error = rtnetlink_notifier_register(&dpif->port_notifier, + dpif_linux_port_changed, dpif); + if (!error) { + char *name; + + name = xasprintf("dp%d", minor); + dpif_init(&dpif->dpif, &dpif_linux_class, name, minor, minor); + free(name); + + dpif->fd = fd; + dpif->local_ifname = NULL; + dpif->minor = minor; + dpif->local_ifindex = 0; + svec_init(&dpif->changed_ports); + dpif->change_error = false; + *dpifp = &dpif->dpif; + } else { + free(dpif); + } + } else { + error = errno; + VLOG_WARN("%s: open failed (%s)", fn, strerror(error)); + } + free(fn); + + return error; +} + +static void +dpif_linux_port_changed(const struct rtnetlink_change *change, void *dpif_) +{ + struct dpif_linux *dpif = dpif_; + + if (change) { + if (change->master_ifindex == dpif->local_ifindex + && (change->nlmsg_type == RTM_NEWLINK + || change->nlmsg_type == RTM_DELLINK)) + { + /* Our datapath changed, either adding a new port or deleting an + * existing one. */ + if (!svec_contains(&dpif->changed_ports, change->ifname)) { + svec_add(&dpif->changed_ports, change->ifname); + svec_sort(&dpif->changed_ports); + } + } + } else { + dpif->change_error = true; + } +} diff --cc ofproto/in-band.c index 2b362bc0,00000000..c39e5caf mode 100644,000000..100644 --- a/ofproto/in-band.c +++ b/ofproto/in-band.c @@@ -1,652 -1,0 +1,652 @@@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include "in-band.h" +#include +#include +#include +#include +#include +#include +#include "dhcp.h" +#include "dpif.h" +#include "flow.h" +#include "mac-learning.h" +#include "netdev.h" +#include "odp-util.h" +#include "ofp-print.h" +#include "ofproto.h" +#include "ofpbuf.h" +#include "openflow/openflow.h" +#include "openvswitch/datapath-protocol.h" +#include "packets.h" +#include "poll-loop.h" +#include "rconn.h" +#include "status.h" +#include "timeval.h" +#include "vconn.h" + +#define THIS_MODULE VLM_in_band +#include "vlog.h" + +/* In-band control allows a single network to be used for OpenFlow + * traffic and other data traffic. Refer to ovs-vswitchd.conf(5) and + * secchan(8) for a description of configuring in-band control. + * + * This comment is an attempt to describe how in-band control works at a + * wire- and implementation-level. Correctly implementing in-band + * control has proven difficult due to its many subtleties, and has thus + * gone through many iterations. Please read through and understand the + * reasoning behind the chosen rules before making modifications. + * + * In Open vSwitch, in-band control is implemented as "hidden" flows (in + * that they are not visible through OpenFlow) and at a higher priority - * than wildcarded flows can be setup by the controller. This is done ++ * than wildcarded flows can be set up by the controller. This is done + * so that the controller cannot interfere with them and possibly break + * connectivity with its switches. It is possible to see all flows, + * including in-band ones, with the ovs-appctl "bridge/dump-flows" + * command. + * + * The following rules are always enabled with the "normal" action by a + * switch with in-band control: + * + * a. DHCP requests sent from the local port. + * b. ARP replies to the local port's MAC address. + * c. 
ARP requests from the local port's MAC address. + * d. ARP replies to the remote side's MAC address. Note that the + * remote side is either the controller or the gateway to reach + * the controller. + * e. ARP requests from the remote side's MAC address. Note that + * like (d), the MAC is either for the controller or gateway. + * f. ARP replies containing the controller's IP address as a target. + * g. ARP requests containing the controller's IP address as a source. + * h. OpenFlow (6633/tcp) traffic to the controller's IP. + * i. OpenFlow (6633/tcp) traffic from the controller's IP. + * + * The goal of these rules is to be as narrow as possible to allow a + * switch to join a network and be able to communicate with a + * controller. As mentioned earlier, these rules have higher priority + * than the controller's rules, so if they are too broad, they may + * prevent the controller from implementing its policy. As such, + * in-band actively monitors some aspects of flow and packet processing + * so that the rules can be made more precise. + * + * In-band control monitors attempts to add flows into the datapath that + * could interfere with its duties. The datapath only allows exact + * match entries, so in-band control is able to be very precise about + * the flows it prevents. Flows that miss in the datapath are sent to + * userspace to be processed, so preventing these flows from being + * cached in the "fast path" does not affect correctness. The only type + * of flow that is currently prevented is one that would prevent DHCP + * replies from being seen by the local port. For example, a rule that + * forwarded all DHCP traffic to the controller would not be allowed, + * but one that forwarded to all ports (including the local port) would. + * + * As mentioned earlier, packets that miss in the datapath are sent to + * the userspace for processing. 
The userspace has its own flow table, + * the "classifier", so in-band checks whether any special processing + * is needed before the classifier is consulted. If a packet is a DHCP + * response to a request from the local port, the packet is forwarded to + * the local port, regardless of the flow table. Note that this requires + * L7 processing of DHCP replies to determine whether the 'chaddr' field + * matches the MAC address of the local port. + * + * It is interesting to note that for an L3-based in-band control + * mechanism, the majority of rules are devoted to ARP traffic. At first + * glance, some of these rules appear redundant. However, each serves an + * important role. First, in order to determine the MAC address of the + * remote side (controller or gateway) for other ARP rules, we must allow + * ARP traffic for our local port with rules (b) and (c). If we are + * between a switch and its connection to the controller, we have to + * allow the other switch's ARP traffic through. This is done with + * rules (d) and (e), since we do not know the addresses of the other + * switches a priori, but do know the controller's or gateway's. Finally, + * if the controller is running in a local guest VM that is not reached + * through the local port, the switch that is connected to the VM must + * allow ARP traffic based on the controller's IP address, since it will + * not know the MAC address of the local port that is sending the traffic + * or the MAC address of the controller in the guest VM. + * + * With a few notable exceptions below, in-band should work in most + * network setups. The following are considered "supported" in the + * current implementation: + * + * - Locally Connected. The switch and controller are on the same + * subnet. This uses rules (a), (b), (c), (h), and (i). + * + * - Reached through Gateway. The switch and controller are on + * different subnets and must go through a gateway. This uses + * rules (a), (b), (c), (h), and (i). 
+ * + * - Between Switch and Controller. This switch is between another + * switch and the controller, and we want to allow the other + * switch's traffic through. This uses rules (d), (e), (h), and + * (i). It uses (b) and (c) indirectly in order to know the MAC + * address for rules (d) and (e). Note that DHCP for the other + * switch will not work unless the controller explicitly lets this + * switch pass the traffic. + * + * - Between Switch and Gateway. This switch is between another + * switch and the gateway, and we want to allow the other switch's + * traffic through. This uses the same rules and logic as the + * "Between Switch and Controller" configuration described earlier. + * + * - Controller on Local VM. The controller is a guest VM on the + * system running in-band control. This uses rules (a), (b), (c), + * (h), and (i). + * + * - Controller on Local VM with Different Networks. The controller + * is a guest VM on the system running in-band control, but the + * local port is not used to connect to the controller. For + * example, an IP address is configured on eth0 of the switch. The + * controller's VM is connected through eth1 of the switch, but an + * IP address has not been configured for that port on the switch. + * As such, the switch will use eth0 to connect to the controller, + * and eth1's rules about the local port will not work. In the + * example, the switch attached to eth0 would use rules (a), (b), + * (c), (h), and (i) on eth0. The switch attached to eth1 would use + * rules (f), (g), (h), and (i). + * + * The following are explicitly *not* supported by in-band control: + * + * - Specify Controller by Name. Currently, the controller must be + * identified by IP address. A naive approach would be to permit + * all DNS traffic. Unfortunately, this would prevent the + * controller from defining any policy over DNS. 
Since switches + * that are located behind us need to connect to the controller, + * in-band cannot simply add a rule that allows DNS traffic from + * the local port. The "correct" way to support this is to parse + * DNS requests to allow all traffic related to a request for the + * controller's name through. Due to the potential security + * problems and amount of processing, we decided to hold off for + * the time-being. + * + * - Multiple Controllers. There is nothing intrinsic in the high- + * level design that prevents using multiple (known) controllers, + * however, the current implementation's data structures assume + * only one. + * + * - Differing Controllers for Switches. All switches must know + * the L3 addresses for all the controllers that other switches - * may use, since rules need to be setup to allow traffic related ++ * may use, since rules need to be set up to allow traffic related + * to those controllers through. See rules (f), (g), (h), and (i). + * + * - Differing Routes for Switches. In order for the switch to + * allow other switches to connect to a controller through a + * gateway, it allows the gateway's traffic through with rules (d) + * and (e). If the routes to the controller differ for the two + * switches, we will not know the MAC address of the alternate + * gateway. + */ + +#define IB_BASE_PRIORITY 18181800 + +enum { + IBR_FROM_LOCAL_DHCP, /* (a) From local port, DHCP. */ + IBR_TO_LOCAL_ARP, /* (b) To local port, ARP. */ + IBR_FROM_LOCAL_ARP, /* (c) From local port, ARP. */ + IBR_TO_REMOTE_ARP, /* (d) To remote MAC, ARP. */ + IBR_FROM_REMOTE_ARP, /* (e) From remote MAC, ARP. */ + IBR_TO_CTL_ARP, /* (f) To controller IP, ARP. */ + IBR_FROM_CTL_ARP, /* (g) From controller IP, ARP. */ + IBR_TO_CTL_OFP, /* (h) To controller, OpenFlow port. */ + IBR_FROM_CTL_OFP, /* (i) From controller, OpenFlow port. */ +#if OFP_TCP_PORT != OFP_SSL_PORT +#error Need to support separate TCP and SSL flows. 
+#endif + N_IB_RULES +}; + +struct ib_rule { + bool installed; + flow_t flow; + uint32_t wildcards; + unsigned int priority; +}; + +struct in_band { + struct ofproto *ofproto; + struct rconn *controller; + struct status_category *ss_cat; + + /* Keep track of local port's information. */ + uint8_t local_mac[ETH_ADDR_LEN]; /* Current MAC. */ + struct netdev *local_netdev; /* Local port's network device. */ + time_t next_local_refresh; + + /* Keep track of controller and next hop's information. */ + uint32_t controller_ip; /* Controller IP, 0 if unknown. */ + uint8_t remote_mac[ETH_ADDR_LEN]; /* Remote MAC. */ + struct netdev *remote_netdev; + uint8_t last_remote_mac[ETH_ADDR_LEN]; /* Previous remote MAC. */ + time_t next_remote_refresh; + + /* Rules that we set up. */ + struct ib_rule rules[N_IB_RULES]; +}; + +static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(60, 60); + +static const uint8_t * +get_remote_mac(struct in_band *ib) +{ + int retval; + bool have_mac; + struct in_addr c_in4; /* Controller's IP address. */ + struct in_addr r_in4; /* Next hop IP address. */ + char *next_hop_dev; + time_t now = time_now(); + + if (now >= ib->next_remote_refresh) { + /* Find the next-hop IP address. */ + c_in4.s_addr = ib->controller_ip; + memset(ib->remote_mac, 0, sizeof ib->remote_mac); + retval = netdev_get_next_hop(ib->local_netdev, + &c_in4, &r_in4, &next_hop_dev); + if (retval) { + VLOG_WARN("cannot find route for controller ("IP_FMT"): %s", + IP_ARGS(&ib->controller_ip), strerror(retval)); + ib->next_remote_refresh = now + 1; + return NULL; + } + if (!r_in4.s_addr) { + r_in4.s_addr = c_in4.s_addr; + } + + /* Get the next-hop IP and network device. 
*/ + if (!ib->remote_netdev + || strcmp(netdev_get_name(ib->remote_netdev), next_hop_dev)) + { + netdev_close(ib->remote_netdev); + retval = netdev_open(next_hop_dev, NETDEV_ETH_TYPE_NONE, + &ib->remote_netdev); + if (retval) { + VLOG_WARN_RL(&rl, "cannot open netdev %s (next hop " + "to controller "IP_FMT"): %s", + next_hop_dev, IP_ARGS(&ib->controller_ip), + strerror(retval)); + ib->next_remote_refresh = now + 1; + return NULL; + } + } + + /* Look up the MAC address of the next-hop IP address. */ + retval = netdev_arp_lookup(ib->remote_netdev, r_in4.s_addr, + ib->remote_mac); + if (retval) { + VLOG_DBG_RL(&rl, "cannot look up remote MAC address ("IP_FMT"): %s", + IP_ARGS(&r_in4.s_addr), strerror(retval)); + } + have_mac = !eth_addr_is_zero(ib->remote_mac); + free(next_hop_dev); + if (have_mac + && !eth_addr_equals(ib->last_remote_mac, ib->remote_mac)) { + VLOG_DBG("remote MAC address changed from "ETH_ADDR_FMT" to " + ETH_ADDR_FMT, + ETH_ADDR_ARGS(ib->last_remote_mac), + ETH_ADDR_ARGS(ib->remote_mac)); + memcpy(ib->last_remote_mac, ib->remote_mac, ETH_ADDR_LEN); + } + + /* Schedule next refresh. + * + * If we have an IP address but not a MAC address, then refresh + * quickly, since we probably will get a MAC address soon (via ARP). + * Otherwise, we can afford to wait a little while. */ + ib->next_remote_refresh + = now + (!ib->controller_ip || have_mac ? 10 : 1); + } + + return !eth_addr_is_zero(ib->remote_mac) ? ib->remote_mac : NULL; +} + +static const uint8_t * +get_local_mac(struct in_band *ib) +{ + time_t now = time_now(); + if (now >= ib->next_local_refresh) { + uint8_t ea[ETH_ADDR_LEN]; + if (ib->local_netdev && !netdev_get_etheraddr(ib->local_netdev, ea)) { + memcpy(ib->local_mac, ea, ETH_ADDR_LEN); + } + ib->next_local_refresh = now + 1; + } + return !eth_addr_is_zero(ib->local_mac) ? 
ib->local_mac : NULL; +} + +static void +in_band_status_cb(struct status_reply *sr, void *in_band_) +{ + struct in_band *in_band = in_band_; + + if (!eth_addr_is_zero(in_band->local_mac)) { + status_reply_put(sr, "local-mac="ETH_ADDR_FMT, + ETH_ADDR_ARGS(in_band->local_mac)); + } + + if (!eth_addr_is_zero(in_band->remote_mac)) { + status_reply_put(sr, "remote-mac="ETH_ADDR_FMT, + ETH_ADDR_ARGS(in_band->remote_mac)); + } +} + +static void +drop_flow(struct in_band *in_band, int rule_idx) +{ + struct ib_rule *rule = &in_band->rules[rule_idx]; + + if (rule->installed) { + rule->installed = false; + ofproto_delete_flow(in_band->ofproto, &rule->flow, rule->wildcards, + rule->priority); + } +} + +/* out_port and fixed_fields are assumed never to change. */ +static void - setup_flow(struct in_band *in_band, int rule_idx, const flow_t *flow, - uint32_t fixed_fields, uint16_t out_port) ++set_up_flow(struct in_band *in_band, int rule_idx, const flow_t *flow, ++ uint32_t fixed_fields, uint16_t out_port) +{ + struct ib_rule *rule = &in_band->rules[rule_idx]; + + if (!rule->installed || memcmp(flow, &rule->flow, sizeof *flow)) { + union ofp_action action; + + drop_flow(in_band, rule_idx); + + rule->installed = true; + rule->flow = *flow; + rule->wildcards = OFPFW_ALL & ~fixed_fields; + rule->priority = IB_BASE_PRIORITY + (N_IB_RULES - rule_idx); + + action.type = htons(OFPAT_OUTPUT); + action.output.len = htons(sizeof action); + action.output.port = htons(out_port); + action.output.max_len = htons(0); + ofproto_add_flow(in_band->ofproto, &rule->flow, rule->wildcards, + rule->priority, &action, 1, 0); + } +} + +/* Returns true if 'packet' should be sent to the local port regardless + * of the flow table. */ +bool +in_band_msg_in_hook(struct in_band *in_band, const flow_t *flow, + const struct ofpbuf *packet) +{ + if (!in_band) { + return false; + } + + /* Regardless of how the flow table is configured, we want to be + * able to see replies to our DHCP requests. 
*/ + if (flow->dl_type == htons(ETH_TYPE_IP) + && flow->nw_proto == IP_TYPE_UDP + && flow->tp_src == htons(DHCP_SERVER_PORT) + && flow->tp_dst == htons(DHCP_CLIENT_PORT) + && packet->l7) { + struct dhcp_header *dhcp; + const uint8_t *local_mac; + + dhcp = ofpbuf_at(packet, (char *)packet->l7 - (char *)packet->data, + sizeof *dhcp); + if (!dhcp) { + return false; + } + + local_mac = get_local_mac(in_band); + if (eth_addr_equals(dhcp->chaddr, local_mac)) { + return true; + } + } + + return false; +} + +/* Returns true if the rule that would match 'flow' with 'actions' is + * allowed to be set up in the datapath. */ +bool +in_band_rule_check(struct in_band *in_band, const flow_t *flow, + const struct odp_actions *actions) +{ + if (!in_band) { + return true; + } + + /* Don't allow flows that would prevent DHCP replies from being seen + * by the local port. */ + if (flow->dl_type == htons(ETH_TYPE_IP) + && flow->nw_proto == IP_TYPE_UDP + && flow->tp_src == htons(DHCP_SERVER_PORT) + && flow->tp_dst == htons(DHCP_CLIENT_PORT)) { + int i; + + for (i=0; i < actions->n_actions; i++) { + if (actions->actions[i].output.type == ODPAT_OUTPUT + && actions->actions[i].output.port == ODPP_LOCAL) { + return true; + } + } + return false; + } + + return true; +} + +void +in_band_run(struct in_band *in_band) +{ + time_t now = time_now(); + uint32_t controller_ip; + const uint8_t *remote_mac; + const uint8_t *local_mac; + flow_t flow; + + if (now < in_band->next_remote_refresh + && now < in_band->next_local_refresh) { + return; + } + + controller_ip = rconn_get_remote_ip(in_band->controller); + if (in_band->controller_ip && controller_ip != in_band->controller_ip) { + VLOG_DBG("controller IP address changed from "IP_FMT" to "IP_FMT, + IP_ARGS(&in_band->controller_ip), + IP_ARGS(&controller_ip)); + } + in_band->controller_ip = controller_ip; + + remote_mac = get_remote_mac(in_band); + local_mac = get_local_mac(in_band); + + if (local_mac) { + /* Allow DHCP requests to be sent from the local port.
*/ + memset(&flow, 0, sizeof flow); + flow.in_port = ODPP_LOCAL; + flow.dl_type = htons(ETH_TYPE_IP); + memcpy(flow.dl_src, local_mac, ETH_ADDR_LEN); + flow.nw_proto = IP_TYPE_UDP; + flow.tp_src = htons(DHCP_CLIENT_PORT); + flow.tp_dst = htons(DHCP_SERVER_PORT); - setup_flow(in_band, IBR_FROM_LOCAL_DHCP, &flow, - (OFPFW_IN_PORT | OFPFW_DL_TYPE | OFPFW_DL_SRC - | OFPFW_NW_PROTO | OFPFW_TP_SRC | OFPFW_TP_DST), - OFPP_NORMAL); ++ set_up_flow(in_band, IBR_FROM_LOCAL_DHCP, &flow, ++ (OFPFW_IN_PORT | OFPFW_DL_TYPE | OFPFW_DL_SRC ++ | OFPFW_NW_PROTO | OFPFW_TP_SRC | OFPFW_TP_DST), ++ OFPP_NORMAL); + + /* Allow the connection's interface to receive directed ARP traffic. */ + memset(&flow, 0, sizeof flow); + flow.dl_type = htons(ETH_TYPE_ARP); + memcpy(flow.dl_dst, local_mac, ETH_ADDR_LEN); + flow.nw_proto = ARP_OP_REPLY; - setup_flow(in_band, IBR_TO_LOCAL_ARP, &flow, - (OFPFW_DL_TYPE | OFPFW_DL_DST | OFPFW_NW_PROTO), - OFPP_NORMAL); ++ set_up_flow(in_band, IBR_TO_LOCAL_ARP, &flow, ++ (OFPFW_DL_TYPE | OFPFW_DL_DST | OFPFW_NW_PROTO), ++ OFPP_NORMAL); + + /* Allow the connection's interface to be the source of ARP traffic. */ + memset(&flow, 0, sizeof flow); + flow.dl_type = htons(ETH_TYPE_ARP); + memcpy(flow.dl_src, local_mac, ETH_ADDR_LEN); + flow.nw_proto = ARP_OP_REQUEST; - setup_flow(in_band, IBR_FROM_LOCAL_ARP, &flow, - (OFPFW_DL_TYPE | OFPFW_DL_SRC | OFPFW_NW_PROTO), - OFPP_NORMAL); ++ set_up_flow(in_band, IBR_FROM_LOCAL_ARP, &flow, ++ (OFPFW_DL_TYPE | OFPFW_DL_SRC | OFPFW_NW_PROTO), ++ OFPP_NORMAL); + } else { + drop_flow(in_band, IBR_TO_LOCAL_ARP); + drop_flow(in_band, IBR_FROM_LOCAL_ARP); + } + + if (remote_mac) { + /* Allow ARP replies to the remote side's MAC. 
*/ + memset(&flow, 0, sizeof flow); + flow.dl_type = htons(ETH_TYPE_ARP); + memcpy(flow.dl_dst, remote_mac, ETH_ADDR_LEN); + flow.nw_proto = ARP_OP_REPLY; - setup_flow(in_band, IBR_TO_REMOTE_ARP, &flow, - (OFPFW_DL_TYPE | OFPFW_DL_DST | OFPFW_NW_PROTO), - OFPP_NORMAL); ++ set_up_flow(in_band, IBR_TO_REMOTE_ARP, &flow, ++ (OFPFW_DL_TYPE | OFPFW_DL_DST | OFPFW_NW_PROTO), ++ OFPP_NORMAL); + + /* Allow ARP requests from the remote side's MAC. */ + memset(&flow, 0, sizeof flow); + flow.dl_type = htons(ETH_TYPE_ARP); + memcpy(flow.dl_src, remote_mac, ETH_ADDR_LEN); + flow.nw_proto = ARP_OP_REQUEST; - setup_flow(in_band, IBR_FROM_REMOTE_ARP, &flow, - (OFPFW_DL_TYPE | OFPFW_DL_SRC | OFPFW_NW_PROTO), - OFPP_NORMAL); ++ set_up_flow(in_band, IBR_FROM_REMOTE_ARP, &flow, ++ (OFPFW_DL_TYPE | OFPFW_DL_SRC | OFPFW_NW_PROTO), ++ OFPP_NORMAL); + } else { + drop_flow(in_band, IBR_TO_REMOTE_ARP); + drop_flow(in_band, IBR_FROM_REMOTE_ARP); + } + + if (controller_ip) { + /* Allow ARP replies to the controller's IP. */ + memset(&flow, 0, sizeof flow); + flow.dl_type = htons(ETH_TYPE_ARP); + flow.nw_proto = ARP_OP_REPLY; + flow.nw_dst = controller_ip; - setup_flow(in_band, IBR_TO_CTL_ARP, &flow, - (OFPFW_DL_TYPE | OFPFW_NW_PROTO | OFPFW_NW_DST_MASK), - OFPP_NORMAL); ++ set_up_flow(in_band, IBR_TO_CTL_ARP, &flow, ++ (OFPFW_DL_TYPE | OFPFW_NW_PROTO | OFPFW_NW_DST_MASK), ++ OFPP_NORMAL); + + /* Allow ARP requests from the controller's IP. */ + memset(&flow, 0, sizeof flow); + flow.dl_type = htons(ETH_TYPE_ARP); + flow.nw_proto = ARP_OP_REQUEST; + flow.nw_src = controller_ip; - setup_flow(in_band, IBR_FROM_CTL_ARP, &flow, - (OFPFW_DL_TYPE | OFPFW_NW_PROTO | OFPFW_NW_SRC_MASK), - OFPP_NORMAL); ++ set_up_flow(in_band, IBR_FROM_CTL_ARP, &flow, ++ (OFPFW_DL_TYPE | OFPFW_NW_PROTO | OFPFW_NW_SRC_MASK), ++ OFPP_NORMAL); + + /* OpenFlow traffic to or from the controller. 
+ * + * (A given field's value is completely ignored if it is wildcarded, + * which is why we can get away with using a single 'flow' in each + * case here.) */ + memset(&flow, 0, sizeof flow); + flow.dl_type = htons(ETH_TYPE_IP); + flow.nw_proto = IP_TYPE_TCP; + flow.nw_src = controller_ip; + flow.nw_dst = controller_ip; + flow.tp_src = htons(OFP_TCP_PORT); + flow.tp_dst = htons(OFP_TCP_PORT); - setup_flow(in_band, IBR_TO_CTL_OFP, &flow, - (OFPFW_DL_TYPE | OFPFW_NW_PROTO | OFPFW_NW_DST_MASK - | OFPFW_TP_DST), OFPP_NORMAL); - setup_flow(in_band, IBR_FROM_CTL_OFP, &flow, - (OFPFW_DL_TYPE | OFPFW_NW_PROTO | OFPFW_NW_SRC_MASK - | OFPFW_TP_SRC), OFPP_NORMAL); ++ set_up_flow(in_band, IBR_TO_CTL_OFP, &flow, ++ (OFPFW_DL_TYPE | OFPFW_NW_PROTO | OFPFW_NW_DST_MASK ++ | OFPFW_TP_DST), OFPP_NORMAL); ++ set_up_flow(in_band, IBR_FROM_CTL_OFP, &flow, ++ (OFPFW_DL_TYPE | OFPFW_NW_PROTO | OFPFW_NW_SRC_MASK ++ | OFPFW_TP_SRC), OFPP_NORMAL); + } else { + drop_flow(in_band, IBR_TO_CTL_ARP); + drop_flow(in_band, IBR_FROM_CTL_ARP); + drop_flow(in_band, IBR_TO_CTL_OFP); + drop_flow(in_band, IBR_FROM_CTL_OFP); + } +} + +void +in_band_wait(struct in_band *in_band) +{ + time_t now = time_now(); + time_t wakeup + = MIN(in_band->next_remote_refresh, in_band->next_local_refresh); + if (wakeup > now) { + poll_timer_wait((wakeup - now) * 1000); + } else { + poll_immediate_wake(); + } +} + +void +in_band_flushed(struct in_band *in_band) +{ + int i; + + for (i = 0; i < N_IB_RULES; i++) { + in_band->rules[i].installed = false; + } +} + +int +in_band_create(struct ofproto *ofproto, struct dpif *dpif, + struct switch_status *ss, struct rconn *controller, + struct in_band **in_bandp) +{ + struct in_band *in_band; + char local_name[IF_NAMESIZE]; + struct netdev *local_netdev; + int error; + + error = dpif_port_get_name(dpif, ODPP_LOCAL, + local_name, sizeof local_name); + if (error) { + VLOG_ERR("failed to initialize in-band control: cannot get name " + "of datapath local port (%s)", strerror(error)); 
+ return error; + } + + error = netdev_open(local_name, NETDEV_ETH_TYPE_NONE, &local_netdev); + if (error) { + VLOG_ERR("failed to initialize in-band control: cannot open " + "datapath local port %s (%s)", local_name, strerror(error)); + return error; + } + + in_band = xcalloc(1, sizeof *in_band); + in_band->ofproto = ofproto; + in_band->controller = controller; + in_band->ss_cat = switch_status_register(ss, "in-band", + in_band_status_cb, in_band); + in_band->local_netdev = local_netdev; + in_band->next_local_refresh = TIME_MIN; + in_band->remote_netdev = NULL; + in_band->next_remote_refresh = TIME_MIN; + + *in_bandp = in_band; + + return 0; +} + +void +in_band_destroy(struct in_band *in_band) +{ + if (in_band) { + switch_status_unregister(in_band->ss_cat); + netdev_close(in_band->local_netdev); + netdev_close(in_band->remote_netdev); + /* We don't own the rconn. */ + } +} + diff --cc ofproto/netflow.c index 7912b4b8,00000000..0505cd33 mode 100644,000000..100644 --- a/ofproto/netflow.c +++ b/ofproto/netflow.c @@@ -1,326 -1,0 +1,395 @@@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include "netflow.h" +#include +#include +#include +#include +#include "cfg.h" +#include "flow.h" +#include "netflow.h" +#include "ofpbuf.h" +#include "ofproto.h" +#include "packets.h" +#include "socket-util.h" +#include "svec.h" +#include "timeval.h" +#include "util.h" +#include "xtoxll.h" + +#define THIS_MODULE VLM_netflow +#include "vlog.h" + +#define NETFLOW_V5_VERSION 5 + ++static const int ACTIVE_TIMEOUT_DEFAULT = 600; ++ +/* Every NetFlow v5 message contains the header that follows. This is + * followed by up to thirty records that describe a terminating flow. + * We only send a single record per NetFlow message. + */ +struct netflow_v5_header { + uint16_t version; /* NetFlow version is 5. */ + uint16_t count; /* Number of records in this message. */ + uint32_t sysuptime; /* System uptime in milliseconds. */ + uint32_t unix_secs; /* Number of seconds since Unix epoch. */ + uint32_t unix_nsecs; /* Number of residual nanoseconds + after epoch seconds. */ + uint32_t flow_seq; /* Number of flows since sending + messages began. */ + uint8_t engine_type; /* Engine type. */ + uint8_t engine_id; /* Engine id. */ + uint16_t sampling_interval; /* Set to zero. */ +}; +BUILD_ASSERT_DECL(sizeof(struct netflow_v5_header) == 24); + +/* A NetFlow v5 description of a terminating flow. It is preceded by a + * NetFlow v5 header. + */ +struct netflow_v5_record { + uint32_t src_addr; /* Source IP address. */ + uint32_t dst_addr; /* Destination IP address. */ + uint32_t nexthop; /* IP address of next hop. Set to 0. */ + uint16_t input; /* Input interface index. */ + uint16_t output; /* Output interface index. */ + uint32_t packet_count; /* Number of packets. */ + uint32_t byte_count; /* Number of bytes. */ + uint32_t init_time; /* Value of sysuptime on first packet. */ + uint32_t used_time; /* Value of sysuptime on last packet. */ + + /* The 'src_port' and 'dst_port' identify the source and destination + * port, respectively, for TCP and UDP. 
For ICMP, the high-order + * byte identifies the type and low-order byte identifies the code + * in the 'dst_port' field. */ + uint16_t src_port; + uint16_t dst_port; + + uint8_t pad1; + uint8_t tcp_flags; /* Union of seen TCP flags. */ + uint8_t ip_proto; /* IP protocol. */ + uint8_t ip_tos; /* IP TOS value. */ + uint16_t src_as; /* Source AS ID. Set to 0. */ + uint16_t dst_as; /* Destination AS ID. Set to 0. */ + uint8_t src_mask; /* Source mask bits. Set to 0. */ + uint8_t dst_mask; /* Destination mask bits. Set to 0. */ + uint8_t pad[2]; +}; +BUILD_ASSERT_DECL(sizeof(struct netflow_v5_record) == 48); + +struct netflow { + uint8_t engine_type; /* Value of engine_type to use. */ + uint8_t engine_id; /* Value of engine_id to use. */ + long long int boot_time; /* Time when netflow_create() was called. */ + int *fds; /* Sockets for NetFlow collectors. */ + size_t n_fds; /* Number of NetFlow collectors. */ + bool add_id_to_iface; /* Put the 7 least significant bits of + * 'engine_id' into the most significant + * bits of the interface fields. */ + uint32_t netflow_cnt; /* Flow sequence number for NetFlow. */ + struct ofpbuf packet; /* NetFlow packet being accumulated. */ ++ long long int active_timeout; /* Timeout for flows that are still active. */ ++ long long int reconfig_time; /* When we reconfigured the timeouts. */ +}; + +static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); + +static int +open_collector(char *dst) +{ + char *save_ptr = NULL; + const char *host_name; + const char *port_string; + struct sockaddr_in sin; + int retval; + int fd; + + /* Glibc 2.7 has a bug in strtok_r when compiling with optimization that + * can cause segfaults here: + * http://sources.redhat.com/bugzilla/show_bug.cgi?id=5614. + * Using "::" instead of the obvious ":" works around it.
*/ + host_name = strtok_r(dst, ":", &save_ptr); + port_string = strtok_r(NULL, ":", &save_ptr); + if (!host_name) { + ovs_error(0, "%s: bad peer name format", dst); + return -EAFNOSUPPORT; + } + if (!port_string) { + ovs_error(0, "%s: bad port format", dst); + return -EAFNOSUPPORT; + } + + memset(&sin, 0, sizeof sin); + sin.sin_family = AF_INET; + if (lookup_ip(host_name, &sin.sin_addr)) { + return -ENOENT; + } + sin.sin_port = htons(atoi(port_string)); + + fd = socket(AF_INET, SOCK_DGRAM, 0); + if (fd < 0) { + VLOG_ERR("%s: socket: %s", dst, strerror(errno)); + return -errno; + } + + retval = set_nonblocking(fd); + if (retval) { + close(fd); + return -retval; + } + + retval = connect(fd, (struct sockaddr *) &sin, sizeof sin); + if (retval < 0) { + int error = errno; + VLOG_ERR("%s: connect: %s", dst, strerror(error)); + close(fd); + return -error; + } + + return fd; +} + +void - netflow_expire(struct netflow *nf, const struct ofexpired *expired) ++netflow_expire(struct netflow *nf, struct netflow_flow *nf_flow, ++ struct ofexpired *expired) +{ + struct netflow_v5_header *nf_hdr; + struct netflow_v5_record *nf_rec; + struct timeval now; + - /* NetFlow only reports on IP packets. */ - if (expired->flow.dl_type != htons(ETH_TYPE_IP)) { ++ nf_flow->last_expired += nf->active_timeout; ++ ++ /* NetFlow only reports on IP packets and we should only report flows ++ * that actually have traffic. 
*/ ++ if (expired->flow.dl_type != htons(ETH_TYPE_IP) || ++ expired->packet_count - nf_flow->packet_count_off == 0) { + return; + } + + time_timeval(&now); + + if (!nf->packet.size) { + nf_hdr = ofpbuf_put_zeros(&nf->packet, sizeof *nf_hdr); + nf_hdr->version = htons(NETFLOW_V5_VERSION); + nf_hdr->count = htons(0); + nf_hdr->sysuptime = htonl(time_msec() - nf->boot_time); + nf_hdr->unix_secs = htonl(now.tv_sec); + nf_hdr->unix_nsecs = htonl(now.tv_usec * 1000); + nf_hdr->flow_seq = htonl(nf->netflow_cnt++); + nf_hdr->engine_type = nf->engine_type; + nf_hdr->engine_id = nf->engine_id; + nf_hdr->sampling_interval = htons(0); + } + + nf_hdr = nf->packet.data; + nf_hdr->count = htons(ntohs(nf_hdr->count) + 1); + + nf_rec = ofpbuf_put_zeros(&nf->packet, sizeof *nf_rec); + nf_rec->src_addr = expired->flow.nw_src; + nf_rec->dst_addr = expired->flow.nw_dst; + nf_rec->nexthop = htons(0); + if (nf->add_id_to_iface) { + uint16_t iface = (nf->engine_id & 0x7f) << 9; + nf_rec->input = htons(iface | (expired->flow.in_port & 0x1ff)); - nf_rec->output = htons(iface); ++ nf_rec->output = htons(iface | (nf_flow->output_iface & 0x1ff)); + } else { + nf_rec->input = htons(expired->flow.in_port); - nf_rec->output = htons(0); ++ nf_rec->output = htons(nf_flow->output_iface); + } - nf_rec->packet_count = htonl(MIN(expired->packet_count, UINT32_MAX)); - nf_rec->byte_count = htonl(MIN(expired->byte_count, UINT32_MAX)); - nf_rec->init_time = htonl(expired->created - nf->boot_time); - nf_rec->used_time = htonl(MAX(expired->created, expired->used) ++ nf_rec->packet_count = htonl(MIN(expired->packet_count - ++ nf_flow->packet_count_off, UINT32_MAX)); ++ nf_rec->byte_count = htonl(MIN(expired->byte_count - ++ nf_flow->byte_count_off, UINT32_MAX)); ++ nf_rec->init_time = htonl(nf_flow->created - nf->boot_time); ++ nf_rec->used_time = htonl(MAX(nf_flow->created, expired->used) + - nf->boot_time); + if (expired->flow.nw_proto == IP_TYPE_ICMP) { + /* In NetFlow, the ICMP type and code are 
concatenated and + * placed in the 'dst_port' field. */ + uint8_t type = ntohs(expired->flow.tp_src); + uint8_t code = ntohs(expired->flow.tp_dst); + nf_rec->src_port = htons(0); + nf_rec->dst_port = htons((type << 8) | code); + } else { + nf_rec->src_port = expired->flow.tp_src; + nf_rec->dst_port = expired->flow.tp_dst; + } - nf_rec->tcp_flags = expired->tcp_flags; ++ nf_rec->tcp_flags = nf_flow->tcp_flags; + nf_rec->ip_proto = expired->flow.nw_proto; - nf_rec->ip_tos = expired->ip_tos; ++ nf_rec->ip_tos = nf_flow->ip_tos; ++ ++ /* Update flow tracking data. */ ++ nf_flow->created = 0; ++ nf_flow->packet_count_off = expired->packet_count; ++ nf_flow->byte_count_off = expired->byte_count; ++ nf_flow->tcp_flags = 0; + + /* NetFlow messages are limited to 30 records. */ + if (ntohs(nf_hdr->count) >= 30) { + netflow_run(nf); + } +} + +void +netflow_run(struct netflow *nf) +{ + size_t i; + + if (!nf->packet.size) { + return; + } + + for (i = 0; i < nf->n_fds; i++) { + if (send(nf->fds[i], nf->packet.data, nf->packet.size, 0) == -1) { + VLOG_WARN_RL(&rl, "netflow message send failed: %s", + strerror(errno)); + } + } + nf->packet.size = 0; +} + +static void +clear_collectors(struct netflow *nf) +{ + size_t i; + + for (i = 0; i < nf->n_fds; i++) { + close(nf->fds[i]); + } + free(nf->fds); + nf->fds = NULL; + nf->n_fds = 0; +} + +int - netflow_set_collectors(struct netflow *nf, const struct svec *collectors_) ++netflow_set_options(struct netflow *nf, ++ const struct netflow_options *nf_options) +{ + struct svec collectors; + int error = 0; + size_t i; ++ long long int old_timeout; ++ ++ nf->engine_type = nf_options->engine_type; ++ nf->engine_id = nf_options->engine_id; ++ nf->add_id_to_iface = nf_options->add_id_to_iface; + + clear_collectors(nf); + - svec_clone(&collectors, collectors_); ++ svec_clone(&collectors, &nf_options->collectors); + svec_sort_unique(&collectors); + + nf->fds = xmalloc(sizeof *nf->fds * collectors.n); + for (i = 0; i < collectors.n; i++) { + 
const char *name = collectors.names[i]; + char *tmpname = xstrdup(name); + int fd = open_collector(tmpname); + free(tmpname); + if (fd >= 0) { + nf->fds[nf->n_fds++] = fd; + } else { + VLOG_WARN("couldn't open connection to collector (%s), " + "ignoring %s\n", strerror(-fd), name); + if (!error) { + error = -fd; + } + } + } + + svec_destroy(&collectors); - return error; - } + - void - netflow_set_engine(struct netflow *nf, uint8_t engine_type, - uint8_t engine_id, bool add_id_to_iface) - { - nf->engine_type = engine_type; - nf->engine_id = engine_id; - nf->add_id_to_iface = add_id_to_iface; ++ old_timeout = nf->active_timeout; ++ if (nf_options->active_timeout != -1) { ++ nf->active_timeout = nf_options->active_timeout; ++ } else { ++ nf->active_timeout = ACTIVE_TIMEOUT_DEFAULT; ++ } ++ nf->active_timeout *= 1000; ++ if (old_timeout != nf->active_timeout) { ++ nf->reconfig_time = time_msec(); ++ } ++ ++ return error; +} + +struct netflow * +netflow_create(void) +{ + struct netflow *nf = xmalloc(sizeof *nf); + nf->engine_type = 0; + nf->engine_id = 0; + nf->boot_time = time_msec(); + nf->fds = NULL; + nf->n_fds = 0; + nf->add_id_to_iface = false; + nf->netflow_cnt = 0; + ofpbuf_init(&nf->packet, 1500); + return nf; +} + +void +netflow_destroy(struct netflow *nf) +{ + if (nf) { + ofpbuf_uninit(&nf->packet); + clear_collectors(nf); + free(nf); + } +} ++ ++void ++netflow_flow_clear(struct netflow_flow *nf_flow) ++{ ++ uint16_t output_iface = nf_flow->output_iface; ++ ++ memset(nf_flow, 0, sizeof *nf_flow); ++ nf_flow->output_iface = output_iface; ++} ++ ++void ++netflow_flow_update_time(struct netflow *nf, struct netflow_flow *nf_flow, ++ long long int used) ++{ ++ if (!nf_flow->created) { ++ nf_flow->created = used; ++ } ++ ++ if (!nf || !nf->active_timeout || !nf_flow->last_expired || ++ nf->reconfig_time > nf_flow->last_expired) { ++ /* Keep the time updated to prevent a flood of expiration in ++ * the future. 
*/ ++ nf_flow->last_expired = time_msec(); ++ } ++} ++ ++void ++netflow_flow_update_flags(struct netflow_flow *nf_flow, uint8_t ip_tos, ++ uint8_t tcp_flags) ++{ ++ nf_flow->ip_tos = ip_tos; ++ nf_flow->tcp_flags |= tcp_flags; ++} ++ ++bool ++netflow_active_timeout_expired(struct netflow *nf, struct netflow_flow *nf_flow) ++{ ++ if (nf->active_timeout) { ++ return time_msec() > nf_flow->last_expired + nf->active_timeout; ++ } ++ ++ return false; ++} diff --cc ofproto/netflow.h index 13be90b4,00000000..cc7b9605 mode 100644,000000..100644 --- a/ofproto/netflow.h +++ b/ofproto/netflow.h @@@ -1,33 -1,0 +1,65 @@@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef NETFLOW_H +#define NETFLOW_H 1 + +#include "flow.h" ++#include "svec.h" + +struct ofexpired; - struct svec; ++ ++struct netflow_options { ++ struct svec collectors; ++ uint8_t engine_type; ++ uint8_t engine_id; ++ int active_timeout; ++ bool add_id_to_iface; ++}; ++ ++enum netflow_output_ports { ++ NF_OUT_FLOOD = UINT16_MAX, ++ NF_OUT_MULTI = UINT16_MAX - 1, ++ NF_OUT_DROP = UINT16_MAX - 2 ++}; ++ ++struct netflow_flow { ++ long long int last_expired; /* Time this flow last timed out. */ ++ long long int created; /* Time flow was created since time out. */ ++ ++ uint64_t packet_count_off; /* Packet count at last time out. */ ++ uint64_t byte_count_off; /* Byte count at last time out. 
*/ ++ ++ uint16_t output_iface; /* Output interface index. */ ++ uint8_t ip_tos; /* Last-seen IP type-of-service. */ ++ uint8_t tcp_flags; /* Bitwise-OR of all TCP flags seen. */ ++}; + +struct netflow *netflow_create(void); +void netflow_destroy(struct netflow *); - int netflow_set_collectors(struct netflow *, const struct svec *collectors); - void netflow_set_engine(struct netflow *nf, uint8_t engine_type, - uint8_t engine_id, bool add_id_to_iface); - void netflow_expire(struct netflow *, const struct ofexpired *); ++int netflow_set_options(struct netflow *, const struct netflow_options *); ++void netflow_expire(struct netflow *, struct netflow_flow *, ++ struct ofexpired *); +void netflow_run(struct netflow *); + ++void netflow_flow_clear(struct netflow_flow *); ++void netflow_flow_update_time(struct netflow *, struct netflow_flow *, ++ long long int used); ++void netflow_flow_update_flags(struct netflow_flow *, uint8_t ip_tos, ++ uint8_t tcp_flags); ++bool netflow_active_timeout_expired(struct netflow *, struct netflow_flow *); ++ +#endif /* netflow.h */ diff --cc ofproto/ofproto.c index eb8a7a91,00000000..b49ea49f mode 100644,000000..100644 --- a/ofproto/ofproto.c +++ b/ofproto/ofproto.c @@@ -1,3445 -1,0 +1,3524 @@@ +/* + * Copyright (c) 2009 Nicira Networks. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include "ofproto.h" +#include +#include +#include +#include +#include +#include +#include "classifier.h" +#include "coverage.h" +#include "discovery.h" +#include "dpif.h" +#include "dynamic-string.h" +#include "executer.h" +#include "fail-open.h" +#include "in-band.h" +#include "mac-learning.h" +#include "netdev.h" +#include "netflow.h" +#include "odp-util.h" +#include "ofp-print.h" +#include "ofpbuf.h" +#include "openflow/nicira-ext.h" +#include "openflow/openflow.h" +#include "openflow/openflow-mgmt.h" +#include "openvswitch/datapath-protocol.h" +#include "packets.h" +#include "pinsched.h" +#include "pktbuf.h" +#include "poll-loop.h" +#include "port-array.h" +#include "rconn.h" +#include "shash.h" +#include "status.h" +#include "stp.h" +#include "svec.h" +#include "tag.h" +#include "timeval.h" +#include "unixctl.h" +#include "vconn.h" +#include "vconn-ssl.h" +#include "xtoxll.h" + +#define THIS_MODULE VLM_ofproto +#include "vlog.h" + +enum { + DP_GROUP_FLOOD = 0, + DP_GROUP_ALL = 1 +}; + +enum { + TABLEID_HASH = 0, + TABLEID_CLASSIFIER = 1 +}; + +struct ofport { + struct netdev *netdev; + struct ofp_phy_port opp; /* In host byte order. */ +}; + +static void ofport_free(struct ofport *); +static void hton_ofp_phy_port(struct ofp_phy_port *); + +static int xlate_actions(const union ofp_action *in, size_t n_in, + const flow_t *flow, struct ofproto *ofproto, + const struct ofpbuf *packet, + struct odp_actions *out, tag_type *tags, - bool *may_setup_flow); ++ bool *may_set_up_flow, uint16_t *nf_output_iface); + +struct rule { + struct cls_rule cr; + + uint16_t idle_timeout; /* In seconds from time of last use. */ + uint16_t hard_timeout; /* In seconds from time of creation. */ + long long int used; /* Last-used time (0 if never used). */ + long long int created; /* Creation time. */ + uint64_t packet_count; /* Number of packets received. */ + uint64_t byte_count; /* Number of bytes received. 
*/ + uint64_t accounted_bytes; /* Number of bytes passed to account_cb. */ - uint8_t tcp_flags; /* Bitwise-OR of all TCP flags seen. */ - uint8_t ip_tos; /* Last-seen IP type-of-service. */ + tag_type tags; /* Tags (set only by hooks). */ ++ struct netflow_flow nf_flow; /* Per-flow NetFlow tracking data. */ + + /* If 'super' is non-NULL, this rule is a subrule, that is, it is an + * exact-match rule (having cr.wc.wildcards of 0) generated from the + * wildcard rule 'super'. In this case, 'list' is an element of the + * super-rule's list. + * + * If 'super' is NULL, this rule is a super-rule, and 'list' is the head of + * a list of subrules. A super-rule with no wildcards (where + * cr.wc.wildcards is 0) will never have any subrules. */ + struct rule *super; + struct list list; + + /* OpenFlow actions. + * + * A subrule has no actions (it uses the super-rule's actions). */ + int n_actions; + union ofp_action *actions; + + /* Datapath actions. + * + * A super-rule with wildcard fields never has ODP actions (since the + * datapath only supports exact-match flows). */ + bool installed; /* Installed in datapath? */ + bool may_install; /* True ordinarily; false if actions must + * be reassessed for every packet. */ + int n_odp_actions; + union odp_action *odp_actions; +}; + +static inline bool +rule_is_hidden(const struct rule *rule) +{ + /* Subrules are merely an implementation detail, so hide them from the + * controller. */ + if (rule->super != NULL) { + return true; + } + + /* Rules with priority higher than UINT16_MAX are set up by ofproto itself + * (e.g. by in-band control) and are intentionally hidden from the + * controller. 
*/ + if (rule->cr.priority > UINT16_MAX) { + return true; + } + + return false; +} + - static struct rule *rule_create(struct rule *super, const union ofp_action *, - size_t n_actions, uint16_t idle_timeout, - uint16_t hard_timeout); ++static struct rule *rule_create(struct ofproto *, struct rule *super, ++ const union ofp_action *, size_t n_actions, ++ uint16_t idle_timeout, uint16_t hard_timeout); +static void rule_free(struct rule *); +static void rule_destroy(struct ofproto *, struct rule *); +static struct rule *rule_from_cls_rule(const struct cls_rule *); +static void rule_insert(struct ofproto *, struct rule *, + struct ofpbuf *packet, uint16_t in_port); +static void rule_remove(struct ofproto *, struct rule *); +static bool rule_make_actions(struct ofproto *, struct rule *, + const struct ofpbuf *packet); +static void rule_install(struct ofproto *, struct rule *, + struct rule *displaced_rule); +static void rule_uninstall(struct ofproto *, struct rule *); +static void rule_post_uninstall(struct ofproto *, struct rule *); + +struct ofconn { + struct list node; + struct rconn *rconn; + struct pktbuf *pktbuf; + bool send_flow_exp; + int miss_send_len; + + struct rconn_packet_counter *packet_in_counter; + + /* Number of OpenFlow messages queued as replies to OpenFlow requests, and + * the maximum number before we stop reading OpenFlow requests. */ +#define OFCONN_REPLY_MAX 100 + struct rconn_packet_counter *reply_counter; +}; + +static struct ofconn *ofconn_create(struct ofproto *, struct rconn *); +static void ofconn_destroy(struct ofconn *, struct ofproto *); +static void ofconn_run(struct ofconn *, struct ofproto *); +static void ofconn_wait(struct ofconn *); +static void queue_tx(struct ofpbuf *msg, const struct ofconn *ofconn, + struct rconn_packet_counter *counter); + +struct ofproto { + /* Settings. */ + uint64_t datapath_id; /* Datapath ID. */ + uint64_t fallback_dpid; /* Datapath ID if no better choice found. 
*/ + uint64_t mgmt_id; /* Management channel identifier. */ + char *manufacturer; /* Manufacturer. */ + char *hardware; /* Hardware. */ + char *software; /* Software version. */ + char *serial; /* Serial number. */ + + /* Datapath. */ + struct dpif *dpif; + struct netdev_monitor *netdev_monitor; + struct port_array ports; /* Index is ODP port nr; ofport->opp.port_no is + * OFP port nr. */ + struct shash port_by_name; + uint32_t max_ports; + + /* Configuration. */ + struct switch_status *switch_status; + struct status_category *ss_cat; + struct in_band *in_band; + struct discovery *discovery; + struct fail_open *fail_open; + struct pinsched *miss_sched, *action_sched; + struct executer *executer; + struct netflow *netflow; + + /* Flow table. */ + struct classifier cls; + bool need_revalidate; + long long int next_expiration; + struct tag_set revalidate_set; + + /* OpenFlow connections. */ + struct list all_conns; + struct ofconn *controller; + struct pvconn **listeners; + size_t n_listeners; + struct pvconn **snoops; + size_t n_snoops; + + /* Hooks for ovs-vswitchd. */ + const struct ofhooks *ofhooks; + void *aux; + + /* Used by default ofhooks. 
*/ + struct mac_learning *ml; +}; + +static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); + +static const struct ofhooks default_ofhooks; + +static uint64_t pick_datapath_id(const struct ofproto *); +static uint64_t pick_fallback_dpid(void); +static void send_packet_in_miss(struct ofpbuf *, void *ofproto); +static void send_packet_in_action(struct ofpbuf *, void *ofproto); +static void update_used(struct ofproto *); - static void update_stats(struct rule *, const struct odp_flow_stats *); ++static void update_stats(struct ofproto *, struct rule *, ++ const struct odp_flow_stats *); +static void expire_rule(struct cls_rule *, void *ofproto); ++static void active_timeout(struct ofproto *ofproto, struct rule *rule); +static bool revalidate_rule(struct ofproto *p, struct rule *rule); +static void revalidate_cb(struct cls_rule *rule_, void *p_); + +static void handle_odp_msg(struct ofproto *, struct ofpbuf *); + +static void handle_openflow(struct ofconn *, struct ofproto *, + struct ofpbuf *); + +static void refresh_port_group(struct ofproto *, unsigned int group); +static void update_port(struct ofproto *, const char *devname); +static int init_ports(struct ofproto *); +static void reinit_ports(struct ofproto *); + +int +ofproto_create(const char *datapath, const struct ofhooks *ofhooks, void *aux, + struct ofproto **ofprotop) +{ + struct odp_stats stats; + struct ofproto *p; + struct dpif *dpif; + int error; + + *ofprotop = NULL; + + /* Connect to datapath and start listening for messages. 
*/ + error = dpif_open(datapath, &dpif); + if (error) { + VLOG_ERR("failed to open datapath %s: %s", datapath, strerror(error)); + return error; + } + error = dpif_get_dp_stats(dpif, &stats); + if (error) { + VLOG_ERR("failed to obtain stats for datapath %s: %s", + datapath, strerror(error)); + dpif_close(dpif); + return error; + } + error = dpif_recv_set_mask(dpif, ODPL_MISS | ODPL_ACTION); + if (error) { + VLOG_ERR("failed to listen on datapath %s: %s", + datapath, strerror(error)); + dpif_close(dpif); + return error; + } + dpif_flow_flush(dpif); + dpif_recv_purge(dpif); + + /* Initialize settings. */ + p = xcalloc(1, sizeof *p); + p->fallback_dpid = pick_fallback_dpid(); + p->datapath_id = p->fallback_dpid; + p->manufacturer = xstrdup("Nicira Networks, Inc."); + p->hardware = xstrdup("Reference Implementation"); + p->software = xstrdup(VERSION BUILDNR); + p->serial = xstrdup("None"); + + /* Initialize datapath. */ + p->dpif = dpif; + p->netdev_monitor = netdev_monitor_create(); + port_array_init(&p->ports); + shash_init(&p->port_by_name); + p->max_ports = stats.max_ports; + + /* Initialize submodules. */ + p->switch_status = switch_status_create(p); + p->in_band = NULL; + p->discovery = NULL; + p->fail_open = NULL; + p->miss_sched = p->action_sched = NULL; + p->executer = NULL; + p->netflow = NULL; + + /* Initialize flow table. */ + classifier_init(&p->cls); + p->need_revalidate = false; + p->next_expiration = time_msec() + 1000; + tag_set_init(&p->revalidate_set); + + /* Initialize OpenFlow connections. */ + list_init(&p->all_conns); + p->controller = ofconn_create(p, rconn_create(5, 8)); + p->controller->pktbuf = pktbuf_create(); + p->controller->miss_send_len = OFP_DEFAULT_MISS_SEND_LEN; + p->listeners = NULL; + p->n_listeners = 0; + p->snoops = NULL; + p->n_snoops = 0; + + /* Initialize hooks. 
*/ + if (ofhooks) { + p->ofhooks = ofhooks; + p->aux = aux; + p->ml = NULL; + } else { + p->ofhooks = &default_ofhooks; + p->aux = p; + p->ml = mac_learning_create(); + } + + /* Register switch status category. */ + p->ss_cat = switch_status_register(p->switch_status, "remote", + rconn_status_cb, p->controller->rconn); + + /* Almost done... */ + error = init_ports(p); + if (error) { + ofproto_destroy(p); + return error; + } + + /* Pick final datapath ID. */ + p->datapath_id = pick_datapath_id(p); + VLOG_INFO("using datapath ID %012"PRIx64, p->datapath_id); + + *ofprotop = p; + return 0; +} + +void +ofproto_set_datapath_id(struct ofproto *p, uint64_t datapath_id) +{ + uint64_t old_dpid = p->datapath_id; + p->datapath_id = datapath_id ? datapath_id : pick_datapath_id(p); + if (p->datapath_id != old_dpid) { + VLOG_INFO("datapath ID changed to %012"PRIx64, p->datapath_id); + rconn_reconnect(p->controller->rconn); + } +} + +void +ofproto_set_mgmt_id(struct ofproto *p, uint64_t mgmt_id) +{ + p->mgmt_id = mgmt_id; +} + +void +ofproto_set_probe_interval(struct ofproto *p, int probe_interval) +{ + probe_interval = probe_interval ? MAX(probe_interval, 5) : 0; + rconn_set_probe_interval(p->controller->rconn, probe_interval); + if (p->fail_open) { + int trigger_duration = probe_interval ? 
probe_interval * 3 : 15; + fail_open_set_trigger_duration(p->fail_open, trigger_duration); + } +} + +void +ofproto_set_max_backoff(struct ofproto *p, int max_backoff) +{ + rconn_set_max_backoff(p->controller->rconn, max_backoff); +} + +void +ofproto_set_desc(struct ofproto *p, + const char *manufacturer, const char *hardware, + const char *software, const char *serial) +{ + if (manufacturer) { + free(p->manufacturer); + p->manufacturer = xstrdup(manufacturer); + } + if (hardware) { + free(p->hardware); + p->hardware = xstrdup(hardware); + } + if (software) { + free(p->software); + p->software = xstrdup(software); + } + if (serial) { + free(p->serial); + p->serial = xstrdup(serial); + } +} + +int +ofproto_set_in_band(struct ofproto *p, bool in_band) +{ + if (in_band != (p->in_band != NULL)) { + if (in_band) { + return in_band_create(p, p->dpif, p->switch_status, + p->controller->rconn, &p->in_band); + } else { + ofproto_set_discovery(p, false, NULL, true); + in_band_destroy(p->in_band); + p->in_band = NULL; + } + rconn_reconnect(p->controller->rconn); + } + return 0; +} + +int +ofproto_set_discovery(struct ofproto *p, bool discovery, + const char *re, bool update_resolv_conf) +{ + if (discovery != (p->discovery != NULL)) { + if (discovery) { + int error = ofproto_set_in_band(p, true); + if (error) { + return error; + } + error = discovery_create(re, update_resolv_conf, + p->dpif, p->switch_status, + &p->discovery); + if (error) { + return error; + } + } else { + discovery_destroy(p->discovery); + p->discovery = NULL; + } + rconn_disconnect(p->controller->rconn); + } else if (discovery) { + discovery_set_update_resolv_conf(p->discovery, update_resolv_conf); + return discovery_set_accept_controller_re(p->discovery, re); + } + return 0; +} + +int +ofproto_set_controller(struct ofproto *ofproto, const char *controller) +{ + if (ofproto->discovery) { + return EINVAL; + } else if (controller) { + if (strcmp(rconn_get_name(ofproto->controller->rconn), controller)) { + 
return rconn_connect(ofproto->controller->rconn, controller); + } else { + return 0; + } + } else { + rconn_disconnect(ofproto->controller->rconn); + return 0; + } +} + +static int +set_pvconns(struct pvconn ***pvconnsp, size_t *n_pvconnsp, + const struct svec *svec) +{ + struct pvconn **pvconns = *pvconnsp; + size_t n_pvconns = *n_pvconnsp; + int retval = 0; + size_t i; + + for (i = 0; i < n_pvconns; i++) { + pvconn_close(pvconns[i]); + } + free(pvconns); + + pvconns = xmalloc(svec->n * sizeof *pvconns); + n_pvconns = 0; + for (i = 0; i < svec->n; i++) { + const char *name = svec->names[i]; + struct pvconn *pvconn; + int error; + + error = pvconn_open(name, &pvconn); + if (!error) { + pvconns[n_pvconns++] = pvconn; + } else { + VLOG_ERR("failed to listen on %s: %s", name, strerror(error)); + if (!retval) { + retval = error; + } + } + } + + *pvconnsp = pvconns; + *n_pvconnsp = n_pvconns; + + return retval; +} + +int +ofproto_set_listeners(struct ofproto *ofproto, const struct svec *listeners) +{ + return set_pvconns(&ofproto->listeners, &ofproto->n_listeners, listeners); +} + +int +ofproto_set_snoops(struct ofproto *ofproto, const struct svec *snoops) +{ + return set_pvconns(&ofproto->snoops, &ofproto->n_snoops, snoops); +} + +int - ofproto_set_netflow(struct ofproto *ofproto, const struct svec *collectors, - uint8_t engine_type, uint8_t engine_id, bool add_id_to_iface) ++ofproto_set_netflow(struct ofproto *ofproto, ++ const struct netflow_options *nf_options) +{ - if (collectors && collectors->n) { ++ if (nf_options->collectors.n) { + if (!ofproto->netflow) { + ofproto->netflow = netflow_create(); + } - netflow_set_engine(ofproto->netflow, engine_type, engine_id, - add_id_to_iface); - return netflow_set_collectors(ofproto->netflow, collectors); ++ return netflow_set_options(ofproto->netflow, nf_options); + } else { + netflow_destroy(ofproto->netflow); + ofproto->netflow = NULL; + return 0; + } +} + +void +ofproto_set_failure(struct ofproto *ofproto, bool 
fail_open) +{ + if (fail_open) { + struct rconn *rconn = ofproto->controller->rconn; + int trigger_duration = rconn_get_probe_interval(rconn) * 3; + if (!ofproto->fail_open) { + ofproto->fail_open = fail_open_create(ofproto, trigger_duration, + ofproto->switch_status, + rconn); + } else { + fail_open_set_trigger_duration(ofproto->fail_open, + trigger_duration); + } + } else { + fail_open_destroy(ofproto->fail_open); + ofproto->fail_open = NULL; + } +} + +void +ofproto_set_rate_limit(struct ofproto *ofproto, + int rate_limit, int burst_limit) +{ + if (rate_limit > 0) { + if (!ofproto->miss_sched) { + ofproto->miss_sched = pinsched_create(rate_limit, burst_limit, + ofproto->switch_status); + ofproto->action_sched = pinsched_create(rate_limit, burst_limit, + NULL); + } else { + pinsched_set_limits(ofproto->miss_sched, rate_limit, burst_limit); + pinsched_set_limits(ofproto->action_sched, + rate_limit, burst_limit); + } + } else { + pinsched_destroy(ofproto->miss_sched); + ofproto->miss_sched = NULL; + pinsched_destroy(ofproto->action_sched); + ofproto->action_sched = NULL; + } +} + +int +ofproto_set_stp(struct ofproto *ofproto UNUSED, bool enable_stp) +{ + /* XXX */ + if (enable_stp) { + VLOG_WARN("STP is not yet implemented"); + return EINVAL; + } else { + return 0; + } +} + +int +ofproto_set_remote_execution(struct ofproto *ofproto, const char *command_acl, + const char *command_dir) +{ + if (command_acl) { + if (!ofproto->executer) { + return executer_create(command_acl, command_dir, + &ofproto->executer); + } else { + executer_set_acl(ofproto->executer, command_acl, command_dir); + } + } else { + executer_destroy(ofproto->executer); + ofproto->executer = NULL; + } + return 0; +} + +uint64_t +ofproto_get_datapath_id(const struct ofproto *ofproto) +{ + return ofproto->datapath_id; +} + +uint64_t +ofproto_get_mgmt_id(const struct ofproto *ofproto) +{ + return ofproto->mgmt_id; +} + +int +ofproto_get_probe_interval(const struct ofproto *ofproto) +{ + return 
rconn_get_probe_interval(ofproto->controller->rconn); +} + +int +ofproto_get_max_backoff(const struct ofproto *ofproto) +{ + return rconn_get_max_backoff(ofproto->controller->rconn); +} + +bool +ofproto_get_in_band(const struct ofproto *ofproto) +{ + return ofproto->in_band != NULL; +} + +bool +ofproto_get_discovery(const struct ofproto *ofproto) +{ + return ofproto->discovery != NULL; +} + +const char * +ofproto_get_controller(const struct ofproto *ofproto) +{ + return rconn_get_name(ofproto->controller->rconn); +} + +void +ofproto_get_listeners(const struct ofproto *ofproto, struct svec *listeners) +{ + size_t i; + + for (i = 0; i < ofproto->n_listeners; i++) { + svec_add(listeners, pvconn_get_name(ofproto->listeners[i])); + } +} + +void +ofproto_get_snoops(const struct ofproto *ofproto, struct svec *snoops) +{ + size_t i; + + for (i = 0; i < ofproto->n_snoops; i++) { + svec_add(snoops, pvconn_get_name(ofproto->snoops[i])); + } +} + +void +ofproto_destroy(struct ofproto *p) +{ + struct ofconn *ofconn, *next_ofconn; + struct ofport *ofport; + unsigned int port_no; + size_t i; + + if (!p) { + return; + } + + ofproto_flush_flows(p); + classifier_destroy(&p->cls); + + LIST_FOR_EACH_SAFE (ofconn, next_ofconn, struct ofconn, node, + &p->all_conns) { + ofconn_destroy(ofconn, p); + } + + dpif_close(p->dpif); + netdev_monitor_destroy(p->netdev_monitor); + PORT_ARRAY_FOR_EACH (ofport, &p->ports, port_no) { + ofport_free(ofport); + } + shash_destroy(&p->port_by_name); + + switch_status_destroy(p->switch_status); + in_band_destroy(p->in_band); + discovery_destroy(p->discovery); + fail_open_destroy(p->fail_open); + pinsched_destroy(p->miss_sched); + pinsched_destroy(p->action_sched); + executer_destroy(p->executer); + netflow_destroy(p->netflow); + + switch_status_unregister(p->ss_cat); + + for (i = 0; i < p->n_listeners; i++) { + pvconn_close(p->listeners[i]); + } + free(p->listeners); + + for (i = 0; i < p->n_snoops; i++) { + pvconn_close(p->snoops[i]); + } + 
free(p->snoops); + + mac_learning_destroy(p->ml); + + free(p); +} + +int +ofproto_run(struct ofproto *p) +{ + int error = ofproto_run1(p); + if (!error) { + error = ofproto_run2(p, false); + } + return error; +} + +static void +process_port_change(struct ofproto *ofproto, int error, char *devname) +{ + if (error == ENOBUFS) { + reinit_ports(ofproto); + } else if (!error) { + update_port(ofproto, devname); + free(devname); + } +} + +int +ofproto_run1(struct ofproto *p) +{ + struct ofconn *ofconn, *next_ofconn; + char *devname; + int error; + int i; + + for (i = 0; i < 50; i++) { + struct ofpbuf *buf; + int error; + + error = dpif_recv(p->dpif, &buf); + if (error) { + if (error == ENODEV) { + /* Someone destroyed the datapath behind our back. The caller + * better destroy us and give up, because we're just going to + * spin from here on out. */ + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); + VLOG_ERR_RL(&rl, "%s: datapath was destroyed externally", + dpif_name(p->dpif)); + return ENODEV; + } + break; + } + + handle_odp_msg(p, buf); + } + + while ((error = dpif_port_poll(p->dpif, &devname)) != EAGAIN) { + process_port_change(p, error, devname); + } + while ((error = netdev_monitor_poll(p->netdev_monitor, + &devname)) != EAGAIN) { + process_port_change(p, error, devname); + } + + if (p->in_band) { + in_band_run(p->in_band); + } + if (p->discovery) { + char *controller_name; + if (rconn_is_connectivity_questionable(p->controller->rconn)) { + discovery_question_connectivity(p->discovery); + } + if (discovery_run(p->discovery, &controller_name)) { + if (controller_name) { + rconn_connect(p->controller->rconn, controller_name); + } else { + rconn_disconnect(p->controller->rconn); + } + } + } + pinsched_run(p->miss_sched, send_packet_in_miss, p); + pinsched_run(p->action_sched, send_packet_in_action, p); + if (p->executer) { + executer_run(p->executer); + } + + LIST_FOR_EACH_SAFE (ofconn, next_ofconn, struct ofconn, node, + &p->all_conns) { + 
ofconn_run(ofconn, p); + } + + /* Fail-open maintenance. Do this after processing the ofconns since + * fail-open checks the status of the controller rconn. */ + if (p->fail_open) { + fail_open_run(p->fail_open); + } + + for (i = 0; i < p->n_listeners; i++) { + struct vconn *vconn; + int retval; + + retval = pvconn_accept(p->listeners[i], OFP_VERSION, &vconn); + if (!retval) { + ofconn_create(p, rconn_new_from_vconn("passive", vconn)); + } else if (retval != EAGAIN) { + VLOG_WARN_RL(&rl, "accept failed (%s)", strerror(retval)); + } + } + + for (i = 0; i < p->n_snoops; i++) { + struct vconn *vconn; + int retval; + + retval = pvconn_accept(p->snoops[i], OFP_VERSION, &vconn); + if (!retval) { + rconn_add_monitor(p->controller->rconn, vconn); + } else if (retval != EAGAIN) { + VLOG_WARN_RL(&rl, "accept failed (%s)", strerror(retval)); + } + } + + if (time_msec() >= p->next_expiration) { + COVERAGE_INC(ofproto_expiration); + p->next_expiration = time_msec() + 1000; + update_used(p); + + classifier_for_each(&p->cls, CLS_INC_ALL, expire_rule, p); + + /* Let the hook know that we're at a stable point: all outstanding data + * in existing flows has been accounted to the account_cb. Thus, the + * hook can now reasonably do operations that depend on having accurate + * flow volume accounting (currently, that's just bond rebalancing). */ + if (p->ofhooks->account_checkpoint_cb) { + p->ofhooks->account_checkpoint_cb(p->aux); + } + } + + if (p->netflow) { + netflow_run(p->netflow); + } + + return 0; +} + +struct revalidate_cbdata { + struct ofproto *ofproto; + bool revalidate_all; /* Revalidate all exact-match rules? */ + bool revalidate_subrules; /* Revalidate all exact-match subrules? */ + struct tag_set revalidate_set; /* Set of tags to revalidate. 
*/ +}; + +int +ofproto_run2(struct ofproto *p, bool revalidate_all) +{ + if (p->need_revalidate || revalidate_all + || !tag_set_is_empty(&p->revalidate_set)) { + struct revalidate_cbdata cbdata; + cbdata.ofproto = p; + cbdata.revalidate_all = revalidate_all; + cbdata.revalidate_subrules = p->need_revalidate; + cbdata.revalidate_set = p->revalidate_set; + tag_set_init(&p->revalidate_set); + COVERAGE_INC(ofproto_revalidate); + classifier_for_each(&p->cls, CLS_INC_EXACT, revalidate_cb, &cbdata); + p->need_revalidate = false; + } + + return 0; +} + +void +ofproto_wait(struct ofproto *p) +{ + struct ofconn *ofconn; + size_t i; + + dpif_recv_wait(p->dpif); + dpif_port_poll_wait(p->dpif); + netdev_monitor_poll_wait(p->netdev_monitor); + LIST_FOR_EACH (ofconn, struct ofconn, node, &p->all_conns) { + ofconn_wait(ofconn); + } + if (p->in_band) { + in_band_wait(p->in_band); + } + if (p->discovery) { + discovery_wait(p->discovery); + } + if (p->fail_open) { + fail_open_wait(p->fail_open); + } + pinsched_wait(p->miss_sched); + pinsched_wait(p->action_sched); + if (p->executer) { + executer_wait(p->executer); + } + if (!tag_set_is_empty(&p->revalidate_set)) { + poll_immediate_wake(); + } + if (p->need_revalidate) { + /* Shouldn't happen, but if it does just go around again. 
*/ + VLOG_DBG_RL(&rl, "need revalidate in ofproto_wait_cb()"); + poll_immediate_wake(); + } else if (p->next_expiration != LLONG_MAX) { + poll_timer_wait(p->next_expiration - time_msec()); + } + for (i = 0; i < p->n_listeners; i++) { + pvconn_wait(p->listeners[i]); + } + for (i = 0; i < p->n_snoops; i++) { + pvconn_wait(p->snoops[i]); + } +} + +void +ofproto_revalidate(struct ofproto *ofproto, tag_type tag) +{ + tag_set_add(&ofproto->revalidate_set, tag); +} + +struct tag_set * +ofproto_get_revalidate_set(struct ofproto *ofproto) +{ + return &ofproto->revalidate_set; +} + +bool +ofproto_is_alive(const struct ofproto *p) +{ + return p->discovery || rconn_is_alive(p->controller->rconn); +} + +int +ofproto_send_packet(struct ofproto *p, const flow_t *flow, + const union ofp_action *actions, size_t n_actions, + const struct ofpbuf *packet) +{ + struct odp_actions odp_actions; + int error; + + error = xlate_actions(actions, n_actions, flow, p, packet, &odp_actions, - NULL, NULL); ++ NULL, NULL, NULL); + if (error) { + return error; + } + + /* XXX Should we translate the dpif_execute() errno value into an OpenFlow + * error code? */ + dpif_execute(p->dpif, flow->in_port, odp_actions.actions, + odp_actions.n_actions, packet); + return 0; +} + +void +ofproto_add_flow(struct ofproto *p, + const flow_t *flow, uint32_t wildcards, unsigned int priority, + const union ofp_action *actions, size_t n_actions, + int idle_timeout) +{ + struct rule *rule; - rule = rule_create(NULL, actions, n_actions, ++ rule = rule_create(p, NULL, actions, n_actions, + idle_timeout >= 0 ? 
idle_timeout : 5 /* XXX */, 0); + cls_rule_from_flow(&rule->cr, flow, wildcards, priority); + rule_insert(p, rule, NULL, 0); +} + +void +ofproto_delete_flow(struct ofproto *ofproto, const flow_t *flow, + uint32_t wildcards, unsigned int priority) +{ + struct rule *rule; + + rule = rule_from_cls_rule(classifier_find_rule_exactly(&ofproto->cls, + flow, wildcards, + priority)); + if (rule) { + rule_remove(ofproto, rule); + } +} + +static void +destroy_rule(struct cls_rule *rule_, void *ofproto_) +{ + struct rule *rule = rule_from_cls_rule(rule_); + struct ofproto *ofproto = ofproto_; + + /* Mark the flow as not installed, even though it might really be + * installed, so that rule_remove() doesn't bother trying to uninstall it. + * There is no point in uninstalling it individually since we are about to + * blow away all the flows with dpif_flow_flush(). */ + rule->installed = false; + + rule_remove(ofproto, rule); +} + +void +ofproto_flush_flows(struct ofproto *ofproto) +{ + COVERAGE_INC(ofproto_flush); + classifier_for_each(&ofproto->cls, CLS_INC_ALL, destroy_rule, ofproto); + dpif_flow_flush(ofproto->dpif); + if (ofproto->in_band) { + in_band_flushed(ofproto->in_band); + } + if (ofproto->fail_open) { + fail_open_flushed(ofproto->fail_open); + } +} + +static void +reinit_ports(struct ofproto *p) +{ + struct svec devnames; + struct ofport *ofport; + unsigned int port_no; + struct odp_port *odp_ports; + size_t n_odp_ports; + size_t i; + + svec_init(&devnames); + PORT_ARRAY_FOR_EACH (ofport, &p->ports, port_no) { + svec_add (&devnames, (char *) ofport->opp.name); + } + dpif_port_list(p->dpif, &odp_ports, &n_odp_ports); + for (i = 0; i < n_odp_ports; i++) { + svec_add (&devnames, odp_ports[i].devname); + } + free(odp_ports); + + svec_sort_unique(&devnames); + for (i = 0; i < devnames.n; i++) { + update_port(p, devnames.names[i]); + } + svec_destroy(&devnames); +} + +static void +refresh_port_group(struct ofproto *p, unsigned int group) +{ + uint16_t *ports; + size_t 
n_ports; + struct ofport *port; + unsigned int port_no; + + assert(group == DP_GROUP_ALL || group == DP_GROUP_FLOOD); + + ports = xmalloc(port_array_count(&p->ports) * sizeof *ports); + n_ports = 0; + PORT_ARRAY_FOR_EACH (port, &p->ports, port_no) { + if (group == DP_GROUP_ALL || !(port->opp.config & OFPPC_NO_FLOOD)) { + ports[n_ports++] = port_no; + } + } + dpif_port_group_set(p->dpif, group, ports, n_ports); + free(ports); +} + +static void +refresh_port_groups(struct ofproto *p) +{ + refresh_port_group(p, DP_GROUP_FLOOD); + refresh_port_group(p, DP_GROUP_ALL); +} + +static struct ofport * +make_ofport(const struct odp_port *odp_port) +{ + enum netdev_flags flags; + struct ofport *ofport; + struct netdev *netdev; + bool carrier; + int error; + + error = netdev_open(odp_port->devname, NETDEV_ETH_TYPE_NONE, &netdev); + if (error) { + VLOG_WARN_RL(&rl, "ignoring port %s (%"PRIu16") because netdev %s " + "cannot be opened (%s)", + odp_port->devname, odp_port->port, + odp_port->devname, strerror(error)); + return NULL; + } + + ofport = xmalloc(sizeof *ofport); + ofport->netdev = netdev; + ofport->opp.port_no = odp_port_to_ofp_port(odp_port->port); + netdev_get_etheraddr(netdev, ofport->opp.hw_addr); + memcpy(ofport->opp.name, odp_port->devname, + MIN(sizeof ofport->opp.name, sizeof odp_port->devname)); + ofport->opp.name[sizeof ofport->opp.name - 1] = '\0'; + + netdev_get_flags(netdev, &flags); + ofport->opp.config = flags & NETDEV_UP ? 0 : OFPPC_PORT_DOWN; + + netdev_get_carrier(netdev, &carrier); + ofport->opp.state = carrier ? 
0 : OFPPS_LINK_DOWN; + + netdev_get_features(netdev, + &ofport->opp.curr, &ofport->opp.advertised, + &ofport->opp.supported, &ofport->opp.peer); + return ofport; +} + +static bool +ofport_conflicts(const struct ofproto *p, const struct odp_port *odp_port) +{ + if (port_array_get(&p->ports, odp_port->port)) { + VLOG_WARN_RL(&rl, "ignoring duplicate port %"PRIu16" in datapath", + odp_port->port); + return true; + } else if (shash_find(&p->port_by_name, odp_port->devname)) { + VLOG_WARN_RL(&rl, "ignoring duplicate device %s in datapath", + odp_port->devname); + return true; + } else { + return false; + } +} + +static int +ofport_equal(const struct ofport *a_, const struct ofport *b_) +{ + const struct ofp_phy_port *a = &a_->opp; + const struct ofp_phy_port *b = &b_->opp; + + BUILD_ASSERT_DECL(sizeof *a == 48); /* Detect ofp_phy_port changes. */ + return (a->port_no == b->port_no + && !memcmp(a->hw_addr, b->hw_addr, sizeof a->hw_addr) + && !strcmp((char *) a->name, (char *) b->name) + && a->state == b->state + && a->config == b->config + && a->curr == b->curr + && a->advertised == b->advertised + && a->supported == b->supported + && a->peer == b->peer); +} + +static void +send_port_status(struct ofproto *p, const struct ofport *ofport, + uint8_t reason) +{ + /* XXX Should limit the number of queued port status change messages. 
*/ + struct ofconn *ofconn; + LIST_FOR_EACH (ofconn, struct ofconn, node, &p->all_conns) { + struct ofp_port_status *ops; + struct ofpbuf *b; + + ops = make_openflow_xid(sizeof *ops, OFPT_PORT_STATUS, 0, &b); + ops->reason = reason; + ops->desc = ofport->opp; + hton_ofp_phy_port(&ops->desc); + queue_tx(b, ofconn, NULL); + } + if (p->ofhooks->port_changed_cb) { + p->ofhooks->port_changed_cb(reason, &ofport->opp, p->aux); + } +} + +static void +ofport_install(struct ofproto *p, struct ofport *ofport) +{ + netdev_monitor_add(p->netdev_monitor, ofport->netdev); + port_array_set(&p->ports, ofp_port_to_odp_port(ofport->opp.port_no), + ofport); + shash_add(&p->port_by_name, (char *) ofport->opp.name, ofport); +} + +static void +ofport_remove(struct ofproto *p, struct ofport *ofport) +{ + netdev_monitor_remove(p->netdev_monitor, ofport->netdev); + port_array_set(&p->ports, ofp_port_to_odp_port(ofport->opp.port_no), NULL); + shash_delete(&p->port_by_name, + shash_find(&p->port_by_name, (char *) ofport->opp.name)); +} + +static void +ofport_free(struct ofport *ofport) +{ + if (ofport) { + netdev_close(ofport->netdev); + free(ofport); + } +} + +static void +update_port(struct ofproto *p, const char *devname) +{ + struct odp_port odp_port; + struct ofport *old_ofport; + struct ofport *new_ofport; + int error; + + COVERAGE_INC(ofproto_update_port); + + /* Query the datapath for port information. */ + error = dpif_port_query_by_name(p->dpif, devname, &odp_port); + + /* Find the old ofport. */ + old_ofport = shash_find_data(&p->port_by_name, devname); + if (!error) { + if (!old_ofport) { + /* There's no port named 'devname' but there might be a port with + * the same port number. This could happen if a port is deleted + * and then a new one added in its place very quickly, or if a port + * is renamed. In the former case we want to send an OFPPR_DELETE + * and an OFPPR_ADD, and in the latter case we want to send a + * single OFPPR_MODIFY. 
We can distinguish the cases by comparing + * the old port's ifindex against the new port, or perhaps less + * reliably but more portably by comparing the old port's MAC + * against the new port's MAC. However, this code isn't that smart + * and always sends an OFPPR_MODIFY (XXX). */ + old_ofport = port_array_get(&p->ports, odp_port.port); + } + } else if (error != ENOENT && error != ENODEV) { + VLOG_WARN_RL(&rl, "dpif_port_query_by_name returned unexpected error " + "%s", strerror(error)); + return; + } + + /* Create a new ofport. */ + new_ofport = !error ? make_ofport(&odp_port) : NULL; + + /* Eliminate a few pathological cases. */ + if (!old_ofport && !new_ofport) { + return; + } else if (old_ofport && new_ofport) { + /* Most of the 'config' bits are OpenFlow soft state, but + * OFPPC_PORT_DOWN is maintained the kernel. So transfer the OpenFlow + * bits from old_ofport. (make_ofport() only sets OFPPC_PORT_DOWN and + * leaves the other bits 0.) */ + new_ofport->opp.config |= old_ofport->opp.config & ~OFPPC_PORT_DOWN; + + if (ofport_equal(old_ofport, new_ofport)) { + /* False alarm--no change. */ + ofport_free(new_ofport); + return; + } + } + + /* Now deal with the normal cases. */ + if (old_ofport) { + ofport_remove(p, old_ofport); + } + if (new_ofport) { + ofport_install(p, new_ofport); + } + send_port_status(p, new_ofport ? new_ofport : old_ofport, + (!old_ofport ? OFPPR_ADD + : !new_ofport ? OFPPR_DELETE + : OFPPR_MODIFY)); + ofport_free(old_ofport); + + /* Update port groups. 
*/ + refresh_port_groups(p); +} + +static int +init_ports(struct ofproto *p) +{ + struct odp_port *ports; + size_t n_ports; + size_t i; + int error; + + error = dpif_port_list(p->dpif, &ports, &n_ports); + if (error) { + return error; + } + + for (i = 0; i < n_ports; i++) { + const struct odp_port *odp_port = &ports[i]; + if (!ofport_conflicts(p, odp_port)) { + struct ofport *ofport = make_ofport(odp_port); + if (ofport) { + ofport_install(p, ofport); + } + } + } + free(ports); + refresh_port_groups(p); + return 0; +} + +static struct ofconn * +ofconn_create(struct ofproto *p, struct rconn *rconn) +{ + struct ofconn *ofconn = xmalloc(sizeof *ofconn); + list_push_back(&p->all_conns, &ofconn->node); + ofconn->rconn = rconn; + ofconn->pktbuf = NULL; + ofconn->send_flow_exp = false; + ofconn->miss_send_len = 0; + ofconn->packet_in_counter = rconn_packet_counter_create (); + ofconn->reply_counter = rconn_packet_counter_create (); + return ofconn; +} + +static void +ofconn_destroy(struct ofconn *ofconn, struct ofproto *p) +{ + if (p->executer) { + executer_rconn_closing(p->executer, ofconn->rconn); + } + + list_remove(&ofconn->node); + rconn_destroy(ofconn->rconn); + rconn_packet_counter_destroy(ofconn->packet_in_counter); + rconn_packet_counter_destroy(ofconn->reply_counter); + pktbuf_destroy(ofconn->pktbuf); + free(ofconn); +} + +static void +ofconn_run(struct ofconn *ofconn, struct ofproto *p) +{ + int iteration; + + rconn_run(ofconn->rconn); + + if (rconn_packet_counter_read (ofconn->reply_counter) < OFCONN_REPLY_MAX) { + /* Limit the number of iterations to prevent other tasks from + * starving. 
*/ + for (iteration = 0; iteration < 50; iteration++) { + struct ofpbuf *of_msg = rconn_recv(ofconn->rconn); + if (!of_msg) { + break; + } + if (p->fail_open) { + fail_open_maybe_recover(p->fail_open); + } + handle_openflow(ofconn, p, of_msg); + ofpbuf_delete(of_msg); + } + } + + if (ofconn != p->controller && !rconn_is_alive(ofconn->rconn)) { + ofconn_destroy(ofconn, p); + } +} + +static void +ofconn_wait(struct ofconn *ofconn) +{ + rconn_run_wait(ofconn->rconn); + if (rconn_packet_counter_read (ofconn->reply_counter) < OFCONN_REPLY_MAX) { + rconn_recv_wait(ofconn->rconn); + } else { + COVERAGE_INC(ofproto_ofconn_stuck); + } +} + +/* Caller is responsible for initializing the 'cr' member of the returned + * rule. */ +static struct rule * - rule_create(struct rule *super, ++rule_create(struct ofproto *ofproto, struct rule *super, + const union ofp_action *actions, size_t n_actions, + uint16_t idle_timeout, uint16_t hard_timeout) +{ + struct rule *rule = xcalloc(1, sizeof *rule); + rule->idle_timeout = idle_timeout; + rule->hard_timeout = hard_timeout; + rule->used = rule->created = time_msec(); + rule->super = super; + if (super) { + list_push_back(&super->list, &rule->list); + } else { + list_init(&rule->list); + } + rule->n_actions = n_actions; + rule->actions = xmemdup(actions, n_actions * sizeof *actions); ++ netflow_flow_clear(&rule->nf_flow); ++ netflow_flow_update_time(ofproto->netflow, &rule->nf_flow, rule->created); ++ + return rule; +} + +static struct rule * +rule_from_cls_rule(const struct cls_rule *cls_rule) +{ + return cls_rule ? CONTAINER_OF(cls_rule, struct rule, cr) : NULL; +} + +static void +rule_free(struct rule *rule) +{ + free(rule->actions); + free(rule->odp_actions); + free(rule); +} + +/* Destroys 'rule'. If 'rule' is a subrule, also removes it from its + * super-rule's list of subrules. 
If 'rule' is a super-rule, also iterates + * through all of its subrules and revalidates them, destroying any that no + * longer has a super-rule (which is probably all of them). + * + * Before calling this function, the caller must have removed 'rule' from + * the classifier. If 'rule' is an exact-match rule, the caller is also + * responsible for ensuring that it has been uninstalled from the datapath. */ +static void +rule_destroy(struct ofproto *ofproto, struct rule *rule) +{ + if (!rule->super) { + struct rule *subrule, *next; + LIST_FOR_EACH_SAFE (subrule, next, struct rule, list, &rule->list) { + revalidate_rule(ofproto, subrule); + } + } else { + list_remove(&rule->list); + } + rule_free(rule); +} + +static bool +rule_has_out_port(const struct rule *rule, uint16_t out_port) +{ + const union ofp_action *oa; + struct actions_iterator i; + + if (out_port == htons(OFPP_NONE)) { + return true; + } + for (oa = actions_first(&i, rule->actions, rule->n_actions); oa; + oa = actions_next(&i)) { + if (oa->type == htons(OFPAT_OUTPUT) && oa->output.port == out_port) { + return true; + } + } + return false; +} + +/* Executes the actions indicated by 'rule' on 'packet', which is in flow + * 'flow' and is considered to have arrived on ODP port 'flow->in_port'. + * + * The flow that 'packet' actually contains does not need to actually match + * 'rule'; the actions in 'rule' will be applied to it either way. Likewise, + * the packet and byte counters for 'rule' will be credited for the packet sent + * out whether or not the packet actually matches 'rule'. + * + * If 'rule' is an exact-match rule and 'flow' actually equals the rule's flow, + * the caller must already have accurately composed ODP actions for it given + * 'packet' using rule_make_actions(). 
If 'rule' is a wildcard rule, or if + * 'rule' is an exact-match rule but 'flow' is not the rule's flow, then this + * function will compose a set of ODP actions based on 'rule''s OpenFlow + * actions and apply them to 'packet'. */ +static void +rule_execute(struct ofproto *ofproto, struct rule *rule, + struct ofpbuf *packet, const flow_t *flow) +{ + const union odp_action *actions; + size_t n_actions; + struct odp_actions a; + + /* Grab or compose the ODP actions. + * + * The special case for an exact-match 'rule' where 'flow' is not the + * rule's flow is important to avoid, e.g., sending a packet out its input + * port simply because the ODP actions were composed for the wrong + * scenario. */ + if (rule->cr.wc.wildcards || !flow_equal(flow, &rule->cr.flow)) { + struct rule *super = rule->super ? rule->super : rule; + if (xlate_actions(super->actions, super->n_actions, flow, ofproto, - packet, &a, NULL, 0)) { ++ packet, &a, NULL, 0, NULL)) { + return; + } + actions = a.actions; + n_actions = a.n_actions; + } else { + actions = rule->odp_actions; + n_actions = rule->n_odp_actions; + } + + /* Execute the ODP actions. */ + if (!dpif_execute(ofproto->dpif, flow->in_port, + actions, n_actions, packet)) { + struct odp_flow_stats stats; + flow_extract_stats(flow, packet, &stats); - update_stats(rule, &stats); ++ update_stats(ofproto, rule, &stats); + rule->used = time_msec(); ++ netflow_flow_update_time(ofproto->netflow, &rule->nf_flow, rule->used); + } +} + +static void +rule_insert(struct ofproto *p, struct rule *rule, struct ofpbuf *packet, + uint16_t in_port) +{ + struct rule *displaced_rule; + + /* Insert the rule in the classifier. */ + displaced_rule = rule_from_cls_rule(classifier_insert(&p->cls, &rule->cr)); + if (!rule->cr.wc.wildcards) { + rule_make_actions(p, rule, packet); + } + + /* Send the packet and credit it to the rule. 
*/ + if (packet) { + flow_t flow; + flow_extract(packet, in_port, &flow); + rule_execute(p, rule, packet, &flow); + } + + /* Install the rule in the datapath only after sending the packet, to + * avoid packet reordering. */ + if (rule->cr.wc.wildcards) { + COVERAGE_INC(ofproto_add_wc_flow); + p->need_revalidate = true; + } else { + rule_install(p, rule, displaced_rule); + } + + /* Free the rule that was displaced, if any. */ + if (displaced_rule) { + rule_destroy(p, displaced_rule); + } +} + +static struct rule * +rule_create_subrule(struct ofproto *ofproto, struct rule *rule, + const flow_t *flow) +{ - struct rule *subrule = rule_create(rule, NULL, 0, ++ struct rule *subrule = rule_create(ofproto, rule, NULL, 0, + rule->idle_timeout, rule->hard_timeout); + COVERAGE_INC(ofproto_subrule_create); + cls_rule_from_flow(&subrule->cr, flow, 0, + (rule->cr.priority <= UINT16_MAX ? UINT16_MAX + : rule->cr.priority)); + classifier_insert_exact(&ofproto->cls, &subrule->cr); + + return subrule; +} + +static void +rule_remove(struct ofproto *ofproto, struct rule *rule) +{ + if (rule->cr.wc.wildcards) { + COVERAGE_INC(ofproto_del_wc_flow); + ofproto->need_revalidate = true; + } else { + rule_uninstall(ofproto, rule); + } + classifier_remove(&ofproto->cls, &rule->cr); + rule_destroy(ofproto, rule); +} + +/* Returns true if the actions changed, false otherwise. */ +static bool +rule_make_actions(struct ofproto *p, struct rule *rule, + const struct ofpbuf *packet) +{ + const struct rule *super; + struct odp_actions a; + size_t actions_len; + + assert(!rule->cr.wc.wildcards); + + super = rule->super ? 
rule->super : rule; + rule->tags = 0; + xlate_actions(super->actions, super->n_actions, &rule->cr.flow, p, - packet, &a, &rule->tags, &rule->may_install); ++ packet, &a, &rule->tags, &rule->may_install, ++ &rule->nf_flow.output_iface); + + actions_len = a.n_actions * sizeof *a.actions; + if (rule->n_odp_actions != a.n_actions + || memcmp(rule->odp_actions, a.actions, actions_len)) { + COVERAGE_INC(ofproto_odp_unchanged); + free(rule->odp_actions); + rule->n_odp_actions = a.n_actions; + rule->odp_actions = xmemdup(a.actions, actions_len); + return true; + } else { + return false; + } +} + +static int +do_put_flow(struct ofproto *ofproto, struct rule *rule, int flags, + struct odp_flow_put *put) +{ + memset(&put->flow.stats, 0, sizeof put->flow.stats); + put->flow.key = rule->cr.flow; + put->flow.actions = rule->odp_actions; + put->flow.n_actions = rule->n_odp_actions; + put->flags = flags; + return dpif_flow_put(ofproto->dpif, put); +} + +static void +rule_install(struct ofproto *p, struct rule *rule, struct rule *displaced_rule) +{ + assert(!rule->cr.wc.wildcards); + + if (rule->may_install) { + struct odp_flow_put put; + if (!do_put_flow(p, rule, + ODPPF_CREATE | ODPPF_MODIFY | ODPPF_ZERO_STATS, + &put)) { + rule->installed = true; + if (displaced_rule) { - update_stats(rule, &put.flow.stats); ++ update_stats(p, rule, &put.flow.stats); + rule_post_uninstall(p, displaced_rule); + } + } + } else if (displaced_rule) { + rule_uninstall(p, displaced_rule); + } +} + +static void +rule_reinstall(struct ofproto *ofproto, struct rule *rule) +{ + if (rule->installed) { + struct odp_flow_put put; + COVERAGE_INC(ofproto_dp_missed); + do_put_flow(ofproto, rule, ODPPF_CREATE | ODPPF_MODIFY, &put); + } else { + rule_install(ofproto, rule, NULL); + } +} + +static void +rule_update_actions(struct ofproto *ofproto, struct rule *rule) +{ + bool actions_changed = rule_make_actions(ofproto, rule, NULL); + if (rule->may_install) { + if (rule->installed) { + if (actions_changed) { + /* 
XXX should really do rule_post_uninstall() for the *old* set + * of actions, and distinguish the old stats from the new. */ + struct odp_flow_put put; + do_put_flow(ofproto, rule, ODPPF_CREATE | ODPPF_MODIFY, &put); + } + } else { + rule_install(ofproto, rule, NULL); + } + } else { + rule_uninstall(ofproto, rule); + } +} + +static void +rule_account(struct ofproto *ofproto, struct rule *rule, uint64_t extra_bytes) +{ + uint64_t total_bytes = rule->byte_count + extra_bytes; + + if (ofproto->ofhooks->account_flow_cb + && total_bytes > rule->accounted_bytes) + { + ofproto->ofhooks->account_flow_cb( + &rule->cr.flow, rule->odp_actions, rule->n_odp_actions, + total_bytes - rule->accounted_bytes, ofproto->aux); + rule->accounted_bytes = total_bytes; + } +} + +static void +rule_uninstall(struct ofproto *p, struct rule *rule) +{ + assert(!rule->cr.wc.wildcards); + if (rule->installed) { + struct odp_flow odp_flow; + + odp_flow.key = rule->cr.flow; + odp_flow.actions = NULL; + odp_flow.n_actions = 0; + if (!dpif_flow_del(p->dpif, &odp_flow)) { - update_stats(rule, &odp_flow.stats); ++ update_stats(p, rule, &odp_flow.stats); + } + rule->installed = false; + + rule_post_uninstall(p, rule); + } +} + ++static bool ++is_controller_rule(struct rule *rule) ++{ ++ /* If the only action is send to the controller then don't report ++ * NetFlow expiration messages since it is just part of the control ++ * logic for the network and not real traffic. 
*/ ++ ++ if (rule && rule->super) { ++ struct rule *super = rule->super; ++ ++ return super->n_actions == 1 && ++ super->actions[0].type == htons(OFPAT_OUTPUT) && ++ super->actions[0].output.port == htons(OFPP_CONTROLLER); ++ } ++ ++ return false; ++} ++ +static void +rule_post_uninstall(struct ofproto *ofproto, struct rule *rule) +{ + struct rule *super = rule->super; + + rule_account(ofproto, rule, 0); - if (ofproto->netflow && rule->byte_count) { ++ ++ if (ofproto->netflow && !is_controller_rule(rule)) { + struct ofexpired expired; + expired.flow = rule->cr.flow; + expired.packet_count = rule->packet_count; + expired.byte_count = rule->byte_count; + expired.used = rule->used; - expired.created = rule->created; - expired.tcp_flags = rule->tcp_flags; - expired.ip_tos = rule->ip_tos; - netflow_expire(ofproto->netflow, &expired); ++ netflow_expire(ofproto->netflow, &rule->nf_flow, &expired); + } + if (super) { + super->packet_count += rule->packet_count; + super->byte_count += rule->byte_count; - super->tcp_flags |= rule->tcp_flags; - if (rule->packet_count) { - super->ip_tos = rule->ip_tos; - } - } + - /* Reset counters to prevent double counting if the rule ever gets - * reinstalled. */ - rule->packet_count = 0; - rule->byte_count = 0; - rule->accounted_bytes = 0; - rule->tcp_flags = 0; - rule->ip_tos = 0; ++ /* Reset counters to prevent double counting if the rule ever gets ++ * reinstalled. 
*/ ++ rule->packet_count = 0; ++ rule->byte_count = 0; ++ rule->accounted_bytes = 0; ++ ++ netflow_flow_clear(&rule->nf_flow); ++ } +} + +static void +queue_tx(struct ofpbuf *msg, const struct ofconn *ofconn, + struct rconn_packet_counter *counter) +{ + update_openflow_length(msg); + if (rconn_send(ofconn->rconn, msg, counter)) { + ofpbuf_delete(msg); + } +} + +static void +send_error(const struct ofconn *ofconn, const struct ofp_header *oh, + int error, const void *data, size_t len) +{ + struct ofpbuf *buf; + struct ofp_error_msg *oem; + + if (!(error >> 16)) { + VLOG_WARN_RL(&rl, "not sending bad error code %d to controller", + error); + return; + } + + COVERAGE_INC(ofproto_error); + oem = make_openflow_xid(len + sizeof *oem, OFPT_ERROR, + oh ? oh->xid : 0, &buf); + oem->type = htons((unsigned int) error >> 16); + oem->code = htons(error & 0xffff); + memcpy(oem->data, data, len); + queue_tx(buf, ofconn, ofconn->reply_counter); +} + +static void +send_error_oh(const struct ofconn *ofconn, const struct ofp_header *oh, + int error) +{ + size_t oh_length = ntohs(oh->length); + send_error(ofconn, oh, error, oh, MIN(oh_length, 64)); +} + +static void +hton_ofp_phy_port(struct ofp_phy_port *opp) +{ + opp->port_no = htons(opp->port_no); + opp->config = htonl(opp->config); + opp->state = htonl(opp->state); + opp->curr = htonl(opp->curr); + opp->advertised = htonl(opp->advertised); + opp->supported = htonl(opp->supported); + opp->peer = htonl(opp->peer); +} + +static int +handle_echo_request(struct ofconn *ofconn, struct ofp_header *oh) +{ + struct ofp_header *rq = oh; + queue_tx(make_echo_reply(rq), ofconn, ofconn->reply_counter); + return 0; +} + +static int +handle_features_request(struct ofproto *p, struct ofconn *ofconn, + struct ofp_header *oh) +{ + struct ofp_switch_features *osf; + struct ofpbuf *buf; + unsigned int port_no; + struct ofport *port; + + osf = make_openflow_xid(sizeof *osf, OFPT_FEATURES_REPLY, oh->xid, &buf); + osf->datapath_id = 
htonll(p->datapath_id); + osf->n_buffers = htonl(pktbuf_capacity()); + osf->n_tables = 2; + osf->capabilities = htonl(OFPC_FLOW_STATS | OFPC_TABLE_STATS | + OFPC_PORT_STATS | OFPC_MULTI_PHY_TX); + osf->actions = htonl((1u << OFPAT_OUTPUT) | + (1u << OFPAT_SET_VLAN_VID) | + (1u << OFPAT_SET_VLAN_PCP) | + (1u << OFPAT_STRIP_VLAN) | + (1u << OFPAT_SET_DL_SRC) | + (1u << OFPAT_SET_DL_DST) | + (1u << OFPAT_SET_NW_SRC) | + (1u << OFPAT_SET_NW_DST) | + (1u << OFPAT_SET_TP_SRC) | + (1u << OFPAT_SET_TP_DST)); + + PORT_ARRAY_FOR_EACH (port, &p->ports, port_no) { + hton_ofp_phy_port(ofpbuf_put(buf, &port->opp, sizeof port->opp)); + } + + queue_tx(buf, ofconn, ofconn->reply_counter); + return 0; +} + +static int +handle_get_config_request(struct ofproto *p, struct ofconn *ofconn, + struct ofp_header *oh) +{ + struct ofpbuf *buf; + struct ofp_switch_config *osc; + uint16_t flags; + bool drop_frags; + + /* Figure out flags. */ + dpif_get_drop_frags(p->dpif, &drop_frags); + flags = drop_frags ? OFPC_FRAG_DROP : OFPC_FRAG_NORMAL; + if (ofconn->send_flow_exp) { + flags |= OFPC_SEND_FLOW_EXP; + } + + /* Send reply. 
*/ + osc = make_openflow_xid(sizeof *osc, OFPT_GET_CONFIG_REPLY, oh->xid, &buf); + osc->flags = htons(flags); + osc->miss_send_len = htons(ofconn->miss_send_len); + queue_tx(buf, ofconn, ofconn->reply_counter); + + return 0; +} + ++/* Handles the OFPT_SET_CONFIG message 'osc' received on 'ofconn'. Returns 0 if ++ * successful, otherwise the nonzero value returned by check_ofp_message(). */ +static int +handle_set_config(struct ofproto *p, struct ofconn *ofconn, + struct ofp_switch_config *osc) +{ + uint16_t flags; + int error; + + error = check_ofp_message(&osc->header, OFPT_SET_CONFIG, sizeof *osc); + if (error) { + return error; + } + flags = ntohs(osc->flags); + + ofconn->send_flow_exp = (flags & OFPC_SEND_FLOW_EXP) != 0; + + if (ofconn == p->controller) { + switch (flags & OFPC_FRAG_MASK) { + case OFPC_FRAG_NORMAL: + dpif_set_drop_frags(p->dpif, false); + break; + case OFPC_FRAG_DROP: + dpif_set_drop_frags(p->dpif, true); + break; + default: + VLOG_WARN_RL(&rl, "requested bad fragment mode (flags=%"PRIx16")", - osc->flags); ++ flags); + break; + } + } + + if ((ntohs(osc->miss_send_len) != 0) != (ofconn->miss_send_len != 0)) { + if (ntohs(osc->miss_send_len) != 0) { + ofconn->pktbuf = pktbuf_create(); + } else { + pktbuf_destroy(ofconn->pktbuf); ++ /* Forget the freed buffer so that later users of 'ofconn->pktbuf', ++ * such as ofconn_destroy(), do not touch it again. */ ++ ofconn->pktbuf = NULL; + } + } + + ofconn->miss_send_len = ntohs(osc->miss_send_len); + + return 0; +} + +static void - add_output_group_action(struct odp_actions *actions, uint16_t group) ++add_output_group_action(struct odp_actions *actions, uint16_t group, ++ uint16_t *nf_output_iface) +{ + odp_actions_add(actions, ODPAT_OUTPUT_GROUP)->output_group.group = group; ++ ++ if (group == DP_GROUP_ALL || group == DP_GROUP_FLOOD) { ++ *nf_output_iface = NF_OUT_FLOOD; ++ } +} + +static void +add_controller_action(struct odp_actions *actions, + const struct ofp_action_output *oao) +{ + union odp_action *a = odp_actions_add(actions, ODPAT_CONTROLLER); + a->controller.arg = oao->max_len ? ntohs(oao->max_len) : UINT32_MAX; +} + +struct action_xlate_ctx { + /* Input. */ + const flow_t *flow; /* Flow to which these actions correspond. */ + int recurse; /* Recursion level, via xlate_table_action. 
*/ + struct ofproto *ofproto; + const struct ofpbuf *packet; /* The packet corresponding to 'flow', or a + * null pointer if we are revalidating + * without a packet to refer to. */ + + /* Output. */ + struct odp_actions *out; /* Datapath actions. */ + tag_type *tags; /* Tags associated with OFPP_NORMAL actions. */ - bool may_setup_flow; /* True ordinarily; false if the actions must ++ bool may_set_up_flow; /* True ordinarily; false if the actions must + * be reassessed for every packet. */ ++ uint16_t nf_output_iface; /* Output interface index for NetFlow. */ +}; + +static void do_xlate_actions(const union ofp_action *in, size_t n_in, + struct action_xlate_ctx *ctx); + +static void +add_output_action(struct action_xlate_ctx *ctx, uint16_t port) +{ + const struct ofport *ofport = port_array_get(&ctx->ofproto->ports, port); + + if (ofport) { + if (ofport->opp.config & OFPPC_NO_FWD) { + /* Forwarding disabled on port. */ + return; + } + } else { + /* + * We don't have an ofport record for this port, but it doesn't hurt to + * allow forwarding to it anyhow. Maybe such a port will appear later + * and we're pre-populating the flow table. + */ + } + + odp_actions_add(ctx->out, ODPAT_OUTPUT)->output.port = port; ++ ctx->nf_output_iface = port; +} + +static struct rule * +lookup_valid_rule(struct ofproto *ofproto, const flow_t *flow) +{ + struct rule *rule; + rule = rule_from_cls_rule(classifier_lookup(&ofproto->cls, flow)); + + /* The rule we found might not be valid, since we could be in need of + * revalidation. If it is not valid, don't return it. 
*/ + if (rule + && rule->super + && ofproto->need_revalidate + && !revalidate_rule(ofproto, rule)) { + COVERAGE_INC(ofproto_invalidated); + return NULL; + } + + return rule; +} + +static void +xlate_table_action(struct action_xlate_ctx *ctx, uint16_t in_port) +{ + if (!ctx->recurse) { + struct rule *rule; + flow_t flow; + + flow = *ctx->flow; + flow.in_port = in_port; + + rule = lookup_valid_rule(ctx->ofproto, &flow); + if (rule) { + if (rule->super) { + rule = rule->super; + } + + ctx->recurse++; + do_xlate_actions(rule->actions, rule->n_actions, ctx); + ctx->recurse--; + } + } +} + +static void +xlate_output_action(struct action_xlate_ctx *ctx, + const struct ofp_action_output *oao) +{ + uint16_t odp_port; ++ uint16_t prev_nf_output_iface = ctx->nf_output_iface; ++ ++ ctx->nf_output_iface = NF_OUT_DROP; + + switch (ntohs(oao->port)) { + case OFPP_IN_PORT: + add_output_action(ctx, ctx->flow->in_port); + break; + case OFPP_TABLE: + xlate_table_action(ctx, ctx->flow->in_port); + break; + case OFPP_NORMAL: + if (!ctx->ofproto->ofhooks->normal_cb(ctx->flow, ctx->packet, + ctx->out, ctx->tags, ++ &ctx->nf_output_iface, + ctx->ofproto->aux)) { + COVERAGE_INC(ofproto_uninstallable); - ctx->may_setup_flow = false; ++ ctx->may_set_up_flow = false; + } + break; + case OFPP_FLOOD: - add_output_group_action(ctx->out, DP_GROUP_FLOOD); ++ add_output_group_action(ctx->out, DP_GROUP_FLOOD, ++ &ctx->nf_output_iface); + break; + case OFPP_ALL: - add_output_group_action(ctx->out, DP_GROUP_ALL); ++ add_output_group_action(ctx->out, DP_GROUP_ALL, &ctx->nf_output_iface); + break; + case OFPP_CONTROLLER: + add_controller_action(ctx->out, oao); + break; + case OFPP_LOCAL: + add_output_action(ctx, ODPP_LOCAL); + break; + default: + odp_port = ofp_port_to_odp_port(ntohs(oao->port)); + if (odp_port != ctx->flow->in_port) { + add_output_action(ctx, odp_port); + } + break; + } ++ ++ if (prev_nf_output_iface == NF_OUT_FLOOD) { ++ ctx->nf_output_iface = NF_OUT_FLOOD; ++ } else if 
(ctx->nf_output_iface == NF_OUT_DROP) { ++ ctx->nf_output_iface = prev_nf_output_iface; ++ } else if (prev_nf_output_iface != NF_OUT_DROP && ++ ctx->nf_output_iface != NF_OUT_FLOOD) { ++ ctx->nf_output_iface = NF_OUT_MULTI; ++ } +} + +static void +xlate_nicira_action(struct action_xlate_ctx *ctx, + const struct nx_action_header *nah) +{ + const struct nx_action_resubmit *nar; + int subtype = ntohs(nah->subtype); + + assert(nah->vendor == htonl(NX_VENDOR_ID)); + switch (subtype) { + case NXAST_RESUBMIT: + nar = (const struct nx_action_resubmit *) nah; + xlate_table_action(ctx, ofp_port_to_odp_port(ntohs(nar->in_port))); + break; + + default: + VLOG_DBG_RL(&rl, "unknown Nicira action type %"PRIu16, subtype); + break; + } +} + +static void +do_xlate_actions(const union ofp_action *in, size_t n_in, + struct action_xlate_ctx *ctx) +{ + struct actions_iterator iter; + const union ofp_action *ia; + const struct ofport *port; + + port = port_array_get(&ctx->ofproto->ports, ctx->flow->in_port); + if (port && port->opp.config & (OFPPC_NO_RECV | OFPPC_NO_RECV_STP) && + port->opp.config & (eth_addr_equals(ctx->flow->dl_dst, stp_eth_addr) + ? OFPPC_NO_RECV_STP : OFPPC_NO_RECV)) { + /* Drop this flow. 
*/ + return; + } + + for (ia = actions_first(&iter, in, n_in); ia; ia = actions_next(&iter)) { + uint16_t type = ntohs(ia->type); + union odp_action *oa; + + switch (type) { + case OFPAT_OUTPUT: + xlate_output_action(ctx, &ia->output); + break; + + case OFPAT_SET_VLAN_VID: + oa = odp_actions_add(ctx->out, ODPAT_SET_VLAN_VID); + oa->vlan_vid.vlan_vid = ia->vlan_vid.vlan_vid; + break; + + case OFPAT_SET_VLAN_PCP: + oa = odp_actions_add(ctx->out, ODPAT_SET_VLAN_PCP); + oa->vlan_pcp.vlan_pcp = ia->vlan_pcp.vlan_pcp; + break; + + case OFPAT_STRIP_VLAN: + odp_actions_add(ctx->out, ODPAT_STRIP_VLAN); + break; + + case OFPAT_SET_DL_SRC: + oa = odp_actions_add(ctx->out, ODPAT_SET_DL_SRC); + memcpy(oa->dl_addr.dl_addr, + ((struct ofp_action_dl_addr *) ia)->dl_addr, ETH_ADDR_LEN); + break; + + case OFPAT_SET_DL_DST: + oa = odp_actions_add(ctx->out, ODPAT_SET_DL_DST); + memcpy(oa->dl_addr.dl_addr, + ((struct ofp_action_dl_addr *) ia)->dl_addr, ETH_ADDR_LEN); + break; + + case OFPAT_SET_NW_SRC: + oa = odp_actions_add(ctx->out, ODPAT_SET_NW_SRC); + oa->nw_addr.nw_addr = ia->nw_addr.nw_addr; + break; + ++ /* OFPAT_SET_NW_DST and OFPAT_SET_TP_DST are advertised by ++ * handle_features_request(), so translate them like their _SRC ++ * counterparts instead of letting them fall through to 'default'. */ ++ case OFPAT_SET_NW_DST: ++ oa = odp_actions_add(ctx->out, ODPAT_SET_NW_DST); ++ oa->nw_addr.nw_addr = ia->nw_addr.nw_addr; ++ break; ++ + case OFPAT_SET_TP_SRC: + oa = odp_actions_add(ctx->out, ODPAT_SET_TP_SRC); + oa->tp_port.tp_port = ia->tp_port.tp_port; + break; + ++ case OFPAT_SET_TP_DST: ++ oa = odp_actions_add(ctx->out, ODPAT_SET_TP_DST); ++ oa->tp_port.tp_port = ia->tp_port.tp_port; ++ break; ++ + case OFPAT_VENDOR: + xlate_nicira_action(ctx, (const struct nx_action_header *) ia); + break; + + default: + VLOG_DBG_RL(&rl, "unknown action type %"PRIu16, type); + break; + } + } +} + +static int +xlate_actions(const union ofp_action *in, size_t n_in, + const flow_t *flow, struct ofproto *ofproto, + const struct ofpbuf *packet, - struct odp_actions *out, tag_type *tags, bool *may_setup_flow) ++ struct odp_actions *out, tag_type *tags, bool *may_set_up_flow, ++ uint16_t *nf_output_iface) +{ + tag_type no_tags = 0; + struct action_xlate_ctx ctx; + COVERAGE_INC(ofproto_ofp2odp); + odp_actions_init(out); + ctx.flow = flow; + ctx.recurse = 0; + ctx.ofproto = ofproto; + ctx.packet = packet; + ctx.out = out; + ctx.tags = tags ? 
tags : &no_tags; - ctx.may_setup_flow = true; ++ ctx.may_set_up_flow = true; ++ ctx.nf_output_iface = NF_OUT_DROP; + do_xlate_actions(in, n_in, &ctx); + - /* Check with in-band control to see if we're allowed to setup this ++ /* Check with in-band control to see if we're allowed to set up this + * flow. */ + if (!in_band_rule_check(ofproto->in_band, flow, out)) { - ctx.may_setup_flow = false; ++ ctx.may_set_up_flow = false; + } + - if (may_setup_flow) { - *may_setup_flow = ctx.may_setup_flow; ++ if (may_set_up_flow) { ++ *may_set_up_flow = ctx.may_set_up_flow; ++ } ++ if (nf_output_iface) { ++ *nf_output_iface = ctx.nf_output_iface; + } + if (odp_actions_overflow(out)) { + odp_actions_init(out); + return ofp_mkerr(OFPET_BAD_ACTION, OFPBAC_TOO_MANY); + } + return 0; +} + +static int +handle_packet_out(struct ofproto *p, struct ofconn *ofconn, + struct ofp_header *oh) +{ + struct ofp_packet_out *opo; + struct ofpbuf payload, *buffer; + struct odp_actions actions; + int n_actions; + uint16_t in_port; + flow_t flow; + int error; + + error = check_ofp_packet_out(oh, &payload, &n_actions, p->max_ports); + if (error) { + return error; + } + opo = (struct ofp_packet_out *) oh; + + COVERAGE_INC(ofproto_packet_out); + if (opo->buffer_id != htonl(UINT32_MAX)) { + error = pktbuf_retrieve(ofconn->pktbuf, ntohl(opo->buffer_id), + &buffer, &in_port); + if (error || !buffer) { + return error; + } + payload = *buffer; + } else { + buffer = NULL; + } + + flow_extract(&payload, ofp_port_to_odp_port(ntohs(opo->in_port)), &flow); + error = xlate_actions((const union ofp_action *) opo->actions, n_actions, - &flow, p, &payload, &actions, NULL, NULL); ++ &flow, p, &payload, &actions, NULL, NULL, NULL); + if (error) { + return error; + } + + dpif_execute(p->dpif, flow.in_port, actions.actions, actions.n_actions, + &payload); + ofpbuf_delete(buffer); + + return 0; +} + +static void +update_port_config(struct ofproto *p, struct ofport *port, + uint32_t config, uint32_t mask) +{ + mask &= 
config ^ port->opp.config; + if (mask & OFPPC_PORT_DOWN) { + if (config & OFPPC_PORT_DOWN) { + netdev_turn_flags_off(port->netdev, NETDEV_UP, true); + } else { + netdev_turn_flags_on(port->netdev, NETDEV_UP, true); + } + } +#define REVALIDATE_BITS (OFPPC_NO_RECV | OFPPC_NO_RECV_STP | OFPPC_NO_FWD) + if (mask & REVALIDATE_BITS) { + COVERAGE_INC(ofproto_costly_flags); + port->opp.config ^= mask & REVALIDATE_BITS; + p->need_revalidate = true; + } +#undef REVALIDATE_BITS + if (mask & OFPPC_NO_FLOOD) { + port->opp.config ^= OFPPC_NO_FLOOD; + refresh_port_group(p, DP_GROUP_FLOOD); + } + if (mask & OFPPC_NO_PACKET_IN) { + port->opp.config ^= OFPPC_NO_PACKET_IN; + } +} + +static int +handle_port_mod(struct ofproto *p, struct ofp_header *oh) +{ + const struct ofp_port_mod *opm; + struct ofport *port; + int error; + + error = check_ofp_message(oh, OFPT_PORT_MOD, sizeof *opm); + if (error) { + return error; + } + opm = (struct ofp_port_mod *) oh; + + port = port_array_get(&p->ports, + ofp_port_to_odp_port(ntohs(opm->port_no))); + if (!port) { + return ofp_mkerr(OFPET_PORT_MOD_FAILED, OFPPMFC_BAD_PORT); + } else if (memcmp(port->opp.hw_addr, opm->hw_addr, OFP_ETH_ALEN)) { + return ofp_mkerr(OFPET_PORT_MOD_FAILED, OFPPMFC_BAD_HW_ADDR); + } else { + update_port_config(p, port, ntohl(opm->config), ntohl(opm->mask)); + if (opm->advertise) { + netdev_set_advertisements(port->netdev, ntohl(opm->advertise)); + } + } + return 0; +} + +static struct ofpbuf * +make_stats_reply(uint32_t xid, uint16_t type, size_t body_len) +{ + struct ofp_stats_reply *osr; + struct ofpbuf *msg; + + msg = ofpbuf_new(MIN(sizeof *osr + body_len, UINT16_MAX)); + osr = put_openflow_xid(sizeof *osr, OFPT_STATS_REPLY, xid, msg); + osr->type = type; + osr->flags = htons(0); + return msg; +} + +static struct ofpbuf * +start_stats_reply(const struct ofp_stats_request *request, size_t body_len) +{ + return make_stats_reply(request->header.xid, request->type, body_len); +} + +static void * +append_stats_reply(size_t 
nbytes, struct ofconn *ofconn, struct ofpbuf **msgp) +{ + struct ofpbuf *msg = *msgp; + assert(nbytes <= UINT16_MAX - sizeof(struct ofp_stats_reply)); + if (nbytes + msg->size > UINT16_MAX) { + struct ofp_stats_reply *reply = msg->data; + reply->flags = htons(OFPSF_REPLY_MORE); + *msgp = make_stats_reply(reply->header.xid, reply->type, nbytes); + queue_tx(msg, ofconn, ofconn->reply_counter); + } + return ofpbuf_put_uninit(*msgp, nbytes); +} + +static int +handle_desc_stats_request(struct ofproto *p, struct ofconn *ofconn, + struct ofp_stats_request *request) +{ + struct ofp_desc_stats *ods; + struct ofpbuf *msg; + + msg = start_stats_reply(request, sizeof *ods); + ods = append_stats_reply(sizeof *ods, ofconn, &msg); + strncpy(ods->mfr_desc, p->manufacturer, sizeof ods->mfr_desc); + strncpy(ods->hw_desc, p->hardware, sizeof ods->hw_desc); + strncpy(ods->sw_desc, p->software, sizeof ods->sw_desc); + strncpy(ods->serial_num, p->serial, sizeof ods->serial_num); + queue_tx(msg, ofconn, ofconn->reply_counter); + + return 0; +} + +static void +count_subrules(struct cls_rule *cls_rule, void *n_subrules_) +{ + struct rule *rule = rule_from_cls_rule(cls_rule); + int *n_subrules = n_subrules_; + + if (rule->super) { + (*n_subrules)++; + } +} + +static int +handle_table_stats_request(struct ofproto *p, struct ofconn *ofconn, + struct ofp_stats_request *request) +{ + struct ofp_table_stats *ots; + struct ofpbuf *msg; + struct odp_stats dpstats; + int n_exact, n_subrules, n_wild; + + msg = start_stats_reply(request, sizeof *ots * 2); + + /* Count rules of various kinds. */ + n_subrules = 0; + classifier_for_each(&p->cls, CLS_INC_EXACT, count_subrules, &n_subrules); + n_exact = classifier_count_exact(&p->cls) - n_subrules; + n_wild = classifier_count(&p->cls) - classifier_count_exact(&p->cls); + + /* Hash table. 
*/ + dpif_get_dp_stats(p->dpif, &dpstats); + ots = append_stats_reply(sizeof *ots, ofconn, &msg); + memset(ots, 0, sizeof *ots); + ots->table_id = TABLEID_HASH; + strcpy(ots->name, "hash"); + ots->wildcards = htonl(0); + ots->max_entries = htonl(dpstats.max_capacity); + ots->active_count = htonl(n_exact); + ots->lookup_count = htonll(dpstats.n_frags + dpstats.n_hit + + dpstats.n_missed); + ots->matched_count = htonll(dpstats.n_hit); /* XXX */ + + /* Classifier table. */ + ots = append_stats_reply(sizeof *ots, ofconn, &msg); + memset(ots, 0, sizeof *ots); + ots->table_id = TABLEID_CLASSIFIER; + strcpy(ots->name, "classifier"); + ots->wildcards = htonl(OFPFW_ALL); + ots->max_entries = htonl(65536); + ots->active_count = htonl(n_wild); + ots->lookup_count = htonll(0); /* XXX */ + ots->matched_count = htonll(0); /* XXX */ + + queue_tx(msg, ofconn, ofconn->reply_counter); + return 0; +} + +static int +handle_port_stats_request(struct ofproto *p, struct ofconn *ofconn, + struct ofp_stats_request *request) +{ + struct ofp_port_stats *ops; + struct ofpbuf *msg; + struct ofport *port; + unsigned int port_no; + + msg = start_stats_reply(request, sizeof *ops * 16); + PORT_ARRAY_FOR_EACH (port, &p->ports, port_no) { + struct netdev_stats stats; + + /* Intentionally ignore return value, since errors will set 'stats' to + * all-1s, which is correct for OpenFlow, and netdev_get_stats() will + * log errors. 
*/ + netdev_get_stats(port->netdev, &stats); + + ops = append_stats_reply(sizeof *ops, ofconn, &msg); + ops->port_no = htons(odp_port_to_ofp_port(port_no)); + memset(ops->pad, 0, sizeof ops->pad); + ops->rx_packets = htonll(stats.rx_packets); + ops->tx_packets = htonll(stats.tx_packets); + ops->rx_bytes = htonll(stats.rx_bytes); + ops->tx_bytes = htonll(stats.tx_bytes); + ops->rx_dropped = htonll(stats.rx_dropped); + ops->tx_dropped = htonll(stats.tx_dropped); + ops->rx_errors = htonll(stats.rx_errors); + ops->tx_errors = htonll(stats.tx_errors); + ops->rx_frame_err = htonll(stats.rx_frame_errors); + ops->rx_over_err = htonll(stats.rx_over_errors); + ops->rx_crc_err = htonll(stats.rx_crc_errors); + ops->collisions = htonll(stats.collisions); + } + + queue_tx(msg, ofconn, ofconn->reply_counter); + return 0; +} + +struct flow_stats_cbdata { + struct ofproto *ofproto; + struct ofconn *ofconn; + uint16_t out_port; + struct ofpbuf *msg; +}; + +static void +query_stats(struct ofproto *p, struct rule *rule, + uint64_t *packet_countp, uint64_t *byte_countp) +{ + uint64_t packet_count, byte_count; + struct rule *subrule; + struct odp_flow *odp_flows; + size_t n_odp_flows; + + packet_count = rule->packet_count; + byte_count = rule->byte_count; + + n_odp_flows = rule->cr.wc.wildcards ? 
list_size(&rule->list) : 1; + odp_flows = xcalloc(1, n_odp_flows * sizeof *odp_flows); + if (rule->cr.wc.wildcards) { + size_t i = 0; + LIST_FOR_EACH (subrule, struct rule, list, &rule->list) { + odp_flows[i++].key = subrule->cr.flow; + packet_count += subrule->packet_count; + byte_count += subrule->byte_count; + } + } else { + odp_flows[0].key = rule->cr.flow; + } + - packet_count = rule->packet_count; - byte_count = rule->byte_count; ++ /* Add the statistics reported by dpif_flow_get_multiple() to the totals ++ * accumulated above. (The totals must not be reset here, or the subrule ++ * counts just added would be discarded.) */ + if (!dpif_flow_get_multiple(p->dpif, odp_flows, n_odp_flows)) { + size_t i; + for (i = 0; i < n_odp_flows; i++) { + struct odp_flow *odp_flow = &odp_flows[i]; + packet_count += odp_flow->stats.n_packets; + byte_count += odp_flow->stats.n_bytes; + } + } + free(odp_flows); + + *packet_countp = packet_count; + *byte_countp = byte_count; +} + +static void +flow_stats_cb(struct cls_rule *rule_, void *cbdata_) +{ + struct rule *rule = rule_from_cls_rule(rule_); + struct flow_stats_cbdata *cbdata = cbdata_; + struct ofp_flow_stats *ofs; + uint64_t packet_count, byte_count; + size_t act_len, len; + + if (rule_is_hidden(rule) || !rule_has_out_port(rule, cbdata->out_port)) { + return; + } + + act_len = sizeof *rule->actions * rule->n_actions; + len = offsetof(struct ofp_flow_stats, actions) + act_len; + + query_stats(cbdata->ofproto, rule, &packet_count, &byte_count); + + ofs = append_stats_reply(len, cbdata->ofconn, &cbdata->msg); + ofs->length = htons(len); + ofs->table_id = rule->cr.wc.wildcards ? 
TABLEID_CLASSIFIER : TABLEID_HASH; + ofs->pad = 0; + flow_to_match(&rule->cr.flow, rule->cr.wc.wildcards, &ofs->match); + ofs->duration = htonl((time_msec() - rule->created) / 1000); + ofs->priority = htons(rule->cr.priority); + ofs->idle_timeout = htons(rule->idle_timeout); + ofs->hard_timeout = htons(rule->hard_timeout); + memset(ofs->pad2, 0, sizeof ofs->pad2); + ofs->packet_count = htonll(packet_count); + ofs->byte_count = htonll(byte_count); + memcpy(ofs->actions, rule->actions, act_len); +} + +static int +table_id_to_include(uint8_t table_id) +{ + return (table_id == TABLEID_HASH ? CLS_INC_EXACT + : table_id == TABLEID_CLASSIFIER ? CLS_INC_WILD + : table_id == 0xff ? CLS_INC_ALL + : 0); +} + +static int +handle_flow_stats_request(struct ofproto *p, struct ofconn *ofconn, + const struct ofp_stats_request *osr, + size_t arg_size) +{ + struct ofp_flow_stats_request *fsr; + struct flow_stats_cbdata cbdata; + struct cls_rule target; + + if (arg_size != sizeof *fsr) { + return ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_LENGTH); + } + fsr = (struct ofp_flow_stats_request *) osr->body; + + COVERAGE_INC(ofproto_flows_req); + cbdata.ofproto = p; + cbdata.ofconn = ofconn; + cbdata.out_port = fsr->out_port; + cbdata.msg = start_stats_reply(osr, 1024); + cls_rule_from_match(&target, &fsr->match, 0); + classifier_for_each_match(&p->cls, &target, + table_id_to_include(fsr->table_id), + flow_stats_cb, &cbdata); + queue_tx(cbdata.msg, ofconn, ofconn->reply_counter); + return 0; +} + +struct flow_stats_ds_cbdata { + struct ofproto *ofproto; + struct ds *results; +}; + +static void +flow_stats_ds_cb(struct cls_rule *rule_, void *cbdata_) +{ + struct rule *rule = rule_from_cls_rule(rule_); + struct flow_stats_ds_cbdata *cbdata = cbdata_; + struct ds *results = cbdata->results; + struct ofp_match match; + uint64_t packet_count, byte_count; + size_t act_len = sizeof *rule->actions * rule->n_actions; + + /* Don't report on subrules. 
*/ + if (rule->super != NULL) { + return; + } + + query_stats(cbdata->ofproto, rule, &packet_count, &byte_count); + flow_to_ovs_match(&rule->cr.flow, rule->cr.wc.wildcards, &match); + + ds_put_format(results, "duration=%llds, ", + (time_msec() - rule->created) / 1000); + ds_put_format(results, "priority=%u, ", rule->cr.priority); + ds_put_format(results, "n_packets=%"PRIu64", ", packet_count); + ds_put_format(results, "n_bytes=%"PRIu64", ", byte_count); + ofp_print_match(results, &match, true); + ofp_print_actions(results, &rule->actions->header, act_len); + ds_put_cstr(results, "\n"); +} + +/* Adds a pretty-printed description of all flows to 'results', including + * those marked hidden by secchan (e.g., by in-band control). */ +void +ofproto_get_all_flows(struct ofproto *p, struct ds *results) +{ + struct ofp_match match; + struct cls_rule target; + struct flow_stats_ds_cbdata cbdata; + + memset(&match, 0, sizeof match); + match.wildcards = htonl(OFPFW_ALL); + + cbdata.ofproto = p; + cbdata.results = results; + + cls_rule_from_match(&target, &match, 0); + classifier_for_each_match(&p->cls, &target, CLS_INC_ALL, + flow_stats_ds_cb, &cbdata); +} + +struct aggregate_stats_cbdata { + struct ofproto *ofproto; + uint16_t out_port; + uint64_t packet_count; + uint64_t byte_count; + uint32_t n_flows; +}; + +static void +aggregate_stats_cb(struct cls_rule *rule_, void *cbdata_) +{ + struct rule *rule = rule_from_cls_rule(rule_); + struct aggregate_stats_cbdata *cbdata = cbdata_; + uint64_t packet_count, byte_count; + + if (rule_is_hidden(rule) || !rule_has_out_port(rule, cbdata->out_port)) { + return; + } + + query_stats(cbdata->ofproto, rule, &packet_count, &byte_count); + + cbdata->packet_count += packet_count; + cbdata->byte_count += byte_count; + cbdata->n_flows++; +} + +static int +handle_aggregate_stats_request(struct ofproto *p, struct ofconn *ofconn, + const struct ofp_stats_request *osr, + size_t arg_size) +{ + struct ofp_aggregate_stats_request *asr; + struct 
ofp_aggregate_stats_reply *reply; + struct aggregate_stats_cbdata cbdata; + struct cls_rule target; + struct ofpbuf *msg; + + if (arg_size != sizeof *asr) { + return ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_LENGTH); + } + asr = (struct ofp_aggregate_stats_request *) osr->body; + + COVERAGE_INC(ofproto_agg_request); + cbdata.ofproto = p; + cbdata.out_port = asr->out_port; + cbdata.packet_count = 0; + cbdata.byte_count = 0; + cbdata.n_flows = 0; + cls_rule_from_match(&target, &asr->match, 0); + classifier_for_each_match(&p->cls, &target, + table_id_to_include(asr->table_id), + aggregate_stats_cb, &cbdata); + + msg = start_stats_reply(osr, sizeof *reply); + reply = append_stats_reply(sizeof *reply, ofconn, &msg); + reply->flow_count = htonl(cbdata.n_flows); + reply->packet_count = htonll(cbdata.packet_count); + reply->byte_count = htonll(cbdata.byte_count); + queue_tx(msg, ofconn, ofconn->reply_counter); + return 0; +} + +static int +handle_stats_request(struct ofproto *p, struct ofconn *ofconn, + struct ofp_header *oh) +{ + struct ofp_stats_request *osr; + size_t arg_size; + int error; + + error = check_ofp_message_array(oh, OFPT_STATS_REQUEST, sizeof *osr, + 1, &arg_size); + if (error) { + return error; + } + osr = (struct ofp_stats_request *) oh; + + switch (ntohs(osr->type)) { + case OFPST_DESC: + return handle_desc_stats_request(p, ofconn, osr); + + case OFPST_FLOW: + return handle_flow_stats_request(p, ofconn, osr, arg_size); + + case OFPST_AGGREGATE: + return handle_aggregate_stats_request(p, ofconn, osr, arg_size); + + case OFPST_TABLE: + return handle_table_stats_request(p, ofconn, osr); + + case OFPST_PORT: + return handle_port_stats_request(p, ofconn, osr); + + case OFPST_VENDOR: + return ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_VENDOR); + + default: + return ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_STAT); + } +} + +static long long int +msec_from_nsec(uint64_t sec, uint32_t nsec) +{ + return !sec ? 
0 : sec * 1000 + nsec / 1000000; +} + +static void - update_time(struct rule *rule, const struct odp_flow_stats *stats) ++update_time(struct ofproto *ofproto, struct rule *rule, ++ const struct odp_flow_stats *stats) +{ + long long int used = msec_from_nsec(stats->used_sec, stats->used_nsec); + if (used > rule->used) { + rule->used = used; ++ netflow_flow_update_time(ofproto->netflow, &rule->nf_flow, used); + } +} + +static void - update_stats(struct rule *rule, const struct odp_flow_stats *stats) ++update_stats(struct ofproto *ofproto, struct rule *rule, ++ const struct odp_flow_stats *stats) +{ - update_time(rule, stats); - rule->packet_count += stats->n_packets; - rule->byte_count += stats->n_bytes; - rule->tcp_flags |= stats->tcp_flags; + if (stats->n_packets) { - rule->ip_tos = stats->ip_tos; ++ update_time(ofproto, rule, stats); ++ rule->packet_count += stats->n_packets; ++ rule->byte_count += stats->n_bytes; ++ netflow_flow_update_flags(&rule->nf_flow, stats->ip_tos, ++ stats->tcp_flags); + } +} + +static int +add_flow(struct ofproto *p, struct ofconn *ofconn, + struct ofp_flow_mod *ofm, size_t n_actions) +{ + struct ofpbuf *packet; + struct rule *rule; + uint16_t in_port; + int error; + - rule = rule_create(NULL, (const union ofp_action *) ofm->actions, ++ rule = rule_create(p, NULL, (const union ofp_action *) ofm->actions, + n_actions, ntohs(ofm->idle_timeout), + ntohs(ofm->hard_timeout)); + cls_rule_from_match(&rule->cr, &ofm->match, ntohs(ofm->priority)); + + packet = NULL; + error = 0; + if (ofm->buffer_id != htonl(UINT32_MAX)) { + error = pktbuf_retrieve(ofconn->pktbuf, ntohl(ofm->buffer_id), + &packet, &in_port); + } + + rule_insert(p, rule, packet, in_port); + ofpbuf_delete(packet); + return error; +} + +static int +modify_flow(struct ofproto *p, const struct ofp_flow_mod *ofm, + size_t n_actions, uint16_t command, struct rule *rule) +{ + if (rule_is_hidden(rule)) { + return 0; + } + + if (command == OFPFC_DELETE) { + rule_remove(p, rule); + } else { 
+ size_t actions_len = n_actions * sizeof *rule->actions; + + if (n_actions == rule->n_actions + && !memcmp(ofm->actions, rule->actions, actions_len)) + { + return 0; + } + + free(rule->actions); + rule->actions = xmemdup(ofm->actions, actions_len); + rule->n_actions = n_actions; + + if (rule->cr.wc.wildcards) { + COVERAGE_INC(ofproto_mod_wc_flow); + p->need_revalidate = true; + } else { + rule_update_actions(p, rule); + } + } + + return 0; +} + +static int +modify_flows_strict(struct ofproto *p, const struct ofp_flow_mod *ofm, + size_t n_actions, uint16_t command) +{ + struct rule *rule; + uint32_t wildcards; + flow_t flow; + + flow_from_match(&flow, &wildcards, &ofm->match); + rule = rule_from_cls_rule(classifier_find_rule_exactly( + &p->cls, &flow, wildcards, + ntohs(ofm->priority))); + + if (rule) { + if (command == OFPFC_DELETE + && ofm->out_port != htons(OFPP_NONE) + && !rule_has_out_port(rule, ofm->out_port)) { + return 0; + } + + modify_flow(p, ofm, n_actions, command, rule); + } + return 0; +} + +struct modify_flows_cbdata { + struct ofproto *ofproto; + const struct ofp_flow_mod *ofm; + uint16_t out_port; + size_t n_actions; + uint16_t command; +}; + +static void +modify_flows_cb(struct cls_rule *rule_, void *cbdata_) +{ + struct rule *rule = rule_from_cls_rule(rule_); + struct modify_flows_cbdata *cbdata = cbdata_; + + if (cbdata->out_port != htons(OFPP_NONE) + && !rule_has_out_port(rule, cbdata->out_port)) { + return; + } + + modify_flow(cbdata->ofproto, cbdata->ofm, cbdata->n_actions, + cbdata->command, rule); +} + +static int +modify_flows_loose(struct ofproto *p, const struct ofp_flow_mod *ofm, + size_t n_actions, uint16_t command) +{ + struct modify_flows_cbdata cbdata; + struct cls_rule target; + + cbdata.ofproto = p; + cbdata.ofm = ofm; + cbdata.out_port = (command == OFPFC_DELETE ? 
ofm->out_port + : htons(OFPP_NONE)); + cbdata.n_actions = n_actions; + cbdata.command = command; + + cls_rule_from_match(&target, &ofm->match, 0); + + classifier_for_each_match(&p->cls, &target, CLS_INC_ALL, + modify_flows_cb, &cbdata); + return 0; +} + +static int +handle_flow_mod(struct ofproto *p, struct ofconn *ofconn, + struct ofp_flow_mod *ofm) +{ + size_t n_actions; + int error; + + error = check_ofp_message_array(&ofm->header, OFPT_FLOW_MOD, sizeof *ofm, + sizeof *ofm->actions, &n_actions); + if (error) { + return error; + } + + normalize_match(&ofm->match); + if (!ofm->match.wildcards) { + ofm->priority = htons(UINT16_MAX); + } + + error = validate_actions((const union ofp_action *) ofm->actions, + n_actions, p->max_ports); + if (error) { + return error; + } + + switch (ntohs(ofm->command)) { + case OFPFC_ADD: + return add_flow(p, ofconn, ofm, n_actions); + + case OFPFC_MODIFY: + return modify_flows_loose(p, ofm, n_actions, OFPFC_MODIFY); + + case OFPFC_MODIFY_STRICT: + return modify_flows_strict(p, ofm, n_actions, OFPFC_MODIFY); + + case OFPFC_DELETE: + return modify_flows_loose(p, ofm, n_actions, OFPFC_DELETE); + + case OFPFC_DELETE_STRICT: + return modify_flows_strict(p, ofm, n_actions, OFPFC_DELETE); + + default: + return ofp_mkerr(OFPET_FLOW_MOD_FAILED, OFPFMFC_BAD_COMMAND); + } +} + +static void +send_capability_reply(struct ofproto *p, struct ofconn *ofconn, uint32_t xid) +{ + struct ofmp_capability_reply *ocr; + struct ofpbuf *b; + char capabilities[] = "com.nicira.mgmt.manager=false\n"; + + ocr = make_openflow_xid(sizeof(*ocr), OFPT_VENDOR, xid, &b); + ocr->header.header.vendor = htonl(NX_VENDOR_ID); + ocr->header.header.subtype = htonl(NXT_MGMT); + ocr->header.type = htons(OFMPT_CAPABILITY_REPLY); + + ocr->format = htonl(OFMPCOF_SIMPLE); + ocr->mgmt_id = htonll(p->mgmt_id); + + ofpbuf_put(b, capabilities, strlen(capabilities)); + + queue_tx(b, ofconn, ofconn->reply_counter); +} + +static int +handle_ofmp(struct ofproto *p, struct ofconn *ofconn, 
+ struct ofmp_header *ofmph) +{ + size_t msg_len = ntohs(ofmph->header.header.length); + if (msg_len < sizeof(*ofmph)) { - VLOG_WARN_RL(&rl, "dropping short managment message: %d\n", msg_len); ++ VLOG_WARN_RL(&rl, "dropping short managment message: %zu\n", msg_len); + return ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_LENGTH); + } + + if (ofmph->type == htons(OFMPT_CAPABILITY_REQUEST)) { + struct ofmp_capability_request *ofmpcr; + + if (msg_len < sizeof(struct ofmp_capability_request)) { - VLOG_WARN_RL(&rl, "dropping short capability request: %d\n", ++ VLOG_WARN_RL(&rl, "dropping short capability request: %zu\n", + msg_len); + return ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_LENGTH); + } + + ofmpcr = (struct ofmp_capability_request *)ofmph; + if (ofmpcr->format != htonl(OFMPCAF_SIMPLE)) { + /* xxx Find a better type than bad subtype */ + return ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_SUBTYPE); + } + + send_capability_reply(p, ofconn, ofmph->header.header.xid); + return 0; + } else { + return ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_SUBTYPE); + } +} + +static int +handle_vendor(struct ofproto *p, struct ofconn *ofconn, void *msg) +{ + struct ofp_vendor_header *ovh = msg; + struct nicira_header *nh; + + if (ntohs(ovh->header.length) < sizeof(struct ofp_vendor_header)) { + return ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_LENGTH); + } + if (ovh->vendor != htonl(NX_VENDOR_ID)) { + return ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_VENDOR); + } + if (ntohs(ovh->header.length) < sizeof(struct nicira_header)) { + return ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_LENGTH); + } + + nh = msg; + switch (ntohl(nh->subtype)) { + case NXT_STATUS_REQUEST: + return switch_status_handle_request(p->switch_status, ofconn->rconn, + msg); + + case NXT_ACT_SET_CONFIG: + return ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_SUBTYPE); /* XXX */ + + case NXT_ACT_GET_CONFIG: + return ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_SUBTYPE); /* XXX */ + + case NXT_COMMAND_REQUEST: + if (p->executer) { + return 
executer_handle_request(p->executer, ofconn->rconn, msg); + } + break; + + case NXT_MGMT: + return handle_ofmp(p, ofconn, msg); + } + + return ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_SUBTYPE); +} + +static void +handle_openflow(struct ofconn *ofconn, struct ofproto *p, + struct ofpbuf *ofp_msg) +{ + struct ofp_header *oh = ofp_msg->data; + int error; + + COVERAGE_INC(ofproto_recv_openflow); + switch (oh->type) { + case OFPT_ECHO_REQUEST: + error = handle_echo_request(ofconn, oh); + break; + + case OFPT_ECHO_REPLY: + error = 0; + break; + + case OFPT_FEATURES_REQUEST: + error = handle_features_request(p, ofconn, oh); + break; + + case OFPT_GET_CONFIG_REQUEST: + error = handle_get_config_request(p, ofconn, oh); + break; + + case OFPT_SET_CONFIG: + error = handle_set_config(p, ofconn, ofp_msg->data); + break; + + case OFPT_PACKET_OUT: + error = handle_packet_out(p, ofconn, ofp_msg->data); + break; + + case OFPT_PORT_MOD: + error = handle_port_mod(p, oh); + break; + + case OFPT_FLOW_MOD: + error = handle_flow_mod(p, ofconn, ofp_msg->data); + break; + + case OFPT_STATS_REQUEST: + error = handle_stats_request(p, ofconn, oh); + break; + + case OFPT_VENDOR: + error = handle_vendor(p, ofconn, ofp_msg->data); + break; + + default: + if (VLOG_IS_WARN_ENABLED()) { + char *s = ofp_to_string(oh, ntohs(oh->length), 2); + VLOG_DBG_RL(&rl, "OpenFlow message ignored: %s", s); + free(s); + } + error = ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_TYPE); + break; + } + + if (error) { + send_error_oh(ofconn, ofp_msg->data, error); + } +} + +static void +handle_odp_msg(struct ofproto *p, struct ofpbuf *packet) +{ + struct odp_msg *msg = packet->data; + uint16_t in_port = odp_port_to_ofp_port(msg->port); + struct rule *rule; + struct ofpbuf payload; + flow_t flow; + + /* Handle controller actions. 
*/ + if (msg->type == _ODPL_ACTION_NR) { + COVERAGE_INC(ofproto_ctlr_action); + pinsched_send(p->action_sched, in_port, packet, + send_packet_in_action, p); + return; + } + + payload.data = msg + 1; + payload.size = msg->length - sizeof *msg; + flow_extract(&payload, msg->port, &flow); + + /* Check with in-band control to see if this packet should be sent + * to the local port regardless of the flow table. */ + if (in_band_msg_in_hook(p->in_band, &flow, &payload)) { + union odp_action action; + + memset(&action, 0, sizeof(action)); + action.output.type = ODPAT_OUTPUT; + action.output.port = ODPP_LOCAL; + dpif_execute(p->dpif, flow.in_port, &action, 1, &payload); + } + + rule = lookup_valid_rule(p, &flow); + if (!rule) { + /* Don't send a packet-in if OFPPC_NO_PACKET_IN asserted. */ + struct ofport *port = port_array_get(&p->ports, msg->port); + if (port) { + if (port->opp.config & OFPPC_NO_PACKET_IN) { + COVERAGE_INC(ofproto_no_packet_in); + /* XXX install 'drop' flow entry */ + ofpbuf_delete(packet); + return; + } + } else { + VLOG_WARN_RL(&rl, "packet-in on unknown port %"PRIu16, msg->port); + } + + COVERAGE_INC(ofproto_packet_in); + pinsched_send(p->miss_sched, in_port, packet, send_packet_in_miss, p); + return; + } + + if (rule->cr.wc.wildcards) { + rule = rule_create_subrule(p, rule, &flow); + rule_make_actions(p, rule, packet); + } else { + if (!rule->may_install) { + /* The rule is not installable, that is, we need to process every + * packet, so process the current packet and set its actions into + * 'subrule'. */ + rule_make_actions(p, rule, packet); + } else { + /* XXX revalidate rule if it needs it */ + } + } + + rule_execute(p, rule, &payload, &flow); + rule_reinstall(p, rule); + + if (rule->super && rule->super->cr.priority == FAIL_OPEN_PRIORITY + && rconn_is_connected(p->controller->rconn)) { + /* + * Extra-special case for fail-open mode. 
+ * + * We are in fail-open mode and the packet matched the fail-open rule, + * but we are connected to a controller too. We should send the packet + * up to the controller in the hope that it will try to set up a flow + * and thereby allow us to exit fail-open. + * + * See the top-level comment in fail-open.c for more information. + */ + pinsched_send(p->miss_sched, in_port, packet, send_packet_in_miss, p); + } else { + ofpbuf_delete(packet); + } +} + +static void +revalidate_cb(struct cls_rule *sub_, void *cbdata_) +{ + struct rule *sub = rule_from_cls_rule(sub_); + struct revalidate_cbdata *cbdata = cbdata_; + + if (cbdata->revalidate_all + || (cbdata->revalidate_subrules && sub->super) + || (tag_set_intersects(&cbdata->revalidate_set, sub->tags))) { + revalidate_rule(cbdata->ofproto, sub); + } +} + +static bool +revalidate_rule(struct ofproto *p, struct rule *rule) +{ + const flow_t *flow = &rule->cr.flow; + + COVERAGE_INC(ofproto_revalidate_rule); + if (rule->super) { + struct rule *super; + super = rule_from_cls_rule(classifier_lookup_wild(&p->cls, flow)); + if (!super) { + rule_remove(p, rule); + return false; + } else if (super != rule->super) { + COVERAGE_INC(ofproto_revalidate_moved); + list_remove(&rule->list); + list_push_back(&super->list, &rule->list); + rule->super = super; + rule->hard_timeout = super->hard_timeout; + rule->idle_timeout = super->idle_timeout; + rule->created = super->created; + rule->used = 0; + } + } + + rule_update_actions(p, rule); + return true; +} + +static struct ofpbuf * +compose_flow_exp(const struct rule *rule, long long int now, uint8_t reason) +{ + struct ofp_flow_expired *ofe; + struct ofpbuf *buf; + + ofe = make_openflow(sizeof *ofe, OFPT_FLOW_EXPIRED, &buf); + flow_to_match(&rule->cr.flow, rule->cr.wc.wildcards, &ofe->match); + ofe->priority = htons(rule->cr.priority); + ofe->reason = reason; - ofe->duration = (now - rule->created) / 1000; - ofe->packet_count = rule->packet_count; - ofe->byte_count = rule->byte_count; 
++ ofe->duration = htonl((now - rule->created) / 1000); ++ ofe->packet_count = htonll(rule->packet_count); ++ ofe->byte_count = htonll(rule->byte_count); + + return buf; +} + +static void +send_flow_exp(struct ofproto *p, struct rule *rule, + long long int now, uint8_t reason) +{ + struct ofconn *ofconn; + struct ofconn *prev; + struct ofpbuf *buf = NULL; + + /* We limit the maximum number of queued flow expirations it by accounting + * them under the counter for replies. That works because preventing + * OpenFlow requests from being processed also prevents new flows from + * being added (and expiring). (It also prevents processing OpenFlow + * requests that would not add new flows, so it is imperfect.) */ + + prev = NULL; + LIST_FOR_EACH (ofconn, struct ofconn, node, &p->all_conns) { + if (ofconn->send_flow_exp && rconn_is_connected(ofconn->rconn)) { + if (prev) { + queue_tx(ofpbuf_clone(buf), prev, prev->reply_counter); + } else { + buf = compose_flow_exp(rule, now, reason); + } + prev = ofconn; + } + } + if (prev) { + queue_tx(buf, prev, prev->reply_counter); + } +} + +static void +uninstall_idle_flow(struct ofproto *ofproto, struct rule *rule) +{ + assert(rule->installed); + assert(!rule->cr.wc.wildcards); + + if (rule->super) { + rule_remove(ofproto, rule); + } else { + rule_uninstall(ofproto, rule); + } +} + +static void +expire_rule(struct cls_rule *cls_rule, void *p_) +{ + struct ofproto *p = p_; + struct rule *rule = rule_from_cls_rule(cls_rule); + long long int hard_expire, idle_expire, expire, now; + + hard_expire = (rule->hard_timeout + ? rule->created + rule->hard_timeout * 1000 + : LLONG_MAX); + idle_expire = (rule->idle_timeout + && (rule->super || list_is_empty(&rule->list)) + ? 
rule->used + rule->idle_timeout * 1000 + : LLONG_MAX); + expire = MIN(hard_expire, idle_expire); - if (expire == LLONG_MAX) { - if (rule->installed && time_msec() >= rule->used + 5000) { - uninstall_idle_flow(p, rule); - } - return; - } + + now = time_msec(); + if (now < expire) { + if (rule->installed && now >= rule->used + 5000) { + uninstall_idle_flow(p, rule); ++ } else if (!rule->cr.wc.wildcards) { ++ active_timeout(p, rule); + } ++ + return; + } + + COVERAGE_INC(ofproto_expired); + if (rule->cr.wc.wildcards) { + /* Update stats. (This code will be a no-op if the rule expired + * due to an idle timeout, because in that case the rule has no + * subrules left.) */ + struct rule *subrule, *next; + LIST_FOR_EACH_SAFE (subrule, next, struct rule, list, &rule->list) { + rule_remove(p, subrule); + } + } + + send_flow_exp(p, rule, now, + (now >= hard_expire + ? OFPER_HARD_TIMEOUT : OFPER_IDLE_TIMEOUT)); + rule_remove(p, rule); +} + ++static void ++active_timeout(struct ofproto *ofproto, struct rule *rule) ++{ ++ if (ofproto->netflow && !is_controller_rule(rule) && ++ netflow_active_timeout_expired(ofproto->netflow, &rule->nf_flow)) { ++ struct ofexpired expired; ++ struct odp_flow odp_flow; ++ ++ /* Get updated flow stats. */ ++ memset(&odp_flow, 0, sizeof odp_flow); ++ if (rule->installed) { ++ odp_flow.key = rule->cr.flow; ++ odp_flow.flags = ODPFF_ZERO_TCP_FLAGS; ++ dpif_flow_get(ofproto->dpif, &odp_flow); ++ ++ if (odp_flow.stats.n_packets) { ++ update_time(ofproto, rule, &odp_flow.stats); ++ netflow_flow_update_flags(&rule->nf_flow, odp_flow.stats.ip_tos, ++ odp_flow.stats.tcp_flags); ++ } ++ } ++ ++ expired.flow = rule->cr.flow; ++ expired.packet_count = rule->packet_count + ++ odp_flow.stats.n_packets; ++ expired.byte_count = rule->byte_count + odp_flow.stats.n_bytes; ++ expired.used = rule->used; ++ ++ netflow_expire(ofproto->netflow, &rule->nf_flow, &expired); ++ ++ /* Schedule us to send the accumulated records once we have ++ * collected all of them. 
*/ ++ poll_immediate_wake(); ++ } ++} ++ +static void +update_used(struct ofproto *p) +{ + struct odp_flow *flows; + size_t n_flows; + size_t i; + int error; + + error = dpif_flow_list_all(p->dpif, &flows, &n_flows); + if (error) { + return; + } + + for (i = 0; i < n_flows; i++) { + struct odp_flow *f = &flows[i]; + struct rule *rule; + + rule = rule_from_cls_rule( + classifier_find_rule_exactly(&p->cls, &f->key, 0, UINT16_MAX)); + if (!rule || !rule->installed) { + COVERAGE_INC(ofproto_unexpected_rule); + dpif_flow_del(p->dpif, f); + continue; + } + - update_time(rule, &f->stats); ++ update_time(p, rule, &f->stats); + rule_account(p, rule, f->stats.n_bytes); + } + free(flows); +} + +static void +do_send_packet_in(struct ofconn *ofconn, uint32_t buffer_id, + const struct ofpbuf *packet, int send_len) +{ + struct odp_msg *msg = packet->data; + struct ofpbuf payload; + struct ofpbuf *opi; + uint8_t reason; + + /* Extract packet payload from 'msg'. */ + payload.data = msg + 1; + payload.size = msg->length - sizeof *msg; + + /* Construct ofp_packet_in message. */ + reason = msg->type == _ODPL_ACTION_NR ? OFPR_ACTION : OFPR_NO_MATCH; + opi = make_packet_in(buffer_id, odp_port_to_ofp_port(msg->port), reason, + &payload, send_len); + + /* Send. 
*/ + rconn_send_with_limit(ofconn->rconn, opi, ofconn->packet_in_counter, 100); +} + +static void +send_packet_in_action(struct ofpbuf *packet, void *p_) +{ + struct ofproto *p = p_; + struct ofconn *ofconn; + struct odp_msg *msg; + + msg = packet->data; + LIST_FOR_EACH (ofconn, struct ofconn, node, &p->all_conns) { + if (ofconn == p->controller || ofconn->miss_send_len) { + do_send_packet_in(ofconn, UINT32_MAX, packet, msg->arg); + } + } + ofpbuf_delete(packet); +} + +static void +send_packet_in_miss(struct ofpbuf *packet, void *p_) +{ + struct ofproto *p = p_; + bool in_fail_open = p->fail_open && fail_open_is_active(p->fail_open); + struct ofconn *ofconn; + struct ofpbuf payload; + struct odp_msg *msg; + + msg = packet->data; + payload.data = msg + 1; + payload.size = msg->length - sizeof *msg; + LIST_FOR_EACH (ofconn, struct ofconn, node, &p->all_conns) { + if (ofconn->miss_send_len) { + struct pktbuf *pb = ofconn->pktbuf; + uint32_t buffer_id = (in_fail_open + ? pktbuf_get_null() + : pktbuf_save(pb, &payload, msg->port)); + int send_len = (buffer_id != UINT32_MAX ? ofconn->miss_send_len + : UINT32_MAX); + do_send_packet_in(ofconn, buffer_id, packet, send_len); + } + } + ofpbuf_delete(packet); +} + +static uint64_t +pick_datapath_id(const struct ofproto *ofproto) +{ + const struct ofport *port; + + port = port_array_get(&ofproto->ports, ODPP_LOCAL); + if (port) { + uint8_t ea[ETH_ADDR_LEN]; + int error; + + error = netdev_get_etheraddr(port->netdev, ea); + if (!error) { + return eth_addr_to_uint64(ea); + } + VLOG_WARN("could not get MAC address for %s (%s)", + netdev_get_name(port->netdev), strerror(error)); + } + return ofproto->fallback_dpid; +} + +static uint64_t +pick_fallback_dpid(void) +{ + uint8_t ea[ETH_ADDR_LEN]; + eth_addr_random(ea); + ea[0] = 0x00; /* Set Nicira OUI. 
*/ + ea[1] = 0x23; + ea[2] = 0x20; + return eth_addr_to_uint64(ea); +} + +static bool +default_normal_ofhook_cb(const flow_t *flow, const struct ofpbuf *packet, + struct odp_actions *actions, tag_type *tags, - void *ofproto_) ++ uint16_t *nf_output_iface, void *ofproto_) +{ + struct ofproto *ofproto = ofproto_; + int out_port; + + /* Drop frames for reserved multicast addresses. */ + if (eth_addr_is_reserved(flow->dl_dst)) { + return true; + } + + /* Learn source MAC (but don't try to learn from revalidation). */ + if (packet != NULL) { + tag_type rev_tag = mac_learning_learn(ofproto->ml, flow->dl_src, + 0, flow->in_port); + if (rev_tag) { + /* The log messages here could actually be useful in debugging, + * so keep the rate limit relatively high. */ + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(30, 300); + VLOG_DBG_RL(&rl, "learned that "ETH_ADDR_FMT" is on port %"PRIu16, + ETH_ADDR_ARGS(flow->dl_src), flow->in_port); + ofproto_revalidate(ofproto, rev_tag); + } + } + + /* Determine output port. */ + out_port = mac_learning_lookup_tag(ofproto->ml, flow->dl_dst, 0, tags); + if (out_port < 0) { - add_output_group_action(actions, DP_GROUP_FLOOD); ++ add_output_group_action(actions, DP_GROUP_FLOOD, nf_output_iface); + } else if (out_port != flow->in_port) { + odp_actions_add(actions, ODPAT_OUTPUT)->output.port = out_port; ++ *nf_output_iface = out_port; + } else { + /* Drop. */ + } + + return true; +} + +static const struct ofhooks default_ofhooks = { + NULL, + default_normal_ofhook_cb, + NULL, + NULL +}; diff --cc ofproto/ofproto.h index 398cac4f,00000000..50dd5d5b mode 100644,000000..100644 --- a/ofproto/ofproto.h +++ b/ofproto/ofproto.h @@@ -1,111 -1,0 +1,110 @@@ +/* + * Copyright (c) 2009 Nicira Networks. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef OFPROTO_H +#define OFPROTO_H 1 + +#include +#include +#include +#include "flow.h" ++#include "netflow.h" +#include "tag.h" + +struct odp_actions; +struct ofhooks; +struct ofproto; +struct svec; + +struct ofexpired { + flow_t flow; - uint64_t packet_count; /* Packets from *expired* subrules. */ - uint64_t byte_count; /* Bytes from *expired* subrules. */ ++ uint64_t packet_count; /* Packets from subrules. */ ++ uint64_t byte_count; /* Bytes from subrules. */ + long long int used; /* Last-used time (0 if never used). */ - long long int created; /* Creation time. */ - uint8_t tcp_flags; /* Bitwise-OR of all TCP flags seen. */ - uint8_t ip_tos; /* Last-seen IP type-of-service. */ +}; + +int ofproto_create(const char *datapath, const struct ofhooks *, void *aux, + struct ofproto **ofprotop); +void ofproto_destroy(struct ofproto *); +int ofproto_run(struct ofproto *); +int ofproto_run1(struct ofproto *); +int ofproto_run2(struct ofproto *, bool revalidate_all); +void ofproto_wait(struct ofproto *); +bool ofproto_is_alive(const struct ofproto *); + +/* Configuration. 
*/ +void ofproto_set_datapath_id(struct ofproto *, uint64_t datapath_id); +void ofproto_set_mgmt_id(struct ofproto *, uint64_t mgmt_id); +void ofproto_set_probe_interval(struct ofproto *, int probe_interval); +void ofproto_set_max_backoff(struct ofproto *, int max_backoff); +void ofproto_set_desc(struct ofproto *, + const char *manufacturer, const char *hardware, + const char *software, const char *serial); +int ofproto_set_in_band(struct ofproto *, bool in_band); +int ofproto_set_discovery(struct ofproto *, bool discovery, + const char *accept_controller_re, + bool update_resolv_conf); +int ofproto_set_controller(struct ofproto *, const char *controller); +int ofproto_set_listeners(struct ofproto *, const struct svec *listeners); +int ofproto_set_snoops(struct ofproto *, const struct svec *snoops); - int ofproto_set_netflow(struct ofproto *, const struct svec *collectors, - uint8_t engine_type, uint8_t engine_id, bool add_id_to_iface); ++int ofproto_set_netflow(struct ofproto *, ++ const struct netflow_options *nf_options); +void ofproto_set_failure(struct ofproto *, bool fail_open); +void ofproto_set_rate_limit(struct ofproto *, int rate_limit, int burst_limit); +int ofproto_set_stp(struct ofproto *, bool enable_stp); +int ofproto_set_remote_execution(struct ofproto *, const char *command_acl, + const char *command_dir); + +/* Configuration querying. 
*/ +uint64_t ofproto_get_datapath_id(const struct ofproto *); +uint64_t ofproto_get_mgmt_id(const struct ofproto *); +int ofproto_get_probe_interval(const struct ofproto *); +int ofproto_get_max_backoff(const struct ofproto *); +bool ofproto_get_in_band(const struct ofproto *); +bool ofproto_get_discovery(const struct ofproto *); +const char *ofproto_get_controller(const struct ofproto *); +void ofproto_get_listeners(const struct ofproto *, struct svec *); +void ofproto_get_snoops(const struct ofproto *, struct svec *); +void ofproto_get_all_flows(struct ofproto *p, struct ds *); + +/* Functions for use by ofproto implementation modules, not by clients. */ +int ofproto_send_packet(struct ofproto *, const flow_t *, + const union ofp_action *, size_t n_actions, + const struct ofpbuf *); +void ofproto_add_flow(struct ofproto *, const flow_t *, uint32_t wildcards, + unsigned int priority, + const union ofp_action *, size_t n_actions, + int idle_timeout); +void ofproto_delete_flow(struct ofproto *, const flow_t *, uint32_t wildcards, + unsigned int priority); +void ofproto_flush_flows(struct ofproto *); + +/* Hooks for ovs-vswitchd. 
*/ +struct ofhooks { + void (*port_changed_cb)(enum ofp_port_reason, const struct ofp_phy_port *, + void *aux); + bool (*normal_cb)(const flow_t *, const struct ofpbuf *packet, - struct odp_actions *, tag_type *, void *aux); ++ struct odp_actions *, tag_type *, ++ uint16_t *nf_output_iface, void *aux); + void (*account_flow_cb)(const flow_t *, const union odp_action *, + size_t n_actions, unsigned long long int n_bytes, + void *aux); + void (*account_checkpoint_cb)(void *aux); +}; +void ofproto_revalidate(struct ofproto *, tag_type); +struct tag_set *ofproto_get_revalidate_set(struct ofproto *); + +#endif /* ofproto.h */ diff --cc utilities/ovs-openflowd.c index 603e2587,00000000..20da572b mode 100644,000000..100644 --- a/utilities/ovs-openflowd.c +++ b/utilities/ovs-openflowd.c @@@ -1,565 -1,0 +1,568 @@@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "command-line.h" +#include "compiler.h" +#include "daemon.h" +#include "dirs.h" +#include "dpif.h" +#include "fault.h" +#include "leak-checker.h" +#include "list.h" +#include "netdev.h" +#include "ofpbuf.h" +#include "ofproto/ofproto.h" +#include "openflow/openflow.h" +#include "packets.h" +#include "poll-loop.h" +#include "rconn.h" +#include "svec.h" +#include "timeval.h" +#include "unixctl.h" +#include "util.h" +#include "vconn-ssl.h" +#include "vconn.h" + +#include "vlog.h" +#define THIS_MODULE VLM_openflowd + +/* Behavior when the connection to the controller fails. */ +enum fail_mode { + FAIL_OPEN, /* Act as learning switch. */ + FAIL_CLOSED /* Drop all packets. */ +}; + +/* Settings that may be configured by the user. */ +struct ofsettings { + /* Overall mode of operation. */ + bool discovery; /* Discover the controller automatically? */ + bool in_band; /* Connect to controller in-band? */ + + /* Datapath. */ + uint64_t datapath_id; /* Datapath ID. */ + const char *dp_name; /* Name of local datapath. */ + + /* Description strings. */ + const char *mfr_desc; /* Manufacturer. */ + const char *hw_desc; /* Hardware. */ + const char *sw_desc; /* Software version. */ + const char *serial_desc; /* Serial number. */ + + /* Related vconns and network devices. */ + const char *controller_name; /* Controller (if not discovery mode). */ + struct svec listeners; /* Listen for management connections. */ + struct svec snoops; /* Listen for controller snooping conns. */ + + /* Failure behavior. */ + enum fail_mode fail_mode; /* Act as learning switch if no controller? */ + int max_idle; /* Idle time for flows in fail-open mode. */ + int probe_interval; /* # seconds idle before sending echo request. */ + int max_backoff; /* Max # seconds between connection attempts. */ + + /* Packet-in rate-limiting. */ + int rate_limit; /* Tokens added to bucket per second. 
*/ + int burst_limit; /* Maximum number token bucket size. */ + + /* Discovery behavior. */ + const char *accept_controller_re; /* Controller vconns to accept. */ + bool update_resolv_conf; /* Update /etc/resolv.conf? */ + + /* Spanning tree protocol. */ + bool enable_stp; + + /* Remote command execution. */ + char *command_acl; /* Command white/blacklist, as shell globs. */ + char *command_dir; /* Directory that contains commands. */ + + /* Management. */ + uint64_t mgmt_id; /* Management ID. */ + + /* NetFlow. */ + struct svec netflow; /* NetFlow targets. */ +}; + +static void parse_options(int argc, char *argv[], struct ofsettings *); +static void usage(void) NO_RETURN; + +int +main(int argc, char *argv[]) +{ + struct unixctl_server *unixctl; + struct ofproto *ofproto; + struct ofsettings s; + int error; ++ struct netflow_options nf_options; + + set_program_name(argv[0]); + register_fault_handlers(); + time_init(); + vlog_init(); + parse_options(argc, argv, &s); + signal(SIGPIPE, SIG_IGN); + + die_if_already_running(); + daemonize(); + + /* Start listening for ovs-appctl requests. */ + error = unixctl_server_create(NULL, &unixctl); + if (error) { + ovs_fatal(error, "Could not listen for unixctl connections"); + } + + VLOG_INFO("Open vSwitch version %s", VERSION BUILDNR); + VLOG_INFO("OpenFlow protocol version 0x%02x", OFP_VERSION); + + /* Start OpenFlow processing. 
*/ + error = ofproto_create(s.dp_name, NULL, NULL, &ofproto); + if (error) { + ovs_fatal(error, "could not initialize openflow switch"); + } + error = ofproto_set_in_band(ofproto, s.in_band); + if (error) { + ovs_fatal(error, "failed to configure in-band control"); + } + error = ofproto_set_discovery(ofproto, s.discovery, s.accept_controller_re, + s.update_resolv_conf); + if (error) { + ovs_fatal(error, "failed to configure controller discovery"); + } + if (s.datapath_id) { + ofproto_set_datapath_id(ofproto, s.datapath_id); + } + if (s.mgmt_id) { + ofproto_set_mgmt_id(ofproto, s.mgmt_id); + } + ofproto_set_desc(ofproto, s.mfr_desc, s.hw_desc, s.sw_desc, s.serial_desc); + error = ofproto_set_listeners(ofproto, &s.listeners); + if (error) { + ovs_fatal(error, "failed to configure management connections"); + } + error = ofproto_set_snoops(ofproto, &s.snoops); + if (error) { + ovs_fatal(error, + "failed to configure controller snooping connections"); + } - error = ofproto_set_netflow(ofproto, &s.netflow, 0, 0, false); ++ memset(&nf_options, 0, sizeof nf_options); ++ nf_options.collectors = s.netflow; ++ error = ofproto_set_netflow(ofproto, &nf_options); + if (error) { + ovs_fatal(error, "failed to configure NetFlow collectors"); + } + ofproto_set_failure(ofproto, s.fail_mode == FAIL_OPEN); + ofproto_set_probe_interval(ofproto, s.probe_interval); + ofproto_set_max_backoff(ofproto, s.max_backoff); + ofproto_set_rate_limit(ofproto, s.rate_limit, s.burst_limit); + error = ofproto_set_stp(ofproto, s.enable_stp); + if (error) { + ovs_fatal(error, "failed to configure STP"); + } + error = ofproto_set_remote_execution(ofproto, s.command_acl, + s.command_dir); + if (error) { + ovs_fatal(error, "failed to configure remote command execution"); + } + if (!s.discovery) { + error = ofproto_set_controller(ofproto, s.controller_name); + if (error) { + ovs_fatal(error, "failed to configure controller"); + } + } + + while (ofproto_is_alive(ofproto)) { + error = ofproto_run(ofproto); + 
if (error) { + ovs_fatal(error, "unrecoverable datapath error"); + } + unixctl_server_run(unixctl); + dp_run(); + netdev_run(); + + ofproto_wait(ofproto); + unixctl_server_wait(unixctl); + dp_wait(); + netdev_wait(); + poll_block(); + } + + return 0; +} + +/* User interface. */ + +static void +parse_options(int argc, char *argv[], struct ofsettings *s) +{ + enum { + OPT_DATAPATH_ID = UCHAR_MAX + 1, + OPT_MANUFACTURER, + OPT_HARDWARE, + OPT_SOFTWARE, + OPT_SERIAL, + OPT_ACCEPT_VCONN, + OPT_NO_RESOLV_CONF, + OPT_BR_NAME, + OPT_FAIL_MODE, + OPT_INACTIVITY_PROBE, + OPT_MAX_IDLE, + OPT_MAX_BACKOFF, + OPT_SNOOP, + OPT_RATE_LIMIT, + OPT_BURST_LIMIT, + OPT_BOOTSTRAP_CA_CERT, + OPT_STP, + OPT_NO_STP, + OPT_OUT_OF_BAND, + OPT_IN_BAND, + OPT_COMMAND_ACL, + OPT_COMMAND_DIR, + OPT_NETFLOW, + OPT_MGMT_ID, + VLOG_OPTION_ENUMS, + LEAK_CHECKER_OPTION_ENUMS + }; + static struct option long_options[] = { + {"datapath-id", required_argument, 0, OPT_DATAPATH_ID}, + {"manufacturer", required_argument, 0, OPT_MANUFACTURER}, + {"hardware", required_argument, 0, OPT_HARDWARE}, + {"software", required_argument, 0, OPT_SOFTWARE}, + {"serial", required_argument, 0, OPT_SERIAL}, + {"accept-vconn", required_argument, 0, OPT_ACCEPT_VCONN}, + {"no-resolv-conf", no_argument, 0, OPT_NO_RESOLV_CONF}, + {"config", required_argument, 0, 'F'}, + {"br-name", required_argument, 0, OPT_BR_NAME}, + {"fail", required_argument, 0, OPT_FAIL_MODE}, + {"inactivity-probe", required_argument, 0, OPT_INACTIVITY_PROBE}, + {"max-idle", required_argument, 0, OPT_MAX_IDLE}, + {"max-backoff", required_argument, 0, OPT_MAX_BACKOFF}, + {"listen", required_argument, 0, 'l'}, + {"snoop", required_argument, 0, OPT_SNOOP}, + {"rate-limit", optional_argument, 0, OPT_RATE_LIMIT}, + {"burst-limit", required_argument, 0, OPT_BURST_LIMIT}, + {"stp", no_argument, 0, OPT_STP}, + {"no-stp", no_argument, 0, OPT_NO_STP}, + {"out-of-band", no_argument, 0, OPT_OUT_OF_BAND}, + {"in-band", no_argument, 0, OPT_IN_BAND}, + {"command-acl", 
required_argument, 0, OPT_COMMAND_ACL}, + {"command-dir", required_argument, 0, OPT_COMMAND_DIR}, + {"netflow", required_argument, 0, OPT_NETFLOW}, + {"mgmt-id", required_argument, 0, OPT_MGMT_ID}, + {"verbose", optional_argument, 0, 'v'}, + {"help", no_argument, 0, 'h'}, + {"version", no_argument, 0, 'V'}, + DAEMON_LONG_OPTIONS, + VLOG_LONG_OPTIONS, + LEAK_CHECKER_LONG_OPTIONS, +#ifdef HAVE_OPENSSL + VCONN_SSL_LONG_OPTIONS + {"bootstrap-ca-cert", required_argument, 0, OPT_BOOTSTRAP_CA_CERT}, +#endif + {0, 0, 0, 0}, + }; + char *short_options = long_options_to_short_options(long_options); + + /* Set defaults that we can figure out before parsing options. */ + s->datapath_id = 0; + s->mfr_desc = NULL; + s->hw_desc = NULL; + s->sw_desc = NULL; + s->serial_desc = NULL; + svec_init(&s->listeners); + svec_init(&s->snoops); + s->fail_mode = FAIL_OPEN; + s->max_idle = 0; + s->probe_interval = 0; + s->max_backoff = 8; + s->update_resolv_conf = true; + s->rate_limit = 0; + s->burst_limit = 0; + s->accept_controller_re = NULL; + s->enable_stp = false; + s->in_band = true; + s->command_acl = ""; + s->command_dir = NULL; + svec_init(&s->netflow); + s->mgmt_id = 0; + for (;;) { + int c; + + c = getopt_long(argc, argv, short_options, long_options, NULL); + if (c == -1) { + break; + } + + switch (c) { + case OPT_DATAPATH_ID: + if (strlen(optarg) != 12 + || strspn(optarg, "0123456789abcdefABCDEF") != 12) { + ovs_fatal(0, "argument to --datapath-id must be " + "exactly 12 hex digits"); + } + s->datapath_id = strtoll(optarg, NULL, 16); + if (!s->datapath_id) { + ovs_fatal(0, "argument to --datapath-id must be nonzero"); + } + break; + + case OPT_MANUFACTURER: + s->mfr_desc = optarg; + break; + + case OPT_HARDWARE: + s->hw_desc = optarg; + break; + + case OPT_SOFTWARE: + s->sw_desc = optarg; + break; + + case OPT_SERIAL: + s->serial_desc = optarg; + break; + + case OPT_ACCEPT_VCONN: + s->accept_controller_re = optarg; + break; + + case OPT_NO_RESOLV_CONF: + s->update_resolv_conf = 
false; + break; + + case OPT_FAIL_MODE: + if (!strcmp(optarg, "open")) { + s->fail_mode = FAIL_OPEN; + } else if (!strcmp(optarg, "closed")) { + s->fail_mode = FAIL_CLOSED; + } else { + ovs_fatal(0, "--fail argument must be \"open\" or \"closed\""); + } + break; + + case OPT_INACTIVITY_PROBE: + s->probe_interval = atoi(optarg); + if (s->probe_interval < 5) { + ovs_fatal(0, "--inactivity-probe argument must be at least 5"); + } + break; + + case OPT_MAX_IDLE: + if (!strcmp(optarg, "permanent")) { + s->max_idle = OFP_FLOW_PERMANENT; + } else { + s->max_idle = atoi(optarg); + if (s->max_idle < 1 || s->max_idle > 65535) { + ovs_fatal(0, "--max-idle argument must be between 1 and " + "65535 or the word 'permanent'"); + } + } + break; + + case OPT_MAX_BACKOFF: + s->max_backoff = atoi(optarg); + if (s->max_backoff < 1) { + ovs_fatal(0, "--max-backoff argument must be at least 1"); + } else if (s->max_backoff > 3600) { + s->max_backoff = 3600; + } + break; + + case OPT_RATE_LIMIT: + if (optarg) { + s->rate_limit = atoi(optarg); + if (s->rate_limit < 1) { + ovs_fatal(0, "--rate-limit argument must be at least 1"); + } + } else { + s->rate_limit = 1000; + } + break; + + case OPT_BURST_LIMIT: + s->burst_limit = atoi(optarg); + if (s->burst_limit < 1) { + ovs_fatal(0, "--burst-limit argument must be at least 1"); + } + break; + + case OPT_STP: + s->enable_stp = true; + break; + + case OPT_NO_STP: + s->enable_stp = false; + break; + + case OPT_OUT_OF_BAND: + s->in_band = false; + break; + + case OPT_IN_BAND: + s->in_band = true; + break; + + case OPT_COMMAND_ACL: + s->command_acl = (s->command_acl[0] + ? 
xasprintf("%s,%s", s->command_acl, optarg) + : optarg); + break; + + case OPT_COMMAND_DIR: + s->command_dir = optarg; + break; + + case OPT_NETFLOW: + svec_add(&s->netflow, optarg); + break; + + case OPT_MGMT_ID: + if (strlen(optarg) != 12 + || strspn(optarg, "0123456789abcdefABCDEF") != 12) { + ovs_fatal(0, "argument to --mgmt-id must be " + "exactly 12 hex digits"); + } + s->mgmt_id = strtoll(optarg, NULL, 16); + if (!s->mgmt_id) { + ovs_fatal(0, "argument to --mgmt-id must be nonzero"); + } + break; + + case 'l': + svec_add(&s->listeners, optarg); + break; + + case OPT_SNOOP: + svec_add(&s->snoops, optarg); + break; + + case 'h': + usage(); + + case 'V': + OVS_PRINT_VERSION(OFP_VERSION, OFP_VERSION); + exit(EXIT_SUCCESS); + + DAEMON_OPTION_HANDLERS + + VLOG_OPTION_HANDLERS + + LEAK_CHECKER_OPTION_HANDLERS + +#ifdef HAVE_OPENSSL + VCONN_SSL_OPTION_HANDLERS + + case OPT_BOOTSTRAP_CA_CERT: + vconn_ssl_set_ca_cert_file(optarg, true); + break; +#endif + + case '?': + exit(EXIT_FAILURE); + + default: + abort(); + } + } + free(short_options); + + argc -= optind; + argv += optind; + if (argc < 1 || argc > 2) { + ovs_fatal(0, "need one or two non-option arguments; " + "use --help for usage"); + } + + /* Local and remote vconns. */ + s->dp_name = argv[0]; + s->controller_name = argc > 1 ? xstrdup(argv[1]) : NULL; + + /* Set accept_controller_regex. */ + if (!s->accept_controller_re) { + s->accept_controller_re + = vconn_ssl_is_configured() ? "^ssl:.*" : "^tcp:.*"; + } + + /* Mode of operation. */ + s->discovery = s->controller_name == NULL; + if (s->discovery && !s->in_band) { + ovs_fatal(0, "Cannot perform discovery with out-of-band control"); + } + + /* Rate limiting. */ + if (s->rate_limit && s->rate_limit < 100) { + VLOG_WARN("Rate limit set to unusually low value %d", s->rate_limit); + } +} + +static void +usage(void) +{ + printf("%s: an OpenFlow switch implementation.\n" + "usage: %s [OPTIONS] DATAPATH [CONTROLLER]\n" + "DATAPATH is a local datapath (e.g. 
\"dp0\").\n" + "CONTROLLER is an active OpenFlow connection method; if it is\n" + "omitted, then ovs-openflowd performs controller discovery.\n", + program_name, program_name); + vconn_usage(true, true, true); + printf("\nOpenFlow options:\n" + " -d, --datapath-id=ID Use ID as the OpenFlow switch ID\n" + " (ID must consist of 12 hex digits)\n" + " --mgmt-id=ID Use ID as the management ID\n" + " (ID must consist of 12 hex digits)\n" + " --manufacturer=MFR Identify manufacturer as MFR\n" + " --hardware=HW Identify hardware as HW\n" + " --software=SW Identify software as SW\n" + " --serial=SERIAL Identify serial number as SERIAL\n" + "\nController discovery options:\n" + " --accept-vconn=REGEX accept matching discovered controllers\n" + " --no-resolv-conf do not update /etc/resolv.conf\n" + "\nNetworking options:\n" + " --fail=open|closed when controller connection fails:\n" + " closed: drop all packets\n" + " open (default): act as learning switch\n" + " --inactivity-probe=SECS time between inactivity probes\n" + " --max-idle=SECS max idle for flows set up by switch\n" + " --max-backoff=SECS max time between controller connection\n" + " attempts (default: 8 seconds)\n" + " -l, --listen=METHOD allow management connections on METHOD\n" + " (a passive OpenFlow connection method)\n" + " --snoop=METHOD allow controller snooping on METHOD\n" + " (a passive OpenFlow connection method)\n" + " --out-of-band controller connection is out-of-band\n" + " --netflow=HOST:PORT configure NetFlow output target\n" + "\nRate-limiting of \"packet-in\" messages to the controller:\n" + " --rate-limit[=PACKETS] max rate, in packets/s (default: 1000)\n" + " --burst-limit=BURST limit on packet credit for idle time\n" + "\nRemote command execution options:\n" + " --command-acl=[!]GLOB[,[!]GLOB...] 
set allowed/denied commands\n" + " --command-dir=DIR set command dir (default: %s/commands)\n", + ovs_pkgdatadir); + daemon_usage(); + vlog_usage(); + printf("\nOther options:\n" + " -h, --help display this help message\n" + " -V, --version display version information\n"); + leak_checker_usage(); + exit(EXIT_SUCCESS); +} diff --cc vswitchd/bridge.c index b2c051d0,1e55ad2f..4e272248 --- a/vswitchd/bridge.c +++ b/vswitchd/bridge.c @@@ -43,7 -43,6 +43,8 @@@ #include "odp-util.h" #include "ofp-print.h" #include "ofpbuf.h" ++#include "ofproto/netflow.h" +#include "ofproto/ofproto.h" #include "packets.h" #include "poll-loop.h" #include "port-array.h" @@@ -567,22 -494,41 +569,20 @@@ bridge_reconfigure(void LIST_FOR_EACH (br, struct bridge, node, &all_bridges) { uint8_t ea[8]; uint64_t dpid; - struct iface *local_iface = NULL; - const char *devname; + struct iface *local_iface; + struct iface *hw_addr_iface; - uint8_t engine_type, engine_id; - bool add_id_to_iface = false; - struct svec nf_hosts; + struct netflow_options nf_options; bridge_fetch_dp_ifaces(br); - for (i = 0; i < br->n_ports; ) { - struct port *port = br->ports[i]; + iterate_and_prune_ifaces(br, init_iface_netdev, NULL); - for (j = 0; j < port->n_ifaces; ) { - struct iface *iface = port->ifaces[j]; - if (iface->dp_ifidx < 0) { - VLOG_ERR("%s interface not in dp%u, dropping", - iface->name, dpif_id(&br->dpif)); - iface_destroy(iface); - } else { - if (iface->dp_ifidx == ODPP_LOCAL) { - local_iface = iface; - } - VLOG_DBG("dp%u has interface %s on port %d", - dpif_id(&br->dpif), iface->name, iface->dp_ifidx); - j++; - } - } - if (!port->n_ifaces) { - VLOG_ERR("%s port has no interfaces, dropping", port->name); - port_destroy(port); - continue; - } - i++; - } + iterate_and_prune_ifaces(br, check_iface_dp_ifidx, NULL); /* Pick local port hardware address, datapath ID. 
*/ - bridge_pick_local_hw_addr(br, ea, &devname); + bridge_pick_local_hw_addr(br, ea, &hw_addr_iface); + local_iface = bridge_get_local_iface(br); if (local_iface) { - int error = netdev_nodev_set_etheraddr(local_iface->name, ea); + int error = netdev_set_etheraddr(local_iface->netdev, ea); if (error) { static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); VLOG_ERR_RL(&rl, "bridge %s: failed to set bridge " @@@ -595,9 -541,13 +595,13 @@@ ofproto_set_datapath_id(br->ofproto, dpid); /* Set NetFlow configuration on this bridge. */ - dpif_get_netflow_ids(br->dpif, &engine_type, &engine_id); + memset(&nf_options, 0, sizeof nf_options); - nf_options.engine_type = br->dpif.minor; - nf_options.engine_id = br->dpif.minor; ++ dpif_get_netflow_ids(br->dpif, &nf_options.engine_type, ++ &nf_options.engine_id); + nf_options.active_timeout = -1; + if (cfg_has("netflow.%s.engine-type", br->name)) { - engine_type = cfg_get_int(0, "netflow.%s.engine-type", + nf_options.engine_type = cfg_get_int(0, "netflow.%s.engine-type", br->name); } if (cfg_has("netflow.%s.engine-id", br->name)) { @@@ -899,30 -859,8 +907,28 @@@ bridge_flush(struct bridge *br { COVERAGE_INC(bridge_flush); br->flush = true; - if (br->ml) { - mac_learning_flush(br->ml); - } + mac_learning_flush(br->ml); } + +/* Returns the 'br' interface for the ODPP_LOCAL port, or null if 'br' has no + * such interface. */ +static struct iface * +bridge_get_local_iface(struct bridge *br) +{ + size_t i, j; + + for (i = 0; i < br->n_ports; i++) { + struct port *port = br->ports[i]; + for (j = 0; j < port->n_ifaces; j++) { + struct iface *iface = port->ifaces[j]; + if (iface->dp_ifidx == ODPP_LOCAL) { + return iface; + } + } + } + + return NULL; +} /* Bridge unixctl user interface functions. 
*/ static void diff --cc vswitchd/ovs-vswitchd.conf.5.in index 4d4bb482,ef9d7597..c4166786 --- a/vswitchd/ovs-vswitchd.conf.5.in +++ b/vswitchd/ovs-vswitchd.conf.5.in @@@ -419,12 -423,18 +422,18 @@@ collector in the form \fIip\fB:\fIport\ will be sent to each \fIip\fR on UDP \fIport\fR. The \fIip\fR must be specified numerically, not as a DNS name. - The NetFlow messages will use the datapath index for the engine type and id. - This can be overridden with the \fBnetflow.\fIbridge\fB.engine-type\fR and + In addition to terminating flows, NetFlow can also send records at a set + interval for flows that are still active. This interval can be configured + by defining the key \fBnetflow.\fIbridge\fB\.active-timeout\fR. The value + is in seconds. An active timeout of 0 will disable this functionality. By + default there is timeout value of 600 seconds. + + The NetFlow messages will use the datapath index for the engine type and id. + This can be overridden with the \fBnetflow.\fIbridge\fB.engine-type\fR and \fBnetflow.\fIbridge\fB.engine-id\fR, respectively. Each takes a value - between 0 and 255, inclusive. + between 0 and 255, inclusive. -Many NetFlow collectors do not expect multiple virtual switches to be +Many NetFlow collectors do not expect multiple switches to be sending messages from the same host, and they do not store the engine information which could be used to disambiguate the traffic. To prevent flows from multiple switches appearing as if they came on the interface, diff --cc xenserver/usr_share_vswitch_scripts_dump-vif-details index b0ceb405,00000000..7ce8bf78 mode 100755,000000..100755 --- a/xenserver/usr_share_vswitch_scripts_dump-vif-details +++ b/xenserver/usr_share_vswitch_scripts_dump-vif-details @@@ -1,78 -1,0 +1,77 @@@ +#!/usr/bin/python +# +# Script to retrieve extended information about VIFs that are +# needed by the controller. This is called by the "vif" script, +# which is run when virtual interfaces are added and removed. 
+ +# Copyright (C) 2009 Nicira Networks, Inc. +# +# Copying and distribution of this file, with or without modification, +# are permitted in any medium without royalty provided the copyright +# notice and this notice are preserved. This file is offered as-is, +# without warranty of any kind. + +import sys +import XenAPI +import xen.lowlevel.xs + +# Query XenStore for the opaque reference of this vif +def get_vif_ref(domid, devid): + xenstore = xen.lowlevel.xs.xs() + t = xenstore.transaction_start() + vif_ref = xenstore.read(t, '/xapi/%s/private/vif/%s/ref' % (domid, devid)) + xenstore.transaction_end(t) + return vif_ref + +# Query XAPI for the information we need using the vif's opaque reference +def dump_vif_info(domid, devid, vif_ref): ++ vif_info = [] ++ session = XenAPI.xapi_local() ++ session.xenapi.login_with_password("root", "") + try: - session = XenAPI.xapi_local() - session.xenapi.login_with_password("root", "") + vif_rec = session.xenapi.VIF.get_record(vif_ref) + net_rec = session.xenapi.network.get_record(vif_rec["network"]) - vm_rec = session.xenapi.VM.get_record(vif_rec["VM"]) ++ vm_uuid = session.xenapi.VM.get_uuid(vif_rec["VM"]) + + # Data to allow vNetManager to associate VIFs with xapi data - sys.stdout.write('--add=port.vif%s.%s.net-uuid=%s ' - % (domid, devid, net_rec["uuid"])) - sys.stdout.write('--add=port.vif%s.%s.vif-mac=%s ' - % (domid, devid, vif_rec["MAC"])) - sys.stdout.write('--add=port.vif%s.%s.vif-uuid=%s ' - % (domid, devid, vif_rec["uuid"])) - sys.stdout.write('--add=port.vif%s.%s.vm-uuid=%s ' - % (domid, devid, vm_rec["uuid"])) ++ add_port = '--add=port.vif%s.%s' % (domid, devid) ++ vif_info.append('%s.net-uuid=%s' % (add_port, net_rec["uuid"])) ++ vif_info.append('%s.vif-mac=%s' % (add_port, vif_rec["MAC"])) ++ vif_info.append('%s.vif-uuid=%s' % (add_port, vif_rec["uuid"])) ++ vif_info.append('%s.vm-uuid=%s' % (add_port, vm_uuid)) + + # vNetManager needs to know the network UUID(s) associated with + # each datapath. 
Normally interface-reconfigure adds them, but + # interface-reconfigure never gets called for internal networks + # (xapi does the addbr ioctl internally), so we have to do it + # here instead for internal networks. This is only acceptable + # because xapi is lazy about creating internal networks: it + # only creates one just before it adds the first vif to it. + # There may still be a brief delay between the initial + # ovs-vswitchd connection to vNetManager and setting this + # configuration variable, but vNetManager can tolerate that. - if len(net_rec['PIFs']) == 0: ++ if not net_rec['PIFs']: + key = 'bridge.%s.xs-network-uuids' % net_rec['bridge'] + value = net_rec['uuid'] - sys.stdout.write('--del-match=%s=* ' % key) - sys.stdout.write('--add=%s=%s ' % (key, value)) ++ vif_info.append('--del-match=%s=*' % key) ++ vif_info.append('--add=%s=%s' % (key, value)) + finally: + session.xenapi.session.logout() ++ print ' '.join(vif_info) + +if __name__ == '__main__': - if (len(sys.argv) != 3): - sys.stderr.write("ERROR: %s \n") ++ if len(sys.argv) != 3: ++ sys.stderr.write("ERROR: %s \n" % sys.argv[0]) + sys.exit(1) + + domid = sys.argv[1] + devid = sys.argv[2] + + vif_ref = get_vif_ref(domid, devid) + if not vif_ref: + sys.stderr.write("ERROR: Could not find interface vif%s.%s\n" - % (domid, devid)) ++ % (domid, devid)) + sys.exit(1) + + dump_vif_info(domid, devid, vif_ref) + sys.exit(0)