+ DPIF_PORT_FOR_EACH (&port, &port_dump, &dpif->dpif) {
+ uint32_t upcall_pid = dpif_linux_port_get_pid(dpif_, port.port_no);
+ struct dpif_linux_vport vport_request;
+
+ dpif_linux_vport_init(&vport_request);
+ vport_request.cmd = OVS_VPORT_CMD_SET;
+ vport_request.dp_ifindex = dpif->dp_ifindex;
+ vport_request.port_no = port.port_no;
+ vport_request.upcall_pid = &upcall_pid;
+ error = dpif_linux_vport_transact(&vport_request, NULL, NULL);
+ if (!error) {
+ VLOG_DBG("%s: assigning port %"PRIu32" to netlink pid %"PRIu32,
+ dpif_name(&dpif->dpif), vport_request.port_no,
+ upcall_pid);
+ } else {
+ VLOG_WARN_RL(&error_rl, "%s: failed to set upcall pid on port: %s",
+ dpif_name(&dpif->dpif), strerror(error));
+ }
+ }
+}
+
+/* Turns upcall reception on 'dpif_' on or off according to 'enable'.
+ *
+ * Does nothing and returns 0 when reception is already in the requested
+ * state (an epoll fd >= 0 means "enabled").  When enabling, creates the epoll
+ * instance plus one Netlink socket per channel and registers each socket for
+ * EPOLLIN, tagged with the channel's index.  In both directions the per-port
+ * upcall pids are refreshed afterwards through set_upcall_pids().
+ *
+ * Returns 0 on success, otherwise the error reported by epoll_create(),
+ * nl_sock_create() or epoll_ctl(). */
+static int
+dpif_linux_recv_set(struct dpif *dpif_, bool enable)
+{
+    struct dpif_linux *dpif = dpif_linux_cast(dpif_);
+    bool listening = dpif->epoll_fd >= 0;
+
+    if (listening == enable) {
+        return 0;
+    }
+
+    if (enable) {
+        int i;
+
+        dpif->epoll_fd = epoll_create(N_CHANNELS);
+        if (dpif->epoll_fd < 0) {
+            return errno;
+        }
+
+        for (i = 0; i < N_CHANNELS; i++) {
+            struct dpif_channel *chan = &dpif->channels[i];
+            struct epoll_event ev;
+            int retval;
+
+            retval = nl_sock_create(NETLINK_GENERIC, &chan->sock);
+            if (retval) {
+                destroy_channels(dpif);
+                return retval;
+            }
+
+            /* The event payload carries the channel index so that the
+             * receive path can map a ready event back to its channel. */
+            memset(&ev, 0, sizeof ev);
+            ev.events = EPOLLIN;
+            ev.data.u32 = i;
+            if (epoll_ctl(dpif->epoll_fd, EPOLL_CTL_ADD,
+                          nl_sock_fd(chan->sock), &ev) < 0) {
+                /* Save errno before cleanup has a chance to overwrite it. */
+                retval = errno;
+                destroy_channels(dpif);
+                return retval;
+            }
+
+            memset(chan->sketches, 0, sizeof chan->sketches);
+            chan->last_poll = LLONG_MIN;
+        }
+
+        dpif->ready_mask = 0;
+        dpif->next_scale = time_msec() + SCALE_INTERVAL;
+    } else {
+        destroy_channels(dpif);
+    }
+
+    set_upcall_pids(dpif_);
+
+    return 0;
+}
+
+/* Translates 'queue_id' into a priority value stored in '*priority'.
+ *
+ * Queue ids below 0xf000 map to TC_H_MAKE(1 << 16, queue_id + 1) and the
+ * function returns 0.  Any larger id is rejected with EINVAL and '*priority'
+ * is left untouched.  'dpif' is not used. */
+static int
+dpif_linux_queue_to_priority(const struct dpif *dpif OVS_UNUSED,
+                             uint32_t queue_id, uint32_t *priority)
+{
+    if (queue_id >= 0xf000) {
+        return EINVAL;
+    }
+
+    *priority = TC_H_MAKE(1 << 16, queue_id + 1);
+    return 0;
+}
+
+/* Decodes the Netlink message held in 'buf' into '*upcall' and stores the
+ * datapath ifindex from the message's ovs_header into '*dp_ifindex'.
+ *
+ * The message must consist of an nlmsghdr, a genlmsghdr and an ovs_header
+ * followed by attributes that satisfy the policy below, must belong to the
+ * OVS packet family, and must carry either the MISS or the ACTION command.
+ * Anything else yields EINVAL.
+ *
+ * On success returns 0.  'buf' itself becomes the upcall's packet: its 'data'
+ * and 'size' are redirected to the payload of the PACKET attribute, and the
+ * upcall's key points into the same underlying memory. */
+static int
+parse_odp_packet(struct ofpbuf *buf, struct dpif_upcall *upcall,
+                 int *dp_ifindex)
+{
+    static const struct nl_policy ovs_packet_policy[] = {
+        /* Required in every message. */
+        [OVS_PACKET_ATTR_PACKET] = { .type = NL_A_UNSPEC,
+                                     .min_len = ETH_HEADER_LEN },
+        [OVS_PACKET_ATTR_KEY] = { .type = NL_A_NESTED },
+
+        /* Present only for OVS_PACKET_CMD_ACTION. */
+        [OVS_PACKET_ATTR_USERDATA] = { .type = NL_A_U64, .optional = true },
+    };
+
+    struct nlattr *a[ARRAY_SIZE(ovs_packet_policy)];
+    const struct nlattr *packet_attr, *key_attr, *userdata_attr;
+    struct ovs_header *ovs_header;
+    struct genlmsghdr *genl;
+    struct nlmsghdr *nlmsg;
+    struct ofpbuf b;
+    int type;
+
+    /* Walk the headers through a read-only view so that 'buf' is not touched
+     * until the message is known to be valid. */
+    ofpbuf_use_const(&b, buf->data, buf->size);
+    nlmsg = ofpbuf_try_pull(&b, sizeof *nlmsg);
+    genl = ofpbuf_try_pull(&b, sizeof *genl);
+    ovs_header = ofpbuf_try_pull(&b, sizeof *ovs_header);
+
+    if (!nlmsg || !genl || !ovs_header) {
+        return EINVAL;
+    }
+    if (nlmsg->nlmsg_type != ovs_packet_family) {
+        return EINVAL;
+    }
+    if (!nl_policy_parse(&b, 0, ovs_packet_policy, a,
+                         ARRAY_SIZE(ovs_packet_policy))) {
+        return EINVAL;
+    }
+
+    if (genl->cmd == OVS_PACKET_CMD_MISS) {
+        type = DPIF_UC_MISS;
+    } else if (genl->cmd == OVS_PACKET_CMD_ACTION) {
+        type = DPIF_UC_ACTION;
+    } else {
+        type = -1;
+    }
+    if (type < 0) {
+        return EINVAL;
+    }
+
+    packet_attr = a[OVS_PACKET_ATTR_PACKET];
+    key_attr = a[OVS_PACKET_ATTR_KEY];
+    userdata_attr = a[OVS_PACKET_ATTR_USERDATA];
+
+    memset(upcall, 0, sizeof *upcall);
+    upcall->type = type;
+
+    /* Re-aim 'buf' at the packet payload carried inside the message. */
+    upcall->packet = buf;
+    upcall->packet->data = CONST_CAST(struct nlattr *,
+                                      nl_attr_get(packet_attr));
+    upcall->packet->size = nl_attr_get_size(packet_attr);
+
+    upcall->key = CONST_CAST(struct nlattr *, nl_attr_get(key_attr));
+    upcall->key_len = nl_attr_get_size(key_attr);
+
+    if (userdata_attr) {
+        upcall->userdata = nl_attr_get_u64(userdata_attr);
+    } else {
+        upcall->userdata = 0;
+    }
+
+    *dp_ifindex = ovs_header->dp_ifindex;
+
+    return 0;
+}
+
+/* Attempts to receive a single upcall for 'dpif_' into '*upcall', using 'buf'
+ * as the receive buffer.
+ *
+ * Returns 0 once a message belonging to this datapath has been parsed.
+ * Returns EAGAIN when receiving is disabled (no epoll fd), when no channel
+ * has anything left to read, or when the per-call budget of 50 read attempts
+ * is exhausted.  Any other error from nl_sock_recv() or parse_odp_packet()
+ * is returned as is. */
+static int
+dpif_linux_recv(struct dpif *dpif_, struct dpif_upcall *upcall,
+ struct ofpbuf *buf)
+{
+ struct dpif_linux *dpif = dpif_linux_cast(dpif_);
+ int read_tries = 0;
+
+ if (dpif->epoll_fd < 0) {
+ return EAGAIN;
+ }
+
+ /* 'ready_mask' lives in 'dpif', so bits left over from an earlier call are
+ * consumed first; the kernel is polled again only once it is empty. */
+ if (!dpif->ready_mask) {
+ struct epoll_event events[N_CHANNELS];
+ int retval;
+ int i;
+
+ /* Timeout 0: just collect what is ready now, retrying on EINTR. */
+ do {
+ retval = epoll_wait(dpif->epoll_fd, events, N_CHANNELS, 0);
+ } while (retval < 0 && errno == EINTR);
+ if (retval < 0) {
+ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
+ VLOG_WARN_RL(&rl, "epoll_wait failed (%s)", strerror(errno));
+ }
+
+ /* data.u32 is the channel index stored when the socket was registered;
+ * on failure 'retval' is negative and this loop does not run. */
+ for (i = 0; i < retval; i++) {
+ dpif->ready_mask |= 1u << events[i].data.u32;
+ }
+ }
+
+ while (dpif->ready_mask) {
+ /* Service the lowest-numbered ready channel and clear its bit up front,
+ * so that every exit path below leaves it unmarked. */
+ int indx = ffs(dpif->ready_mask) - 1;
+ struct dpif_channel *ch = &dpif->channels[indx];
+
+ dpif->ready_mask &= ~(1u << indx);
+
+ for (;;) {
+ int dp_ifindex;
+ int error;
+
+ /* The budget is shared by all channels visited during this call and
+ * also counts ENOBUFS retries. */
+ if (++read_tries > 50) {
+ return EAGAIN;
+ }
+
+ error = nl_sock_recv(ch->sock, buf, false);
+ if (error == ENOBUFS) {
+ /* ENOBUFS typically means that we've received so many
+ * packets that the buffer overflowed. Try again
+ * immediately because there's almost certainly a packet
+ * waiting for us. */
+ report_loss(dpif_, ch);
+ continue;
+ }
+
+ /* Recorded for every outcome except the ENOBUFS retry above. */
+ ch->last_poll = time_msec();
+ if (error) {
+ /* EAGAIN: this channel is drained, move on to the next one. */
+ if (error == EAGAIN) {
+ break;
+ }
+ return error;
+ }
+
+ error = parse_odp_packet(buf, upcall, &dp_ifindex);
+ if (!error && dp_ifindex == dpif->dp_ifindex) {
+ const struct nlattr *in_port;
+
+ /* NOTE(review): update_sketch() is fed the input port number,
+ * presumably for per-port accounting -- confirm there. */
+ in_port = nl_attr_find__(upcall->key, upcall->key_len,
+ OVS_KEY_ATTR_IN_PORT);
+ if (in_port) {
+ update_sketch(ch, nl_attr_get_u32(in_port));
+ }
+ return 0;
+ }
+ if (error) {
+ return error;
+ }
+ /* Parsed fine but addressed to a different datapath: drop it and
+ * read the next message from the same channel. */
+ }
+ }
+
+ return EAGAIN;
+}
+
+/* Registers the epoll fd of 'dpif_' with poll_fd_wait() for POLLIN.  Does
+ * nothing while receiving is disabled (no epoll fd). */
+static void
+dpif_linux_recv_wait(struct dpif *dpif_)
+{
+    struct dpif_linux *dpif = dpif_linux_cast(dpif_);
+
+    if (dpif->epoll_fd >= 0) {
+        poll_fd_wait(dpif->epoll_fd, POLLIN);
+    }
+}
+
+/* Calls nl_sock_drain() on the socket of every channel of 'dpif_'.  Does
+ * nothing while receiving is disabled (no epoll fd). */
+static void
+dpif_linux_recv_purge(struct dpif *dpif_)
+{
+    struct dpif_linux *dpif = dpif_linux_cast(dpif_);
+    int i;
+
+    if (dpif->epoll_fd >= 0) {
+        for (i = 0; i < N_CHANNELS; i++) {
+            nl_sock_drain(dpif->channels[i].sock);
+        }
+    }
+}
+
+/* Table of the dpif_linux_* entry points that make up this dpif
+ * implementation.  The first entry is the string "system" (presumably the
+ * class's type name -- confirm against struct dpif_class).
+ *
+ * The initializer is positional, so each entry is bound to a member purely
+ * by its position.  The entries must therefore stay in the order in which
+ * struct dpif_class declares its members (declared elsewhere; check that
+ * declaration before adding, removing or reordering entries). */
+const struct dpif_class dpif_linux_class = {
+ "system",
+ dpif_linux_enumerate,
+ dpif_linux_open,
+ dpif_linux_close,
+ dpif_linux_destroy,
+ dpif_linux_run,
+ dpif_linux_wait,
+ dpif_linux_get_stats,
+ dpif_linux_port_add,
+ dpif_linux_port_del,
+ dpif_linux_port_query_by_number,
+ dpif_linux_port_query_by_name,
+ dpif_linux_get_max_ports,
+ dpif_linux_port_get_pid,
+ dpif_linux_port_dump_start,
+ dpif_linux_port_dump_next,
+ dpif_linux_port_dump_done,
+ dpif_linux_port_poll,
+ dpif_linux_port_poll_wait,
+ dpif_linux_flow_get,
+ dpif_linux_flow_put,
+ dpif_linux_flow_del,
+ dpif_linux_flow_flush,
+ dpif_linux_flow_dump_start,
+ dpif_linux_flow_dump_next,
+ dpif_linux_flow_dump_done,
+ dpif_linux_execute,
+ dpif_linux_operate,
+ dpif_linux_recv_set,
+ dpif_linux_queue_to_priority,
+ dpif_linux_recv,
+ dpif_linux_recv_wait,
+ dpif_linux_recv_purge,
+};
+\f
+/* One-time setup of the Generic Netlink state used by this file: looks up the
+ * datapath, vport, flow and packet families, creates 'genl_sock', resolves
+ * the vport multicast group and creates the 'nln' notifier on it.
+ *
+ * The steps run in that order and stop at the first failure.  The outcome is
+ * kept in a static variable, so once a call has produced a non-negative
+ * result every later call returns that same result without redoing any work.
+ *
+ * Returns 0 on success, otherwise the error from the step that failed. */
+static int
+dpif_linux_init(void)
+{
+    /* Handed to nln_create() as its parse target, hence static storage. */
+    static struct dpif_linux_vport vport;
+    static int error = -1;
+    unsigned int ovs_vport_mcgroup;
+
+    if (error >= 0) {
+        return error;
+    }
+
+    error = nl_lookup_genl_family(OVS_DATAPATH_FAMILY, &ovs_datapath_family);
+    if (error) {
+        VLOG_ERR("Generic Netlink family '%s' does not exist. "
+                 "The Open vSwitch kernel module is probably not loaded.",
+                 OVS_DATAPATH_FAMILY);
+        return error;
+    }
+
+    error = nl_lookup_genl_family(OVS_VPORT_FAMILY, &ovs_vport_family);
+    if (error) {
+        return error;
+    }
+
+    error = nl_lookup_genl_family(OVS_FLOW_FAMILY, &ovs_flow_family);
+    if (error) {
+        return error;
+    }
+
+    error = nl_lookup_genl_family(OVS_PACKET_FAMILY, &ovs_packet_family);
+    if (error) {
+        return error;
+    }
+
+    error = nl_sock_create(NETLINK_GENERIC, &genl_sock);
+    if (error) {
+        return error;
+    }
+
+    error = nl_lookup_genl_mcgroup(OVS_VPORT_FAMILY, OVS_VPORT_MCGROUP,
+                                   &ovs_vport_mcgroup,
+                                   OVS_VPORT_MCGROUP_FALLBACK_ID);
+    if (error) {
+        return error;
+    }
+
+    nln = nln_create(NETLINK_GENERIC, ovs_vport_mcgroup,
+                     dpif_linux_nln_parse, &vport);
+
+    return error;
+}
+
+/* Returns true if a vport named 'name' exists and its type is
+ * OVS_VPORT_TYPE_INTERNAL, false otherwise.
+ *
+ * A failed lookup is treated as "not internal".  Failures other than ENODEV
+ * and ENOENT are additionally logged (rate-limited). */
+bool
+dpif_linux_is_internal_device(const char *name)
+{
+    struct dpif_linux_vport reply;
+    struct ofpbuf *buf;
+    int error;
+
+    /* Start from the "empty" vport so that the type test at the end reads a
+     * defined value even if the lookup fails without filling in 'reply'.
+     * Without this the result would depend on the callee's error-path
+     * behavior, which is not visible from here. */
+    dpif_linux_vport_init(&reply);
+
+    error = dpif_linux_vport_get(name, &reply, &buf);
+    if (!error) {
+        /* Only the scalar 'type' field is read below, so the reply buffer
+         * can be released right away. */
+        ofpbuf_delete(buf);
+    } else if (error != ENODEV && error != ENOENT) {
+        VLOG_WARN_RL(&error_rl, "%s: vport query failed (%s)",
+                     name, strerror(error));
+    }
+
+    return reply.type == OVS_VPORT_TYPE_INTERNAL;
+}
+
+/* Asks the datapath identified by 'dp_ifindex' to output the 'size' bytes at
+ * 'data' on port 'port_no'.
+ *
+ * The flow key is extracted from the packet itself and the action list holds
+ * a single OVS_ACTION_ATTR_OUTPUT.  Key and actions are both built in
+ * stack-based buffers.  Returns the result of dpif_linux_execute__(). */
+int
+dpif_linux_vport_send(int dp_ifindex, uint32_t port_no,
+                      const void *data, size_t size)
+{
+    struct odputil_keybuf key_stub;
+    uint64_t action_stub;
+    struct ofpbuf pkt, key_buf, acts;
+    struct dpif_execute exec;
+    struct flow flow;
+
+    /* Derive the flow key from the packet contents. */
+    ofpbuf_use_const(&pkt, data, size);
+    flow_extract(&pkt, 0, htonll(0), 0, &flow);
+    ofpbuf_use_stack(&key_buf, &key_stub, sizeof key_stub);
+    odp_flow_key_from_flow(&key_buf, &flow);
+
+    /* One output action, stored in the 8-byte stub. */
+    ofpbuf_use_stack(&acts, &action_stub, sizeof action_stub);
+    nl_msg_put_u32(&acts, OVS_ACTION_ATTR_OUTPUT, port_no);
+
+    exec.packet = &pkt;
+    exec.key = key_buf.data;
+    exec.key_len = key_buf.size;
+    exec.actions = acts.data;
+    exec.actions_len = acts.size;
+
+    return dpif_linux_execute__(dp_ifindex, &exec);
+}
+
+/* Parse callback passed to nln_create(): decodes 'buf' into the
+ * dpif_linux_vport that 'vport_' points to.  Returns true when decoding
+ * succeeds, false otherwise. */
+static bool
+dpif_linux_nln_parse(struct ofpbuf *buf, void *vport_)
+{
+    int error = dpif_linux_vport_from_ofpbuf(vport_, buf);
+
+    return !error;
+}
+
+/* Change callback: 'vport_' is the decoded vport message, or NULL, and
+ * 'dpif_' is the struct dpif_linux to update.
+ *
+ * A null 'vport_' sets 'change_error' on the dpif.  Otherwise, if the message
+ * is for this datapath and is a NEW, DEL or SET command, the vport's name is
+ * added to the dpif's 'changed_ports' set.  All other messages are ignored. */
+static void
+dpif_linux_port_changed(const void *vport_, void *dpif_)
+{
+    const struct dpif_linux_vport *vport = vport_;
+    struct dpif_linux *dpif = dpif_;
+
+    if (!vport) {
+        dpif->change_error = true;
+        return;
+    }
+
+    if (vport->dp_ifindex != dpif->dp_ifindex) {
+        return;
+    }
+
+    if (vport->cmd != OVS_VPORT_CMD_NEW
+        && vport->cmd != OVS_VPORT_CMD_DEL
+        && vport->cmd != OVS_VPORT_CMD_SET) {
+        return;
+    }
+
+    VLOG_DBG("port_changed: dpif:%s vport:%s cmd:%"PRIu8,
+             dpif->dpif.full_name, vport->name, vport->cmd);
+    sset_add(&dpif->changed_ports, vport->name);
+}
+\f
+/* Decodes 'buf' into 'vport'.  'buf' must hold a Netlink message of the OVS
+ * vport family: an nlmsghdr, a genlmsghdr and a "struct ovs_header" followed
+ * by attributes that satisfy the policy below.
+ *
+ * Returns 0 on success.  Returns EINVAL if the message is malformed, in which
+ * case 'vport' is left in the state set by dpif_linux_vport_init().
+ *
+ * The name, upcall pid, stats, address and options fields of 'vport' point
+ * into 'buf', so the caller must keep 'buf' alive for as long as 'vport' is
+ * in use. */
+static int
+dpif_linux_vport_from_ofpbuf(struct dpif_linux_vport *vport,
+                             const struct ofpbuf *buf)
+{
+    static const struct nl_policy ovs_vport_policy[] = {
+        [OVS_VPORT_ATTR_PORT_NO] = { .type = NL_A_U32 },
+        [OVS_VPORT_ATTR_TYPE] = { .type = NL_A_U32 },
+        [OVS_VPORT_ATTR_NAME] = { .type = NL_A_STRING, .max_len = IFNAMSIZ },
+        [OVS_VPORT_ATTR_UPCALL_PID] = { .type = NL_A_U32 },
+        [OVS_VPORT_ATTR_STATS] = { NL_POLICY_FOR(struct ovs_vport_stats),
+                                   .optional = true },
+        [OVS_VPORT_ATTR_ADDRESS] = { .type = NL_A_UNSPEC,
+                                     .min_len = ETH_ADDR_LEN,
+                                     .max_len = ETH_ADDR_LEN,
+                                     .optional = true },
+        [OVS_VPORT_ATTR_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
+    };
+
+    struct nlattr *a[ARRAY_SIZE(ovs_vport_policy)];
+    struct ovs_header *ovs_header;
+    struct genlmsghdr *genl;
+    struct nlmsghdr *nlmsg;
+    struct ofpbuf b;
+
+    /* Reset first so that an early return leaves 'vport' "empty". */
+    dpif_linux_vport_init(vport);
+
+    ofpbuf_use_const(&b, buf->data, buf->size);
+    nlmsg = ofpbuf_try_pull(&b, sizeof *nlmsg);
+    genl = ofpbuf_try_pull(&b, sizeof *genl);
+    ovs_header = ofpbuf_try_pull(&b, sizeof *ovs_header);
+
+    if (!nlmsg || !genl || !ovs_header) {
+        return EINVAL;
+    }
+    if (nlmsg->nlmsg_type != ovs_vport_family) {
+        return EINVAL;
+    }
+    if (!nl_policy_parse(&b, 0, ovs_vport_policy, a,
+                         ARRAY_SIZE(ovs_vport_policy))) {
+        return EINVAL;
+    }
+
+    /* Header fields and scalar attributes are copied by value. */
+    vport->cmd = genl->cmd;
+    vport->dp_ifindex = ovs_header->dp_ifindex;
+    vport->port_no = nl_attr_get_u32(a[OVS_VPORT_ATTR_PORT_NO]);
+    vport->type = nl_attr_get_u32(a[OVS_VPORT_ATTR_TYPE]);
+    vport->name = nl_attr_get_string(a[OVS_VPORT_ATTR_NAME]);
+
+    /* The remaining fields are pointers into 'buf' and are set only when the
+     * corresponding attribute is present. */
+    if (a[OVS_VPORT_ATTR_UPCALL_PID]) {
+        vport->upcall_pid = nl_attr_get(a[OVS_VPORT_ATTR_UPCALL_PID]);
+    }
+    if (a[OVS_VPORT_ATTR_STATS]) {
+        vport->stats = nl_attr_get(a[OVS_VPORT_ATTR_STATS]);
+    }
+    if (a[OVS_VPORT_ATTR_ADDRESS]) {
+        vport->address = nl_attr_get(a[OVS_VPORT_ATTR_ADDRESS]);
+    }
+    if (a[OVS_VPORT_ATTR_OPTIONS]) {
+        const struct nlattr *opts = a[OVS_VPORT_ATTR_OPTIONS];
+
+        vport->options = nl_attr_get(opts);
+        vport->options_len = nl_attr_get_size(opts);
+    }
+
+    return 0;
+}
+
+/* Serializes 'vport' into 'buf', which must be empty on entry: a Generic
+ * Netlink request header for the vport family, then a "struct ovs_header",
+ * then one Netlink attribute for every field of 'vport' that is set.
+ *
+ * A field counts as unset when it still has its "empty" value: UINT32_MAX for
+ * the port number, OVS_VPORT_TYPE_UNSPEC for the type, and a null pointer for
+ * everything else. */
+static void
+dpif_linux_vport_to_ofpbuf(const struct dpif_linux_vport *vport,
+                           struct ofpbuf *buf)
+{
+    struct ovs_header *hdr;
+
+    nl_msg_put_genlmsghdr(buf, 0, ovs_vport_family,
+                          NLM_F_REQUEST | NLM_F_ECHO,
+                          vport->cmd, OVS_VPORT_VERSION);
+
+    hdr = ofpbuf_put_uninit(buf, sizeof *hdr);
+    hdr->dp_ifindex = vport->dp_ifindex;
+
+    /* Attributes are appended in this fixed order. */
+    if (vport->port_no != UINT32_MAX) {
+        nl_msg_put_u32(buf, OVS_VPORT_ATTR_PORT_NO, vport->port_no);
+    }
+    if (vport->type != OVS_VPORT_TYPE_UNSPEC) {
+        nl_msg_put_u32(buf, OVS_VPORT_ATTR_TYPE, vport->type);
+    }
+    if (vport->name) {
+        nl_msg_put_string(buf, OVS_VPORT_ATTR_NAME, vport->name);
+    }
+    if (vport->upcall_pid) {
+        nl_msg_put_u32(buf, OVS_VPORT_ATTR_UPCALL_PID, *vport->upcall_pid);
+    }
+    if (vport->stats) {
+        nl_msg_put_unspec(buf, OVS_VPORT_ATTR_STATS,
+                          vport->stats, sizeof *vport->stats);
+    }
+    if (vport->address) {
+        nl_msg_put_unspec(buf, OVS_VPORT_ATTR_ADDRESS,
+                          vport->address, ETH_ADDR_LEN);
+    }
+    if (vport->options) {
+        nl_msg_put_nested(buf, OVS_VPORT_ATTR_OPTIONS,
+                          vport->options, vport->options_len);
+    }
+}
+
+/* Resets 'vport' to the "empty" state: every byte zeroed, except that the
+ * port number is set to UINT32_MAX, the value dpif_linux_vport_to_ofpbuf()
+ * treats as "no port number". */
+void
+dpif_linux_vport_init(struct dpif_linux_vport *vport)
+{
+    memset(vport, 0, sizeof *vport);
+
+    /* 0 could be a real port number, so "unset" needs a different marker. */
+    vport->port_no = UINT32_MAX;
+}
+
+/* Executes 'request' in the kernel datapath. If the command fails, returns a
+ * positive errno value. Otherwise, if 'reply' and 'bufp' are null, returns 0
+ * without doing anything else. If 'reply' and 'bufp' are nonnull, then the
+ * result of the command is expected to be an ovs_vport also, which is decoded
+ * and stored in '*reply' and '*bufp'. The caller must free '*bufp' when the
+ * reply is no longer needed ('reply' will contain pointers into '*bufp'). */
+int
+dpif_linux_vport_transact(const struct dpif_linux_vport *request,
+ struct dpif_linux_vport *reply,
+ struct ofpbuf **bufp)
+{
+ struct ofpbuf *request_buf;
+ int error;
+
+ assert((reply != NULL) == (bufp != NULL));
+
+ error = dpif_linux_init();