+ struct dpif_linux *dpif = dpif_linux_cast(dpif_);
+
+ return dpif_linux_execute__(dpif->dp_ifindex, key, key_len,
+ actions, actions_len, packet);
+}
+
+/* Carries out the 'n_ops' operations in 'ops' as one batch of Netlink
+ * transactions on 'genl_sock'.
+ *
+ * Only DPIF_OP_FLOW_PUT and DPIF_OP_EXECUTE operations are accepted.  The
+ * outcome of each operation is written to that operation's own 'error'
+ * member.  A flow put that supplies a 'stats' pointer sets NLM_F_ECHO on its
+ * request so that the statistics can be decoded from the reply. */
+static void
+dpif_linux_operate(struct dpif *dpif_, union dpif_op **ops, size_t n_ops)
+{
+    struct dpif_linux *dpif = dpif_linux_cast(dpif_);
+    struct nl_transaction *txns = xmalloc(n_ops * sizeof *txns);
+    struct nl_transaction **txnsp;
+    size_t i;
+
+    /* Step 1: encode one request buffer per operation. */
+    for (i = 0; i < n_ops; i++) {
+        union dpif_op *op = ops[i];
+
+        switch (op->type) {
+        case DPIF_OP_FLOW_PUT: {
+            struct dpif_flow_put *put = &op->flow_put;
+            struct dpif_linux_flow request;
+
+            dpif_linux_init_flow_put(dpif_, put->flags,
+                                     put->key, put->key_len,
+                                     put->actions, put->actions_len,
+                                     &request);
+            if (put->stats) {
+                /* Ask for the flow to be echoed back so its statistics can
+                 * be extracted in step 3. */
+                request.nlmsg_flags |= NLM_F_ECHO;
+            }
+            txns[i].request = ofpbuf_new(1024);
+            dpif_linux_flow_to_ofpbuf(&request, txns[i].request);
+            break;
+        }
+
+        case DPIF_OP_EXECUTE: {
+            struct dpif_execute *execute = &op->execute;
+
+            txns[i].request = dpif_linux_encode_execute(
+                dpif->dp_ifindex, execute->key, execute->key_len,
+                execute->actions, execute->actions_len, execute->packet);
+            break;
+        }
+
+        default:
+            NOT_REACHED();
+        }
+    }
+
+    /* Step 2: the transact call takes an array of pointers, so build a
+     * temporary one that refers to each element of 'txns'. */
+    txnsp = xmalloc(n_ops * sizeof *txnsp);
+    for (i = 0; i < n_ops; i++) {
+        txnsp[i] = &txns[i];
+    }
+    nl_sock_transact_multiple(genl_sock, txnsp, n_ops);
+    free(txnsp);
+
+    /* Step 3: report per-operation results and release the buffers. */
+    for (i = 0; i < n_ops; i++) {
+        struct nl_transaction *txn = &txns[i];
+        union dpif_op *op = ops[i];
+
+        switch (op->type) {
+        case DPIF_OP_FLOW_PUT: {
+            struct dpif_flow_put *put = &op->flow_put;
+            int error = txn->error;
+
+            if (error == 0 && put->stats != NULL) {
+                struct dpif_linux_flow reply;
+
+                error = dpif_linux_flow_from_ofpbuf(&reply, txn->reply);
+                if (error == 0) {
+                    dpif_linux_flow_get_stats(&reply, put->stats);
+                }
+            }
+            put->error = error;
+            break;
+        }
+
+        case DPIF_OP_EXECUTE:
+            op->execute.error = txn->error;
+            break;
+
+        default:
+            NOT_REACHED();
+        }
+
+        ofpbuf_delete(txn->request);
+        ofpbuf_delete(txn->reply);
+    }
+    free(txns);
+}
+
+/* Copies the listen mask currently stored in 'dpif_' into '*listen_mask'.
+ * Always returns 0. */
+static int
+dpif_linux_recv_get_mask(const struct dpif *dpif_, int *listen_mask)
+{
+    *listen_mask = dpif_linux_cast(dpif_)->listen_mask;
+
+    return 0;
+}
+
+/* Walks every port of 'dpif_' and issues an OVS_VPORT_CMD_SET request that
+ * assigns the port the Netlink pid returned by dpif_linux_port_get_pid__()
+ * for DPIF_UC_MISS.
+ *
+ * A failure on one port is logged (rate-limited) and does not stop the walk;
+ * nothing is reported to the caller. */
+static void
+set_upcall_pids(struct dpif *dpif_)
+{
+    struct dpif_linux *dpif = dpif_linux_cast(dpif_);
+    struct dpif_port_dump port_dump;
+    struct dpif_port port;
+
+    DPIF_PORT_FOR_EACH (&port, &port_dump, &dpif->dpif) {
+        uint32_t upcall_pid = dpif_linux_port_get_pid__(dpif_, port.port_no,
+                                                        DPIF_UC_MISS);
+        struct dpif_linux_vport request;
+        int error;
+
+        dpif_linux_vport_init(&request);
+        request.cmd = OVS_VPORT_CMD_SET;
+        request.dp_ifindex = dpif->dp_ifindex;
+        request.port_no = port.port_no;
+        request.upcall_pid = &upcall_pid;
+
+        /* No reply is requested; only the status matters here. */
+        error = dpif_linux_vport_transact(&request, NULL, NULL);
+        if (error) {
+            VLOG_WARN_RL(&error_rl, "%s: failed to set upcall pid on port: %s",
+                         dpif_name(&dpif->dpif), strerror(error));
+        } else {
+            VLOG_DBG("%s: assigning port %"PRIu32" to netlink pid %"PRIu32,
+                     dpif_name(&dpif->dpif), request.port_no,
+                     upcall_pid);
+        }
+    }
+}
+
+/* Changes the listen mask of 'dpif_' to 'listen_mask'.
+ *
+ * - If the mask is unchanged, nothing is done.
+ * - If the new mask is zero, the upcall sockets are destroyed.
+ * - If the old mask was zero, an epoll instance and N_UPCALL_SOCKS Netlink
+ *   sockets are created, and each socket is registered for EPOLLIN with its
+ *   array index stored as the epoll user data.
+ *
+ * After a successful change the per-port upcall pids are reprogrammed.
+ * Returns 0 on success, otherwise a positive errno value; on a socket or
+ * epoll_ctl() failure everything created so far is torn down via
+ * destroy_upcall_socks(). */
+static int
+dpif_linux_recv_set_mask(struct dpif *dpif_, int listen_mask)
+{
+    struct dpif_linux *dpif = dpif_linux_cast(dpif_);
+    int error;
+    int i;
+
+    if (dpif->listen_mask == listen_mask) {
+        return 0;
+    }
+
+    if (listen_mask == 0) {
+        destroy_upcall_socks(dpif);
+    } else if (dpif->listen_mask == 0) {
+        dpif->epoll_fd = epoll_create(N_UPCALL_SOCKS);
+        if (dpif->epoll_fd < 0) {
+            return errno;
+        }
+
+        for (i = 0; i < N_UPCALL_SOCKS; i++) {
+            struct epoll_event event;
+
+            error = nl_sock_create(NETLINK_GENERIC, &dpif->upcall_socks[i]);
+            if (error) {
+                goto fail;
+            }
+
+            memset(&event, 0, sizeof event);
+            event.events = EPOLLIN;
+            event.data.u32 = i;     /* Lets the receiver find the socket. */
+            if (epoll_ctl(dpif->epoll_fd, EPOLL_CTL_ADD,
+                          nl_sock_fd(dpif->upcall_socks[i]), &event) < 0) {
+                /* Capture errno before cleanup can overwrite it. */
+                error = errno;
+                goto fail;
+            }
+        }
+
+        dpif->ready_mask = 0;
+    }
+
+    dpif->listen_mask = listen_mask;
+    set_upcall_pids(dpif_);
+
+    return 0;
+
+fail:
+    destroy_upcall_socks(dpif);
+    return error;
+}
+
+/* Translates 'queue_id' into a priority value and stores it in '*priority'.
+ *
+ * The priority is TC_H_MAKE(1 << 16, queue_id + 1).  Queue ids of 0xf000 and
+ * above are rejected with EINVAL and leave '*priority' untouched; otherwise
+ * returns 0.  'dpif' is not used. */
+static int
+dpif_linux_queue_to_priority(const struct dpif *dpif OVS_UNUSED,
+                             uint32_t queue_id, uint32_t *priority)
+{
+    if (queue_id >= 0xf000) {
+        return EINVAL;
+    }
+
+    *priority = TC_H_MAKE(1 << 16, queue_id + 1);
+    return 0;
+}
+
+/* Decodes the Netlink message held in 'buf' into '*upcall' and stores the
+ * datapath ifindex from its ovs_header in '*dp_ifindex'.
+ *
+ * The message must belong to 'ovs_packet_family', satisfy the attribute
+ * policy below, and carry either OVS_PACKET_CMD_MISS or
+ * OVS_PACKET_CMD_ACTION; anything else yields EINVAL with '*upcall' left
+ * unmodified.
+ *
+ * On success returns 0.  'buf' itself becomes 'upcall->packet', with its
+ * 'data' and 'size' narrowed to the packet attribute's payload, and
+ * 'upcall->key' points into the same message memory. */
+static int
+parse_odp_packet(struct ofpbuf *buf, struct dpif_upcall *upcall,
+                 int *dp_ifindex)
+{
+    static const struct nl_policy ovs_packet_policy[] = {
+        /* Always present. */
+        [OVS_PACKET_ATTR_PACKET] = { .type = NL_A_UNSPEC,
+                                     .min_len = ETH_HEADER_LEN },
+        [OVS_PACKET_ATTR_KEY] = { .type = NL_A_NESTED },
+
+        /* OVS_PACKET_CMD_ACTION only. */
+        [OVS_PACKET_ATTR_USERDATA] = { .type = NL_A_U64, .optional = true },
+    };
+
+    struct nlattr *attrs[ARRAY_SIZE(ovs_packet_policy)];
+    struct ovs_header *ovs_header;
+    struct genlmsghdr *genl;
+    struct nlmsghdr *nlmsg;
+    struct ofpbuf msg;
+    int type;
+
+    /* Work on a read-only view so that 'buf' stays intact until the message
+     * has been fully validated. */
+    ofpbuf_use_const(&msg, buf->data, buf->size);
+
+    nlmsg = ofpbuf_try_pull(&msg, sizeof *nlmsg);
+    genl = ofpbuf_try_pull(&msg, sizeof *genl);
+    ovs_header = ofpbuf_try_pull(&msg, sizeof *ovs_header);
+    if (!nlmsg || !genl || !ovs_header) {
+        return EINVAL;
+    }
+    if (nlmsg->nlmsg_type != ovs_packet_family) {
+        return EINVAL;
+    }
+    if (!nl_policy_parse(&msg, 0, ovs_packet_policy, attrs,
+                         ARRAY_SIZE(ovs_packet_policy))) {
+        return EINVAL;
+    }
+
+    switch (genl->cmd) {
+    case OVS_PACKET_CMD_MISS:
+        type = DPIF_UC_MISS;
+        break;
+
+    case OVS_PACKET_CMD_ACTION:
+        type = DPIF_UC_ACTION;
+        break;
+
+    default:
+        return EINVAL;
+    }
+
+    memset(upcall, 0, sizeof *upcall);
+    upcall->type = type;
+
+    upcall->packet = buf;
+    upcall->packet->data = (void *) nl_attr_get(attrs[OVS_PACKET_ATTR_PACKET]);
+    upcall->packet->size = nl_attr_get_size(attrs[OVS_PACKET_ATTR_PACKET]);
+
+    upcall->key = (void *) nl_attr_get(attrs[OVS_PACKET_ATTR_KEY]);
+    upcall->key_len = nl_attr_get_size(attrs[OVS_PACKET_ATTR_KEY]);
+
+    /* Userdata is optional; report 0 when the attribute is absent. */
+    if (attrs[OVS_PACKET_ATTR_USERDATA]) {
+        upcall->userdata = nl_attr_get_u64(attrs[OVS_PACKET_ATTR_USERDATA]);
+    } else {
+        upcall->userdata = 0;
+    }
+
+    *dp_ifindex = ovs_header->dp_ifindex;
+
+    return 0;
+}
+
+/* Tries to receive one upcall for 'dpif_' without blocking.
+ *
+ * Returns 0 and fills in '*upcall' when a message is read whose datapath
+ * ifindex matches this datapath and whose type is selected by the listen
+ * mask.  Messages that parse correctly but do not match are discarded and
+ * reading continues.
+ *
+ * Returns EAGAIN if nothing is being listened for, if no suitable message is
+ * available, or once 50 read attempts have been made in this call.  Any other
+ * receive or parse error is returned as-is. */
+static int
+dpif_linux_recv(struct dpif *dpif_, struct dpif_upcall *upcall)
+{
+    struct dpif_linux *dpif = dpif_linux_cast(dpif_);
+    int n_reads = 0;
+
+    if (!dpif->listen_mask) {
+        return EAGAIN;
+    }
+
+    /* Query epoll again only once every socket flagged by the previous query
+     * has been serviced. */
+    if (!dpif->ready_mask) {
+        struct epoll_event events[N_UPCALL_SOCKS];
+        int n_events;
+        int i;
+
+        /* Zero timeout: just poll, retrying if interrupted by a signal. */
+        for (;;) {
+            n_events = epoll_wait(dpif->epoll_fd, events, N_UPCALL_SOCKS, 0);
+            if (n_events >= 0 || errno != EINTR) {
+                break;
+            }
+        }
+        if (n_events < 0) {
+            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
+            VLOG_WARN_RL(&rl, "epoll_wait failed (%s)", strerror(errno));
+        }
+
+        /* The user data of each event is the index of the ready socket. */
+        for (i = 0; i < n_events; i++) {
+            dpif->ready_mask |= 1u << events[i].data.u32;
+        }
+    }
+
+    while (dpif->ready_mask) {
+        int sock_idx = ffs(dpif->ready_mask) - 1;
+        struct nl_sock *sock = dpif->upcall_socks[sock_idx];
+
+        dpif->ready_mask &= ~(1u << sock_idx);
+
+        for (;;) {
+            struct ofpbuf *buf;
+            int dp_ifindex;
+            int error;
+
+            /* Bound the work done in a single call. */
+            n_reads++;
+            if (n_reads > 50) {
+                return EAGAIN;
+            }
+
+            error = nl_sock_recv(sock, &buf, false);
+            if (error == EAGAIN) {
+                /* This socket is drained; move on to the next ready one. */
+                break;
+            }
+            if (error) {
+                return error;
+            }
+
+            error = parse_odp_packet(buf, upcall, &dp_ifindex);
+            if (error) {
+                ofpbuf_delete(buf);
+                return error;
+            }
+
+            if (dp_ifindex == dpif->dp_ifindex
+                && (dpif->listen_mask & (1u << upcall->type))) {
+                return 0;
+            }
+
+            /* Not for us: drop it and keep reading. */
+            ofpbuf_delete(buf);
+        }
+    }
+
+    return EAGAIN;
+}
+
+/* Registers the epoll descriptor of 'dpif_' with the poll loop for POLLIN.
+ * Does nothing when the listen mask is zero. */
+static void
+dpif_linux_recv_wait(struct dpif *dpif_)
+{
+    struct dpif_linux *dpif = dpif_linux_cast(dpif_);
+
+    if (dpif->listen_mask) {
+        poll_fd_wait(dpif->epoll_fd, POLLIN);
+    }
+}
+
+/* Drains every upcall socket of 'dpif_' via nl_sock_drain().  Does nothing
+ * when the listen mask is zero. */
+static void
+dpif_linux_recv_purge(struct dpif *dpif_)
+{
+    struct dpif_linux *dpif = dpif_linux_cast(dpif_);
+
+    if (dpif->listen_mask) {
+        int sock_idx;
+
+        for (sock_idx = 0; sock_idx < N_UPCALL_SOCKS; sock_idx++) {
+            nl_sock_drain(dpif->upcall_socks[sock_idx]);
+        }
+    }
+}
+
+const struct dpif_class dpif_linux_class = { /* Positional initializer: the
+ * entries must stay in the member order of struct dpif_class, which is
+ * declared outside this view. */
+ "system", /* NOTE(review): presumably the datapath type name -- confirm. */
+ dpif_linux_enumerate,
+ dpif_linux_open,
+ dpif_linux_close,
+ dpif_linux_destroy,
+ dpif_linux_run,
+ dpif_linux_wait,
+ dpif_linux_get_stats,
+ dpif_linux_port_add, /* Port entries start here. */
+ dpif_linux_port_del,
+ dpif_linux_port_query_by_number,
+ dpif_linux_port_query_by_name,
+ dpif_linux_get_max_ports,
+ dpif_linux_port_get_pid,
+ dpif_linux_port_dump_start,
+ dpif_linux_port_dump_next,
+ dpif_linux_port_dump_done,
+ dpif_linux_port_poll,
+ dpif_linux_port_poll_wait,
+ dpif_linux_flow_get, /* Flow entries start here. */
+ dpif_linux_flow_put,
+ dpif_linux_flow_del,
+ dpif_linux_flow_flush,
+ dpif_linux_flow_dump_start,
+ dpif_linux_flow_dump_next,
+ dpif_linux_flow_dump_done,
+ dpif_linux_execute,
+ dpif_linux_operate, /* Batched flow-put and execute operations. */
+ dpif_linux_recv_get_mask, /* Upcall listen mask accessors. */
+ dpif_linux_recv_set_mask,
+ dpif_linux_queue_to_priority,
+ dpif_linux_recv, /* Upcall receive, wait and purge. */
+ dpif_linux_recv_wait,
+ dpif_linux_recv_purge,
+};
+\f
+/* One-time setup of the module's Generic Netlink state: resolves the
+ * datapath, vport, flow and packet families, creates 'genl_sock', looks up
+ * the vport multicast group and creates the 'nln' notifier on it.
+ *
+ * The result is cached in a static variable, so once a call has produced a
+ * non-negative result later calls return it without redoing the work.
+ * Returns 0 on success, otherwise the error from the first step that
+ * failed. */
+static int
+dpif_linux_init(void)
+{
+    static int error = -1;
+    static struct dpif_linux_vport vport;
+    unsigned int ovs_vport_mcgroup;
+
+    if (error >= 0) {
+        /* Already attempted; report the cached outcome. */
+        return error;
+    }
+
+    error = nl_lookup_genl_family(OVS_DATAPATH_FAMILY, &ovs_datapath_family);
+    if (error) {
+        VLOG_ERR("Generic Netlink family '%s' does not exist.  "
+                 "The Open vSwitch kernel module is probably not loaded.",
+                 OVS_DATAPATH_FAMILY);
+        return error;
+    }
+
+    error = nl_lookup_genl_family(OVS_VPORT_FAMILY, &ovs_vport_family);
+    if (error) {
+        return error;
+    }
+
+    error = nl_lookup_genl_family(OVS_FLOW_FAMILY, &ovs_flow_family);
+    if (error) {
+        return error;
+    }
+
+    error = nl_lookup_genl_family(OVS_PACKET_FAMILY, &ovs_packet_family);
+    if (error) {
+        return error;
+    }
+
+    error = nl_sock_create(NETLINK_GENERIC, &genl_sock);
+    if (error) {
+        return error;
+    }
+
+    error = nl_lookup_genl_mcgroup(OVS_VPORT_FAMILY, OVS_VPORT_MCGROUP,
+                                   &ovs_vport_mcgroup,
+                                   OVS_VPORT_MCGROUP_FALLBACK_ID);
+    if (error) {
+        return error;
+    }
+
+    /* 'vport' is static because the notifier keeps the pointer and hands it
+     * to dpif_linux_nln_parse() as scratch space. */
+    nln = nln_create(NETLINK_GENERIC, ovs_vport_mcgroup,
+                     dpif_linux_nln_parse, &vport);
+
+    return error;
+}
+
+/* Returns true if 'name' is the name of a vport whose type is
+ * OVS_VPORT_TYPE_INTERNAL, false otherwise.
+ *
+ * If the vport query fails the result is false.  ENODEV and ENOENT are
+ * treated as the ordinary "no such vport" outcome; any other error is also
+ * logged (rate-limited).
+ *
+ * 'reply' is inspected only after a successful query, so this function does
+ * not depend on what dpif_linux_vport_get() leaves in it on failure, and the
+ * type is read before the reply buffer is released. */
+bool
+dpif_linux_is_internal_device(const char *name)
+{
+    struct dpif_linux_vport reply;
+    struct ofpbuf *buf;
+    bool is_internal;
+    int error;
+
+    error = dpif_linux_vport_get(name, &reply, &buf);
+    if (error) {
+        if (error != ENODEV && error != ENOENT) {
+            VLOG_WARN_RL(&error_rl, "%s: vport query failed (%s)",
+                         name, strerror(error));
+        }
+        return false;
+    }
+
+    is_internal = reply.type == OVS_VPORT_TYPE_INTERNAL;
+    ofpbuf_delete(buf);
+
+    return is_internal;
+}
+
+/* Asks the datapath identified by 'dp_ifindex' to transmit the 'size' bytes
+ * at 'data' out of port 'port_no'.
+ *
+ * This is done by executing a single OVS_ACTION_ATTR_OUTPUT action on the
+ * packet, using a flow key extracted from the packet itself.  Returns the
+ * result of dpif_linux_execute__(). */
+int
+dpif_linux_vport_send(int dp_ifindex, uint32_t port_no,
+                      const void *data, size_t size)
+{
+    struct odputil_keybuf keybuf;
+    struct ofpbuf actions;
+    struct ofpbuf packet;
+    struct ofpbuf key;
+    struct flow flow;
+    uint64_t action;
+
+    /* The one output action is built in the 8 bytes of 'action' on the
+     * stack. */
+    ofpbuf_use_stack(&actions, &action, sizeof action);
+    nl_msg_put_u32(&actions, OVS_ACTION_ATTR_OUTPUT, port_no);
+
+    /* Derive the flow key from the packet contents. */
+    ofpbuf_use_const(&packet, data, size);
+    flow_extract(&packet, 0, htonll(0), 0, &flow);
+
+    ofpbuf_use_stack(&key, &keybuf, sizeof keybuf);
+    odp_flow_key_from_flow(&key, &flow);
+
+    return dpif_linux_execute__(dp_ifindex, key.data, key.size,
+                                actions.data, actions.size, &packet);
+}
+
+/* Notifier parse callback: decodes 'buf' into the struct dpif_linux_vport
+ * that 'vport_' points to.  Returns true if decoding succeeded. */
+static bool
+dpif_linux_nln_parse(struct ofpbuf *buf, void *vport_)
+{
+    int error = dpif_linux_vport_from_ofpbuf(vport_, buf);
+
+    return !error;
+}
+
+/* Port change callback.  'vport_' is the decoded vport message, or null;
+ * 'dpif_' is the struct dpif_linux to update.
+ *
+ * A null 'vport_' sets 'change_error'.  Otherwise, if the message is for this
+ * datapath and its command is NEW, DEL or SET, the vport's name is added to
+ * 'changed_ports'; any other message is ignored. */
+static void
+dpif_linux_port_changed(const void *vport_, void *dpif_)
+{
+    const struct dpif_linux_vport *vport = vport_;
+    struct dpif_linux *dpif = dpif_;
+
+    if (!vport) {
+        dpif->change_error = true;
+        return;
+    }
+
+    if (vport->dp_ifindex != dpif->dp_ifindex) {
+        return;
+    }
+
+    if (vport->cmd != OVS_VPORT_CMD_NEW
+        && vport->cmd != OVS_VPORT_CMD_DEL
+        && vport->cmd != OVS_VPORT_CMD_SET) {
+        return;
+    }
+
+    VLOG_DBG("port_changed: dpif:%s vport:%s cmd:%"PRIu8,
+             dpif->dpif.full_name, vport->name, vport->cmd);
+    sset_add(&dpif->changed_ports, vport->name);
+}
+\f
+/* Decodes 'buf', a Netlink message holding a "struct ovs_header" followed by
+ * vport attributes, into 'vport'.  Returns 0 on success or EINVAL if the
+ * message is truncated, is not of 'ovs_vport_family', or violates the
+ * attribute policy below.
+ *
+ * 'vport' is reset to empty values first, so it is in that state on failure.
+ * On success its pointer members refer to memory inside 'buf'; the caller
+ * must keep 'buf' alive for as long as 'vport' is in use. */
+static int
+dpif_linux_vport_from_ofpbuf(struct dpif_linux_vport *vport,
+                             const struct ofpbuf *buf)
+{
+    static const struct nl_policy ovs_vport_policy[] = {
+        [OVS_VPORT_ATTR_PORT_NO] = { .type = NL_A_U32 },
+        [OVS_VPORT_ATTR_TYPE] = { .type = NL_A_U32 },
+        [OVS_VPORT_ATTR_NAME] = { .type = NL_A_STRING, .max_len = IFNAMSIZ },
+        [OVS_VPORT_ATTR_UPCALL_PID] = { .type = NL_A_U32 },
+        [OVS_VPORT_ATTR_STATS] = { NL_POLICY_FOR(struct ovs_vport_stats),
+                                   .optional = true },
+        [OVS_VPORT_ATTR_ADDRESS] = { .type = NL_A_UNSPEC,
+                                     .min_len = ETH_ADDR_LEN,
+                                     .max_len = ETH_ADDR_LEN,
+                                     .optional = true },
+        [OVS_VPORT_ATTR_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
+    };
+
+    struct nlattr *attrs[ARRAY_SIZE(ovs_vport_policy)];
+    struct ovs_header *ovs_header;
+    struct genlmsghdr *genl;
+    struct nlmsghdr *nlmsg;
+    struct ofpbuf msg;
+
+    dpif_linux_vport_init(vport);
+
+    /* Peel the three fixed headers off a read-only view of 'buf'. */
+    ofpbuf_use_const(&msg, buf->data, buf->size);
+    nlmsg = ofpbuf_try_pull(&msg, sizeof *nlmsg);
+    genl = ofpbuf_try_pull(&msg, sizeof *genl);
+    ovs_header = ofpbuf_try_pull(&msg, sizeof *ovs_header);
+    if (!nlmsg || !genl || !ovs_header) {
+        return EINVAL;
+    }
+    if (nlmsg->nlmsg_type != ovs_vport_family) {
+        return EINVAL;
+    }
+    if (!nl_policy_parse(&msg, 0, ovs_vport_policy, attrs,
+                         ARRAY_SIZE(ovs_vport_policy))) {
+        return EINVAL;
+    }
+
+    /* Header fields. */
+    vport->cmd = genl->cmd;
+    vport->dp_ifindex = ovs_header->dp_ifindex;
+
+    /* Attributes read unconditionally. */
+    vport->port_no = nl_attr_get_u32(attrs[OVS_VPORT_ATTR_PORT_NO]);
+    vport->type = nl_attr_get_u32(attrs[OVS_VPORT_ATTR_TYPE]);
+    vport->name = nl_attr_get_string(attrs[OVS_VPORT_ATTR_NAME]);
+
+    /* Attributes copied only when present. */
+    if (attrs[OVS_VPORT_ATTR_UPCALL_PID]) {
+        vport->upcall_pid = nl_attr_get(attrs[OVS_VPORT_ATTR_UPCALL_PID]);
+    }
+    if (attrs[OVS_VPORT_ATTR_STATS]) {
+        vport->stats = nl_attr_get(attrs[OVS_VPORT_ATTR_STATS]);
+    }
+    if (attrs[OVS_VPORT_ATTR_ADDRESS]) {
+        vport->address = nl_attr_get(attrs[OVS_VPORT_ATTR_ADDRESS]);
+    }
+    if (attrs[OVS_VPORT_ATTR_OPTIONS]) {
+        vport->options = nl_attr_get(attrs[OVS_VPORT_ATTR_OPTIONS]);
+        vport->options_len = nl_attr_get_size(attrs[OVS_VPORT_ATTR_OPTIONS]);
+    }
+
+    return 0;
+}
+
+/* Serializes 'vport' into 'buf', which must be empty on entry: a Generic
+ * Netlink header for 'ovs_vport_family' (flags NLM_F_REQUEST | NLM_F_ECHO,
+ * command 'vport->cmd'), then a "struct ovs_header", then one Netlink
+ * attribute for each member of 'vport' that is not at its "empty" value. */
+static void
+dpif_linux_vport_to_ofpbuf(const struct dpif_linux_vport *vport,
+                           struct ofpbuf *buf)
+{
+    struct ovs_header *hdr;
+
+    nl_msg_put_genlmsghdr(buf, 0, ovs_vport_family,
+                          NLM_F_REQUEST | NLM_F_ECHO,
+                          vport->cmd, OVS_VPORT_VERSION);
+
+    hdr = ofpbuf_put_uninit(buf, sizeof *hdr);
+    hdr->dp_ifindex = vport->dp_ifindex;
+
+    /* UINT32_MAX is the "no port number" value set by
+     * dpif_linux_vport_init(). */
+    if (vport->port_no != UINT32_MAX) {
+        nl_msg_put_u32(buf, OVS_VPORT_ATTR_PORT_NO, vport->port_no);
+    }
+    if (vport->type != OVS_VPORT_TYPE_UNSPEC) {
+        nl_msg_put_u32(buf, OVS_VPORT_ATTR_TYPE, vport->type);
+    }
+    if (vport->name) {
+        nl_msg_put_string(buf, OVS_VPORT_ATTR_NAME, vport->name);
+    }
+    if (vport->upcall_pid) {
+        nl_msg_put_u32(buf, OVS_VPORT_ATTR_UPCALL_PID, *vport->upcall_pid);
+    }
+    if (vport->stats) {
+        nl_msg_put_unspec(buf, OVS_VPORT_ATTR_STATS,
+                          vport->stats, sizeof *vport->stats);
+    }
+    if (vport->address) {
+        nl_msg_put_unspec(buf, OVS_VPORT_ATTR_ADDRESS,
+                          vport->address, ETH_ADDR_LEN);
+    }
+    if (vport->options) {
+        nl_msg_put_nested(buf, OVS_VPORT_ATTR_OPTIONS,
+                          vport->options, vport->options_len);
+    }
+}
+
+/* Resets 'vport' to its "empty" state: every byte is zeroed and 'port_no' is
+ * then set to UINT32_MAX. */
+void
+dpif_linux_vport_init(struct dpif_linux_vport *vport)
+{
+    memset(vport, 0, sizeof(*vport));
+
+    /* 0 would be read as a real port number, so use UINT32_MAX instead. */
+    vport->port_no = UINT32_MAX;
+}
+
+/* Executes 'request' in the kernel datapath. If the command fails, returns a
+ * positive errno value. Otherwise, if 'reply' and 'bufp' are null, returns 0
+ * without doing anything else. If 'reply' and 'bufp' are nonnull, then the
+ * result of the command is expected to be an ovs_vport also, which is decoded
+ * and stored in '*reply' and '*bufp'. The caller must free '*bufp' when the
+ * reply is no longer needed ('reply' will contain pointers into '*bufp'). */
+int
+dpif_linux_vport_transact(const struct dpif_linux_vport *request,
+ struct dpif_linux_vport *reply,
+ struct ofpbuf **bufp)
+{
+ struct ofpbuf *request_buf;