+/* Encodes 'execute' for datapath 'dp_ifindex' and sends it as a single
+ * netlink transaction, discarding any reply.  Returns 0 on success,
+ * otherwise a positive errno value. */
+static int
+dpif_linux_execute__(int dp_ifindex, const struct dpif_execute *execute)
+{
+    uint64_t stub[1024 / 8];
+    struct ofpbuf buf;
+    int error;
+
+    /* Stack-backed buffer: avoids heap traffic for typical request sizes. */
+    ofpbuf_use_stub(&buf, stub, sizeof stub);
+    dpif_linux_encode_execute(dp_ifindex, execute, &buf);
+    error = nl_sock_transact(genl_sock, &buf, NULL);
+    ofpbuf_uninit(&buf);
+
+    return error;
+}
+
+/* dpif 'execute' callback: resolves the datapath ifindex and delegates to
+ * dpif_linux_execute__().  Returns 0 on success or a positive errno value. */
+static int
+dpif_linux_execute(struct dpif *dpif_, const struct dpif_execute *execute)
+{
+    return dpif_linux_execute__(dpif_linux_cast(dpif_)->dp_ifindex, execute);
+}
+
+/* Maximum number of operations handled in one dpif_linux_operate__() batch;
+ * bounds the stack-allocated auxes[] and txnsp[] arrays below. */
+#define MAX_OPS 50
+
+/* Issues the 'n_ops' operations in 'ops' to the kernel datapath as one
+ * batched netlink exchange and records each operation's outcome in
+ * ops[i]->error.  'n_ops' must not exceed MAX_OPS (asserted below) because
+ * the per-op auxiliary state is kept in fixed-size stack arrays. */
+static void
+dpif_linux_operate__(struct dpif *dpif_, struct dpif_op **ops, size_t n_ops)
+{
+ struct dpif_linux *dpif = dpif_linux_cast(dpif_);
+
+ /* Per-operation bookkeeping: one netlink transaction plus stub-backed
+ * request and reply buffers (1 kB of stack storage each). */
+ struct op_auxdata {
+ struct nl_transaction txn;
+
+ struct ofpbuf request;
+ uint64_t request_stub[1024 / 8];
+
+ struct ofpbuf reply;
+ uint64_t reply_stub[1024 / 8];
+ } auxes[MAX_OPS];
+
+ struct nl_transaction *txnsp[MAX_OPS];
+ size_t i;
+
+ assert(n_ops <= MAX_OPS);
+ /* Pass 1: encode each operation into its own request buffer.  A reply
+ * is requested (via NLM_F_ECHO and a non-null txn.reply) only when the
+ * caller asked for stats to be echoed back. */
+ for (i = 0; i < n_ops; i++) {
+ struct op_auxdata *aux = &auxes[i];
+ struct dpif_op *op = ops[i];
+ struct dpif_flow_put *put;
+ struct dpif_flow_del *del;
+ struct dpif_execute *execute;
+ struct dpif_linux_flow flow;
+
+ ofpbuf_use_stub(&aux->request,
+ aux->request_stub, sizeof aux->request_stub);
+ aux->txn.request = &aux->request;
+
+ ofpbuf_use_stub(&aux->reply, aux->reply_stub, sizeof aux->reply_stub);
+ aux->txn.reply = NULL;
+
+ switch (op->type) {
+ case DPIF_OP_FLOW_PUT:
+ put = &op->u.flow_put;
+ dpif_linux_init_flow_put(dpif_, put, &flow);
+ if (put->stats) {
+ flow.nlmsg_flags |= NLM_F_ECHO;
+ aux->txn.reply = &aux->reply;
+ }
+ dpif_linux_flow_to_ofpbuf(&flow, &aux->request);
+ break;
+
+ case DPIF_OP_FLOW_DEL:
+ del = &op->u.flow_del;
+ dpif_linux_init_flow_del(dpif_, del, &flow);
+ if (del->stats) {
+ flow.nlmsg_flags |= NLM_F_ECHO;
+ aux->txn.reply = &aux->reply;
+ }
+ dpif_linux_flow_to_ofpbuf(&flow, &aux->request);
+ break;
+
+ case DPIF_OP_EXECUTE:
+ execute = &op->u.execute;
+ dpif_linux_encode_execute(dpif->dp_ifindex, execute,
+ &aux->request);
+ break;
+
+ default:
+ NOT_REACHED();
+ }
+ }
+
+ /* Gather the transactions and issue them all in one batch. */
+ for (i = 0; i < n_ops; i++) {
+ txnsp[i] = &auxes[i].txn;
+ }
+ nl_sock_transact_multiple(genl_sock, txnsp, n_ops);
+
+ /* Pass 2: propagate each transaction's error and, for flow put/del ops
+ * that requested stats, decode the echoed flow reply. */
+ for (i = 0; i < n_ops; i++) {
+ struct op_auxdata *aux = &auxes[i];
+ struct nl_transaction *txn = &auxes[i].txn;
+ struct dpif_op *op = ops[i];
+ struct dpif_flow_put *put;
+ struct dpif_flow_del *del;
+
+ op->error = txn->error;
+
+ switch (op->type) {
+ case DPIF_OP_FLOW_PUT:
+ put = &op->u.flow_put;
+ if (put->stats) {
+ if (!op->error) {
+ struct dpif_linux_flow reply;
+
+ op->error = dpif_linux_flow_from_ofpbuf(&reply,
+ txn->reply);
+ if (!op->error) {
+ dpif_linux_flow_get_stats(&reply, put->stats);
+ }
+ }
+
+ /* On any failure, zero the caller's stats rather than
+ * leaving them uninitialized. */
+ if (op->error) {
+ memset(put->stats, 0, sizeof *put->stats);
+ }
+ }
+ break;
+
+ case DPIF_OP_FLOW_DEL:
+ del = &op->u.flow_del;
+ if (del->stats) {
+ if (!op->error) {
+ struct dpif_linux_flow reply;
+
+ op->error = dpif_linux_flow_from_ofpbuf(&reply,
+ txn->reply);
+ if (!op->error) {
+ dpif_linux_flow_get_stats(&reply, del->stats);
+ }
+ }
+
+ if (op->error) {
+ memset(del->stats, 0, sizeof *del->stats);
+ }
+ }
+ break;
+
+ case DPIF_OP_EXECUTE:
+ break;
+
+ default:
+ NOT_REACHED();
+ }
+
+ /* Releases any heap memory the stub buffers may have grown into. */
+ ofpbuf_uninit(&aux->request);
+ ofpbuf_uninit(&aux->reply);
+ }
+}
+
+/* dpif 'operate' callback: splits 'ops' into batches of at most MAX_OPS
+ * and executes each batch with dpif_linux_operate__(). */
+static void
+dpif_linux_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops)
+{
+    size_t start;
+
+    for (start = 0; start < n_ops; start += MAX_OPS) {
+        dpif_linux_operate__(dpif, &ops[start], MIN(n_ops - start, MAX_OPS));
+    }
+}
+
+/* Walks every port of 'dpif_' and tells the kernel which netlink pid should
+ * receive upcalls for that port (OVS_VPORT_CMD_SET with an upcall_pid
+ * attribute).  Failures on individual ports are logged (rate-limited) and
+ * skipped; the remaining ports are still updated. */
+static void
+set_upcall_pids(struct dpif *dpif_)
+{
+ struct dpif_linux *dpif = dpif_linux_cast(dpif_);
+ struct dpif_port_dump port_dump;
+ struct dpif_port port;
+ int error;
+
+ DPIF_PORT_FOR_EACH (&port, &port_dump, &dpif->dpif) {
+ /* The pid comes from the channel assignment for this port; it is 0
+ * when upcalls are disabled (no channels) — TODO confirm against
+ * dpif_linux_port_get_pid(). */
+ uint32_t upcall_pid = dpif_linux_port_get_pid(dpif_, port.port_no);
+ struct dpif_linux_vport vport_request;
+
+ dpif_linux_vport_init(&vport_request);
+ vport_request.cmd = OVS_VPORT_CMD_SET;
+ vport_request.dp_ifindex = dpif->dp_ifindex;
+ vport_request.port_no = port.port_no;
+ vport_request.upcall_pid = &upcall_pid;
+ error = dpif_linux_vport_transact(&vport_request, NULL, NULL);
+ if (!error) {
+ VLOG_DBG("%s: assigning port %"PRIu32" to netlink pid %"PRIu32,
+ dpif_name(&dpif->dpif), vport_request.port_no,
+ upcall_pid);
+ } else {
+ VLOG_WARN_RL(&error_rl, "%s: failed to set upcall pid on port: %s",
+ dpif_name(&dpif->dpif), strerror(error));
+ }
+ }
+}
+
+/* dpif 'recv_set' callback: enables or disables upcall reception.
+ *
+ * Enabling creates an epoll set plus one netlink socket per channel and
+ * registers each socket with the epoll set; disabling tears all of that
+ * down via destroy_channels().  The epoll_fd doubles as the enabled flag
+ * (>= 0 means enabled), so a redundant call is a no-op.  On any setup
+ * failure everything created so far is destroyed and a positive errno
+ * value is returned.  Finally the kernel's per-port upcall pids are
+ * (re)assigned to match the new state. */
+static int
+dpif_linux_recv_set(struct dpif *dpif_, bool enable)
+{
+ struct dpif_linux *dpif = dpif_linux_cast(dpif_);
+
+ /* Already in the requested state? */
+ if ((dpif->epoll_fd >= 0) == enable) {
+ return 0;
+ }
+
+ if (!enable) {
+ destroy_channels(dpif);
+ } else {
+ struct dpif_channel *ch;
+ int error;
+
+ dpif->epoll_fd = epoll_create(N_CHANNELS);
+ if (dpif->epoll_fd < 0) {
+ return errno;
+ }
+
+ for (ch = dpif->channels; ch < &dpif->channels[N_CHANNELS]; ch++) {
+ int indx = ch - dpif->channels;
+ struct epoll_event event;
+
+ error = nl_sock_create(NETLINK_GENERIC, &ch->sock);
+ if (error) {
+ destroy_channels(dpif);
+ return error;
+ }
+
+ /* Tag the epoll event with the channel index so the receive
+ * path can map readiness back to a channel. */
+ memset(&event, 0, sizeof event);
+ event.events = EPOLLIN;
+ event.data.u32 = indx;
+ if (epoll_ctl(dpif->epoll_fd, EPOLL_CTL_ADD, nl_sock_fd(ch->sock),
+ &event) < 0) {
+ error = errno;
+ destroy_channels(dpif);
+ return error;
+ }
+
+ memset(ch->sketches, 0, sizeof ch->sketches);
+ ch->last_poll = LLONG_MIN;
+ }
+
+ dpif->ready_mask = 0;
+ dpif->next_scale = time_msec() + SCALE_INTERVAL;
+ }
+
+ /* Point (or un-point) every port's kernel upcall pid at our sockets. */
+ set_upcall_pids(dpif_);
+
+ return 0;
+}
+
+/* dpif 'queue_to_priority' callback: maps QoS queue 'queue_id' to the Linux
+ * tc priority value stored in '*priority'.  Returns 0 on success, EINVAL if
+ * 'queue_id' is out of range (>= 0xf000). */
+static int
+dpif_linux_queue_to_priority(const struct dpif *dpif OVS_UNUSED,
+                             uint32_t queue_id, uint32_t *priority)
+{
+    if (queue_id >= 0xf000) {
+        return EINVAL;
+    }
+    *priority = TC_H_MAKE(1 << 16, queue_id + 1);
+    return 0;
+}
+
+/* Parses the OVS_PACKET netlink message in 'buf' into 'upcall' and stores
+ * the originating datapath's ifindex in '*dp_ifindex'.  Returns 0 on
+ * success, EINVAL if the message is malformed or not a MISS/ACTION command.
+ *
+ * NOTE: on success, 'buf' itself is retargeted in place — its data/size are
+ * rewritten to cover just the packet payload attribute — and 'upcall'
+ * borrows pointers into 'buf', so 'buf' must outlive 'upcall'. */
+static int
+parse_odp_packet(struct ofpbuf *buf, struct dpif_upcall *upcall,
+ int *dp_ifindex)
+{
+ static const struct nl_policy ovs_packet_policy[] = {
+ /* Always present. */
+ [OVS_PACKET_ATTR_PACKET] = { .type = NL_A_UNSPEC,
+ .min_len = ETH_HEADER_LEN },
+ [OVS_PACKET_ATTR_KEY] = { .type = NL_A_NESTED },
+
+ /* OVS_PACKET_CMD_ACTION only. */
+ [OVS_PACKET_ATTR_USERDATA] = { .type = NL_A_U64, .optional = true },
+ };
+
+ struct ovs_header *ovs_header;
+ struct nlattr *a[ARRAY_SIZE(ovs_packet_policy)];
+ struct nlmsghdr *nlmsg;
+ struct genlmsghdr *genl;
+ struct ofpbuf b;
+ int type;
+
+ /* Parse through a read-only view so 'buf' is untouched on failure. */
+ ofpbuf_use_const(&b, buf->data, buf->size);
+
+ /* Peel the fixed headers, then attribute-parse the remainder. */
+ nlmsg = ofpbuf_try_pull(&b, sizeof *nlmsg);
+ genl = ofpbuf_try_pull(&b, sizeof *genl);
+ ovs_header = ofpbuf_try_pull(&b, sizeof *ovs_header);
+ if (!nlmsg || !genl || !ovs_header
+ || nlmsg->nlmsg_type != ovs_packet_family
+ || !nl_policy_parse(&b, 0, ovs_packet_policy, a,
+ ARRAY_SIZE(ovs_packet_policy))) {
+ return EINVAL;
+ }
+
+ /* Only flow-miss and userspace-action upcalls are expected. */
+ type = (genl->cmd == OVS_PACKET_CMD_MISS ? DPIF_UC_MISS
+ : genl->cmd == OVS_PACKET_CMD_ACTION ? DPIF_UC_ACTION
+ : -1);
+ if (type < 0) {
+ return EINVAL;
+ }
+
+ memset(upcall, 0, sizeof *upcall);
+ upcall->type = type;
+ upcall->packet = buf;
+ upcall->packet->data = CONST_CAST(struct nlattr *,
+ nl_attr_get(a[OVS_PACKET_ATTR_PACKET]));
+ upcall->packet->size = nl_attr_get_size(a[OVS_PACKET_ATTR_PACKET]);
+ upcall->key = CONST_CAST(struct nlattr *,
+ nl_attr_get(a[OVS_PACKET_ATTR_KEY]));
+ upcall->key_len = nl_attr_get_size(a[OVS_PACKET_ATTR_KEY]);
+ /* USERDATA is optional (ACTION upcalls only); default to 0. */
+ upcall->userdata = (a[OVS_PACKET_ATTR_USERDATA]
+ ? nl_attr_get_u64(a[OVS_PACKET_ATTR_USERDATA])
+ : 0);
+ *dp_ifindex = ovs_header->dp_ifindex;
+
+ return 0;
+}
+
+/* dpif 'recv' callback: retrieves the next upcall into 'upcall'/'buf'.
+ * Returns 0 when an upcall was read, EAGAIN when nothing is ready (or the
+ * per-call read budget is exhausted), or another positive errno value on
+ * a hard error.  Never blocks: the epoll_wait timeout is 0 and the netlink
+ * reads are non-blocking. */
+static int
+dpif_linux_recv(struct dpif *dpif_, struct dpif_upcall *upcall,
+ struct ofpbuf *buf)
+{
+ struct dpif_linux *dpif = dpif_linux_cast(dpif_);
+ int read_tries = 0;
+
+ /* epoll_fd < 0 means upcall reception is disabled. */
+ if (dpif->epoll_fd < 0) {
+ return EAGAIN;
+ }
+
+ /* Refresh the set of ready channels only once the previous set has been
+ * drained, so channels are serviced fairly across calls. */
+ if (!dpif->ready_mask) {
+ struct epoll_event events[N_CHANNELS];
+ int retval;
+ int i;
+
+ do {
+ retval = epoll_wait(dpif->epoll_fd, events, N_CHANNELS, 0);
+ } while (retval < 0 && errno == EINTR);
+ if (retval < 0) {
+ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
+ VLOG_WARN_RL(&rl, "epoll_wait failed (%s)", strerror(errno));
+ }
+
+ /* data.u32 carries the channel index set at EPOLL_CTL_ADD time. */
+ for (i = 0; i < retval; i++) {
+ dpif->ready_mask |= 1u << events[i].data.u32;
+ }
+ }
+
+ /* Service ready channels lowest-index-first via ffs(). */
+ while (dpif->ready_mask) {
+ int indx = ffs(dpif->ready_mask) - 1;
+ struct dpif_channel *ch = &dpif->channels[indx];
+
+ dpif->ready_mask &= ~(1u << indx);
+
+ for (;;) {
+ int dp_ifindex;
+ int error;
+
+ /* Cap the work done per call so one busy channel cannot
+ * starve the caller's event loop. */
+ if (++read_tries > 50) {
+ return EAGAIN;
+ }
+
+ error = nl_sock_recv(ch->sock, buf, false);
+ if (error == ENOBUFS) {
+ /* ENOBUFS typically means that we've received so many
+ * packets that the buffer overflowed. Try again
+ * immediately because there's almost certainly a packet
+ * waiting for us. */
+ report_loss(dpif_, ch);
+ continue;
+ }
+
+ ch->last_poll = time_msec();
+ if (error) {
+ /* EAGAIN: this channel is drained; move to the next. */
+ if (error == EAGAIN) {
+ break;
+ }
+ return error;
+ }
+
+ /* Deliver only packets from our own datapath; packets whose
+ * dp_ifindex differs are silently skipped. */
+ error = parse_odp_packet(buf, upcall, &dp_ifindex);
+ if (!error && dp_ifindex == dpif->dp_ifindex) {
+ const struct nlattr *in_port;
+
+ /* Feed the input port into this channel's traffic
+ * sketch — see update_sketch() for the semantics. */
+ in_port = nl_attr_find__(upcall->key, upcall->key_len,
+ OVS_KEY_ATTR_IN_PORT);
+ if (in_port) {
+ update_sketch(ch, nl_attr_get_u32(in_port));
+ }
+ return 0;
+ }
+ if (error) {
+ return error;
+ }
+ }
+ }
+
+ return EAGAIN;
+}
+
+/* dpif 'recv_wait' callback: arranges for the caller's poll loop to wake
+ * when an upcall is available.  A no-op while reception is disabled
+ * (epoll_fd < 0). */
+static void
+dpif_linux_recv_wait(struct dpif *dpif_)
+{
+    struct dpif_linux *dpif = dpif_linux_cast(dpif_);
+
+    if (dpif->epoll_fd >= 0) {
+        poll_fd_wait(dpif->epoll_fd, POLLIN);
+    }
+}
+
+/* dpif 'recv_purge' callback: discards all queued upcalls by draining every
+ * channel's netlink socket.  A no-op while reception is disabled. */
+static void
+dpif_linux_recv_purge(struct dpif *dpif_)
+{
+    struct dpif_linux *dpif = dpif_linux_cast(dpif_);
+    size_t i;
+
+    if (dpif->epoll_fd < 0) {
+        return;
+    }
+
+    for (i = 0; i < N_CHANNELS; i++) {
+        nl_sock_drain(dpif->channels[i].sock);
+    }
+}
+
+const struct dpif_class dpif_linux_class = {
+ "system",
+ dpif_linux_enumerate,