From: Ben Pfaff Date: Tue, 27 Apr 2010 16:40:46 +0000 (-0700) Subject: ofproto: Avoid buffer copy in OFPT_PACKET_IN path. X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=43253595291318833572088595769b45a79c9c54;p=openvswitch ofproto: Avoid buffer copy in OFPT_PACKET_IN path. When a dpif passes an odp_msg down to ofproto, and ofproto transforms it into an ofp_packet_in to send to the controller, until now this always involved a full copy of the packet inside ofproto. This commit eliminates this copy by ensuring that there is always enough headroom in the ofpbuf that holds the odp_msg to replace it by an ofp_packet_in in-place. From Jean Tourrilhes , with some revisions. --- diff --git a/lib/dpif-linux.c b/lib/dpif-linux.c index 87c9f327..15283b28 100644 --- a/lib/dpif-linux.c +++ b/lib/dpif-linux.c @@ -460,7 +460,8 @@ dpif_linux_recv(struct dpif *dpif_, struct ofpbuf **bufp) int retval; int error; - buf = ofpbuf_new(65536); + buf = ofpbuf_new(65536 + DPIF_RECV_MSG_PADDING); + ofpbuf_reserve(buf, DPIF_RECV_MSG_PADDING); retval = read(dpif->fd, ofpbuf_tail(buf), ofpbuf_tailroom(buf)); if (retval < 0) { error = errno; diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index 7d31a69b..2f4463e6 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -1280,7 +1280,8 @@ dp_netdev_output_control(struct dp_netdev *dp, const struct ofpbuf *packet, } msg_size = sizeof *header + packet->size; - msg = ofpbuf_new(msg_size); + msg = ofpbuf_new(msg_size + DPIF_RECV_MSG_PADDING); + ofpbuf_reserve(msg, DPIF_RECV_MSG_PADDING); header = ofpbuf_put_uninit(msg, sizeof *header); header->type = queue_no; header->length = msg_size; diff --git a/lib/dpif-provider.h b/lib/dpif-provider.h index fddc8ea3..33663ff4 100644 --- a/lib/dpif-provider.h +++ b/lib/dpif-provider.h @@ -21,7 +21,9 @@ * datapath. */ #include +#include "openflow/openflow.h" #include "dpif.h" +#include "util.h" #ifdef __cplusplus extern "C" { @@ -300,8 +302,10 @@ struct dpif_class { /* Attempts to receive a message from 'dpif'. If successful, stores the * message into '*packetp'. The message, if one is received, must begin - * with 'struct odp_msg' as a header. Only messages of the types selected - * with the set_listen_mask member function should be received. + * with 'struct odp_msg' as a header, and must have at least + * DPIF_RECV_MSG_PADDING bytes of headroom (allocated using + * e.g. ofpbuf_reserve()). Only messages of the types selected with the + * set_listen_mask member function should be received. * * This function must not block. If no message is ready to be received * when it is called, it should return EAGAIN without blocking. */ @@ -312,6 +316,14 @@ struct dpif_class { void (*recv_wait)(struct dpif *dpif); }; +/* Minimum number of bytes of headroom for a packet returned by the 'recv' + * member function (see above). This headroom allows "struct odp_msg" to be + * replaced by "struct ofp_packet_in" without copying the buffer. */ +#define DPIF_RECV_MSG_PADDING (sizeof(struct ofp_packet_in) \ + - sizeof(struct odp_msg)) +BUILD_ASSERT_DECL(sizeof(struct ofp_packet_in) > sizeof(struct odp_msg)); +BUILD_ASSERT_DECL(DPIF_RECV_MSG_PADDING % 4 == 0); + extern const struct dpif_class dpif_linux_class; extern const struct dpif_class dpif_netdev_class; diff --git a/lib/dpif.c b/lib/dpif.c index 186f165c..097b38d8 100644 --- a/lib/dpif.c +++ b/lib/dpif.c @@ -1015,7 +1015,8 @@ dpif_set_sflow_probability(struct dpif *dpif, uint32_t probability) /* Attempts to receive a message from 'dpif'. If successful, stores the * message into '*packetp'. The message, if one is received, will begin with - * 'struct odp_msg' as a header. Only messages of the types selected with + * 'struct odp_msg' as a header, and will have at least DPIF_RECV_MSG_PADDING + * bytes of headroom. Only messages of the types selected with * dpif_set_listen_mask() will ordinarily be received (but if a message type is * enabled and then later disabled, some stragglers might pop up). * @@ -1026,8 +1027,10 @@ dpif_recv(struct dpif *dpif, struct ofpbuf **packetp) { int error = dpif->dpif_class->recv(dpif, packetp); if (!error) { + struct ofpbuf *buf = *packetp; + + assert(ofpbuf_headroom(buf) >= DPIF_RECV_MSG_PADDING); if (VLOG_IS_DBG_ENABLED()) { - struct ofpbuf *buf = *packetp; struct odp_msg *msg = buf->data; void *payload = msg + 1; size_t payload_len = buf->size - sizeof *msg; diff --git a/ofproto/ofproto.c b/ofproto/ofproto.c index 6c93aea0..2763efbd 100644 --- a/ofproto/ofproto.c +++ b/ofproto/ofproto.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2009, 2010 Nicira Networks. + * Copyright (c) 2010 Jean Tourrilhes - HP-Labs. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -3993,65 +3994,127 @@ update_used(struct ofproto *p) free(flows); } +/* pinsched callback for sending 'packet' on 'ofconn'. */ static void do_send_packet_in(struct ofpbuf *packet, void *ofconn_) { struct ofconn *ofconn = ofconn_; + + rconn_send_with_limit(ofconn->rconn, packet, + ofconn->packet_in_counter, 100); +} + +/* Takes 'packet', which has been converted with do_convert_to_packet_in(), and + * finalizes its content for sending on 'ofconn', and passes it to 'ofconn''s + * packet scheduler for sending. + * + * If 'clone' is true, the caller retains ownership of 'packet'. Otherwise, + * ownership is transferred to this function. */ +static void +schedule_packet_in(struct ofconn *ofconn, struct ofpbuf *packet, bool clone) +{ struct ofproto *ofproto = ofconn->ofproto; - struct odp_msg *msg = packet->data; - struct ofpbuf payload; - struct ofpbuf *opi; + struct ofp_packet_in *opi = packet->data; + uint16_t in_port = ofp_port_to_odp_port(ntohs(opi->in_port)); + int send_len, trim_size; uint32_t buffer_id; - int send_len; - /* Extract packet payload from 'msg'. */ - payload.data = msg + 1; - payload.size = msg->length - sizeof *msg; - - /* Construct packet-in message. */ - send_len = INT_MAX; - if (msg->type == _ODPL_ACTION_NR) { + /* Get buffer. */ + if (opi->reason == OFPR_ACTION) { buffer_id = UINT32_MAX; + } else if (ofproto->fail_open && fail_open_is_active(ofproto->fail_open)) { + buffer_id = pktbuf_get_null(); } else { - if (ofproto->fail_open && fail_open_is_active(ofproto->fail_open)) { - buffer_id = pktbuf_get_null(); - } else { - buffer_id = pktbuf_save(ofconn->pktbuf, &payload, msg->port); - } - if (buffer_id != UINT32_MAX) { - send_len = ofconn->miss_send_len; - } + struct ofpbuf payload; + payload.data = opi->data; + payload.size = packet->size - offsetof(struct ofp_packet_in, data); + buffer_id = pktbuf_save(ofconn->pktbuf, &payload, in_port); } - opi = make_packet_in(buffer_id, odp_port_to_ofp_port(msg->port), - msg->type, &payload, send_len); - /* Send. */ - rconn_send_with_limit(ofconn->rconn, opi, ofconn->packet_in_counter, 100); + /* Figure out how much of the packet to send. */ + send_len = ntohs(opi->total_len); + if (buffer_id != UINT32_MAX) { + send_len = MIN(send_len, ofconn->miss_send_len); + } - ofpbuf_delete(packet); + /* Adjust packet length and clone if necessary. */ + trim_size = offsetof(struct ofp_packet_in, data) + send_len; + if (clone) { + packet = ofpbuf_clone_data(packet->data, trim_size); + opi = packet->data; + } else { + packet->size = trim_size; + } + + /* Update packet headers. */ + opi->buffer_id = htonl(buffer_id); + update_openflow_length(packet); + + /* Hand over to packet scheduler. It might immediately call into + * do_send_packet_in() or it might buffer it for a while (until a later + * call to pinsched_run()). */ + pinsched_send(ofconn->schedulers[opi->reason], in_port, + packet, do_send_packet_in, ofconn); } +/* Replace struct odp_msg header in 'packet' by equivalent struct + * ofp_packet_in. The odp_msg must have sufficient headroom to do so (e.g. as + * returned by dpif_recv()). + * + * The conversion is not complete: the caller still needs to trim any unneeded + * payload off the end of the buffer, set the length in the OpenFlow header, + * and set buffer_id. Those require us to know the controller settings and so + * must be done on a per-controller basis. */ static void -send_packet_in(struct ofproto *ofproto, struct ofpbuf *packet) +do_convert_to_packet_in(struct ofpbuf *packet) { struct odp_msg *msg = packet->data; + struct ofp_packet_in *opi; + uint8_t reason; + uint16_t total_len; + uint16_t in_port; + + /* Extract relevant header fields */ + reason = (msg->type == _ODPL_ACTION_NR ? OFPR_ACTION : OFPR_NO_MATCH); + total_len = msg->length - sizeof *msg; + in_port = odp_port_to_ofp_port(msg->port); + + /* Repurpose packet buffer by overwriting header. */ + ofpbuf_pull(packet, sizeof(struct odp_msg)); + opi = ofpbuf_push_zeros(packet, offsetof(struct ofp_packet_in, data)); + opi->header.version = OFP_VERSION; + opi->header.type = OFPT_PACKET_IN; + opi->total_len = htons(total_len); + opi->in_port = htons(in_port); + opi->reason = reason; +} + +/* Given 'packet' containing an odp_msg of type _ODPL_ACTION_NR or + * _ODPL_MISS_NR, sends an OFPT_PACKET_IN message to each OpenFlow controller + * as necessary according to their individual configurations. + * + * 'packet' must have sufficient headroom to convert it into a struct + * ofp_packet_in (e.g. as returned by dpif_recv()). + * + * Takes ownership of 'packet'. */ +static void +send_packet_in(struct ofproto *ofproto, struct ofpbuf *packet) +{ struct ofconn *ofconn, *prev; - assert(msg->type == _ODPL_MISS_NR || msg->type == _ODPL_ACTION_NR); + do_convert_to_packet_in(packet); prev = NULL; LIST_FOR_EACH (ofconn, struct ofconn, node, &ofproto->all_conns) { if (ofconn->role != NX_ROLE_SLAVE) { if (prev) { - pinsched_send(prev->schedulers[msg->type], msg->port, - ofpbuf_clone(packet), do_send_packet_in, prev); + schedule_packet_in(prev, packet, true); } prev = ofconn; } } if (prev) { - pinsched_send(prev->schedulers[msg->type], msg->port, - packet, do_send_packet_in, prev); + schedule_packet_in(prev, packet, false); } else { ofpbuf_delete(packet); }