datapath: Break up GSO packets before sending to userspace.
authorBen Pfaff <blp@nicira.com>
Thu, 30 Apr 2009 20:46:44 +0000 (13:46 -0700)
committerBen Pfaff <blp@nicira.com>
Thu, 30 Apr 2009 20:46:44 +0000 (13:46 -0700)
On a Xen host, over-MTU GSO packets from virtual machines can end up sent
down by the virtual switch to userspace. This happens, for example, if a
TCP flow has a long enough "pause" that the datapath flow times out.  When
this happens, the packet is not marked as GSO when secchan sends it back up
via dpif_execute(), and the packet is then discarded in dp_xmit_skb() as
too large.

This commit solves the problem by breaking GSO packets into MTU-size pieces
before passing them along to userspace.

Tested by running "netperf" between two VMs on different boxes and running
"dpctl dp-del-flows" on the appropriate datapath a few times in the middle
and seeing that the total bandwidth didn't change much.  Verified that
packets were actually being broken up by adding a printk call inside the
"if (skb_is_gso())" block.

Thanks to Justin and Keith for review.

Bug #1133.

datapath/datapath.c

index 6c8630b9e3ea62f183a9ec7f59e5e1ce98e39659..2be82df0b0d94e71f314f27e4d0ad3df4af19abc 100644 (file)
@@ -682,7 +682,7 @@ dp_output_control(struct datapath *dp, struct sk_buff *skb, int queue_no,
        queue = &dp->queues[queue_no];
        err = -ENOBUFS;
        if (skb_queue_len(queue) >= DP_MAX_QUEUE_LEN)
-               goto err;
+               goto err_kfree_skb;
 
        /* If a checksum-deferred packet is forwarded to the controller,
         * correct the pointers and checksum.  This happens on a regular basis
@@ -693,7 +693,7 @@ dp_output_control(struct datapath *dp, struct sk_buff *skb, int queue_no,
         */
        err = skb_checksum_setup(skb);
        if (err)
-               goto err;
+               goto err_kfree_skb;
 #ifndef CHECKSUM_HW
        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                WARN_ON_ONCE(1);
@@ -708,38 +708,72 @@ dp_output_control(struct datapath *dp, struct sk_buff *skb, int queue_no,
 #endif
                err = skb_checksum_help(skb);
                if (err)
-                       goto err;
+                       goto err_kfree_skb;
        }
 #else
        if (skb->ip_summed == CHECKSUM_HW) {
                err = skb_checksum_help(skb, 0);
                if (err)
-                       goto err;
+                       goto err_kfree_skb;
        }
 #endif
 
-       err = skb_cow(skb, sizeof *header);
-       if (err)
-               goto err;
-
-       header = (struct odp_msg*)__skb_push(skb, sizeof *header);
-       header->type = queue_no;
-       header->length = skb->len;
-       header->port = (skb->dev && skb->dev->br_port
-                       ? skb->dev->br_port->port_no
-                       : ODPP_LOCAL);
-       header->reserved = 0;
-       header->arg = arg;
-       skb_queue_tail(queue, skb);
+       /* Break apart GSO packets into their component pieces.  Otherwise
+        * userspace may try to stuff a 64kB packet into a 1500-byte MTU. */
+       if (skb_is_gso(skb)) {
+               struct sk_buff *nskb = skb_gso_segment(skb, 0);
+               if (nskb) {
+                       kfree_skb(skb);
+                       skb = nskb;
+                       if (unlikely(IS_ERR(skb))) {
+                               err = PTR_ERR(skb);
+                               goto err;
+                       }
+               } else {
+                       /* XXX This case might not be possible.  It's hard to
+                        * tell from the skb_gso_segment() code and comment. */
+               }
+       }
+
+       /* Append each packet to queue.  There will be only one packet unless
+        * we broke up a GSO packet above. */
+       do {
+               struct sk_buff *nskb = skb->next;
+               skb->next = NULL;
+
+               err = skb_cow(skb, sizeof *header);
+               if (err) {
+                       while (nskb) {
+                               kfree_skb(skb);
+                               skb = nskb;
+                               nskb = skb->next;
+                       }
+                       goto err_kfree_skb;
+               }
+
+               header = (struct odp_msg*)__skb_push(skb, sizeof *header);
+               header->type = queue_no;
+               header->length = skb->len;
+               header->port = (skb->dev && skb->dev->br_port
+                               ? skb->dev->br_port->port_no
+                               : ODPP_LOCAL);
+               header->reserved = 0;
+               header->arg = arg;
+               skb_queue_tail(queue, skb);
+
+               skb = nskb;
+       } while (skb);
+
        wake_up_interruptible(&dp->waitqueue);
        return 0;
 
+err_kfree_skb:
+       kfree_skb(skb);
 err:
        stats = percpu_ptr(dp->stats_percpu, get_cpu());
        stats->n_lost++;
        put_cpu();
 
-       kfree_skb(skb);
        return err;
 }