From 5214f5c4e95b843b47b047139764d6f4af524785 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Mon, 16 Aug 2010 10:32:41 -0400 Subject: [PATCH] datapath: Add support for tunnel fragmentation. Up until now it was assumed that encapsulated packets larger than the MTU would be fragmented by the IP stack. However, some tunneling protocols provide their own fragmentation mechanism. This adds the necessary support to the generic tunnel code to support fragmentation. Signed-off-by: Jesse Gross --- datapath/tunnel.c | 56 ++++++++++++++++++++++++++++++++------------ datapath/tunnel.h | 27 +++++++++++++++++---- datapath/vport-gre.c | 15 +++++++++--- 3 files changed, 76 insertions(+), 22 deletions(-) diff --git a/datapath/tunnel.c b/datapath/tunnel.c index 3f25c9b4..6fa369be 100644 --- a/datapath/tunnel.c +++ b/datapath/tunnel.c @@ -680,29 +680,48 @@ static int build_packet(struct vport *vport, const struct tnl_mutable_config *mu new_iph->frag_off = frag_off; ip_select_ident(new_iph, &rt_dst(rt), NULL); - tnl_vport->tnl_ops->build_header(skb, vport, mutable); + memset(&IPCB(skb)->opt, 0, sizeof(IPCB(skb)->opt)); + IPCB(skb)->flags = 0; - /* Allow our local IP stack to fragment the outer packet even if the - * DF bit is set as a last resort. */ - skb->local_df = 1; + skb = tnl_vport->tnl_ops->build_header(skb, vport, mutable, &rt_dst(rt)); + if (unlikely(!skb)) + goto error; - memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - IPCB(skb)->flags = 0; + while (skb) { + struct sk_buff *next = skb->next; + int frag_len = skb->len - mutable->tunnel_hlen; - err = ip_local_out(skb); - if (likely(net_xmit_eval(err) == 0)) - return orig_len; - else { - vport_record_error(vport, VPORT_E_TX_ERROR); - return 0; - } + skb->next = NULL; + + err = ip_local_out(skb); + if (unlikely(net_xmit_eval(err) != 0)) { + orig_len -= frag_len; + skb = next; + goto free_frags; + } + + skb = next; + }; + + return orig_len; error_free: kfree_skb(skb); error: - vport_record_error(vport, VPORT_E_TX_DROPPED); - return 0; +free_frags: + /* + * There's no point in continuing to send fragments once one has been + * dropped so just free the rest. This may help improve the congestion + * that caused the first packet to be dropped. + */ + while (skb) { + struct sk_buff *next = skb->next; + orig_len -= skb->len - mutable->tunnel_hlen; + kfree_skb(skb); + skb = next; + }; + return orig_len; } int tnl_send(struct vport *vport, struct sk_buff *skb) @@ -847,6 +866,9 @@ int tnl_send(struct vport *vport, struct sk_buff *skb) skb = next_skb; } while (skb); + if (unlikely(orig_len == 0)) + vport_record_error(vport, VPORT_E_TX_DROPPED); + return orig_len; error_free: @@ -914,6 +936,7 @@ struct vport *tnl_create(const char *name, const void __user *config, { struct vport *vport; struct tnl_vport *tnl_vport; + int initial_frag_id; int err; vport = vport_alloc(sizeof(struct tnl_vport), vport_ops); @@ -936,6 +959,9 @@ struct vport *tnl_create(const char *name, const void __user *config, vport_gen_rand_ether_addr(tnl_vport->mutable->eth_addr); tnl_vport->mutable->mtu = ETH_DATA_LEN; + get_random_bytes(&initial_frag_id, sizeof(int)); + atomic_set(&tnl_vport->frag_id, initial_frag_id); + err = set_config(config, tnl_ops, NULL, tnl_vport->mutable); if (err) goto error_free_mutable; diff --git a/datapath/tunnel.h b/datapath/tunnel.h index 89e73bac..92963d76 100644 --- a/datapath/tunnel.h +++ b/datapath/tunnel.h @@ -13,8 +13,10 @@ #include "table.h" #include "vport.h" -/* The absolute minimum fragment size. Note that there are many other - * definitions of the minimum MTU. */ +/* + * The absolute minimum fragment size. Note that there are many other + * definitions of the minimum MTU. + */ #define IP_MIN_MTU 68 /* @@ -47,9 +49,24 @@ struct tnl_ops { u32 tunnel_type; u8 ipproto; + /* + * Returns the length of the tunnel header you will add in + * build_header() (i.e. excludes the IP header). Returns a negative + * error code if the configuration is invalid. + */ int (*hdr_len)(const struct tnl_port_config *); - void (*build_header)(struct sk_buff *, const struct vport *, - const struct tnl_mutable_config *); + + /* + * Returns a linked list of SKBs with tunnel headers (multiple + * packets may be generated in the event of fragmentation). Space + * will have already been allocated at the start of the packet equal + * to sizeof(struct iphdr) + value returned by hdr_len(). The IP + * header will have already been constructed. + */ + struct sk_buff *(*build_header)(struct sk_buff *, + const struct vport *, + const struct tnl_mutable_config *, + struct dst_entry *); }; struct tnl_vport { @@ -61,6 +78,8 @@ struct tnl_vport { /* Protected by RCU. */ struct tnl_mutable_config *mutable; + + atomic_t frag_id; }; int tnl_init(void); diff --git a/datapath/vport-gre.c b/datapath/vport-gre.c index 31d2d4f6..223644e2 100644 --- a/datapath/vport-gre.c +++ b/datapath/vport-gre.c @@ -48,9 +48,10 @@ static int gre_hdr_len(const struct tnl_port_config *port_config) return len; } -static void gre_build_header(struct sk_buff *skb, - const struct vport *vport, - const struct tnl_mutable_config *mutable) +static struct sk_buff *gre_build_header(struct sk_buff *skb, + const struct vport *vport, + const struct tnl_mutable_config *mutable, + struct dst_entry *dst) { struct gre_base_hdr *greh = (struct gre_base_hdr *)skb_transport_header(skb); __be32 *options = (__be32 *)(skb_network_header(skb) + mutable->tunnel_hlen @@ -81,6 +82,14 @@ static void gre_build_header(struct sk_buff *skb, skb->len - sizeof(struct iphdr), 0)); } + + /* + * Allow our local IP stack to fragment the outer packet even if the + * DF bit is set as a last resort. + */ + skb->local_df = 1; + + return skb; } static int parse_header(struct iphdr *iph, __be16 *flags, __be32 *key) -- 2.30.2