From c3729ee42dc25a8240cee6c0041b7db3e4070414 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@nicira.com>
Date: Fri, 27 May 2011 15:53:49 -0700
Subject: [PATCH] datapath: Further mirror checksum offloading state on old
 kernels.

Older kernels (those before 2.6.22) rely on implicit assumptions to
determine checksum offloading status.  These assumptions tend to break
down when doing switching because a switch sits in the middle of both
the transmit and receive paths.  Newer kernels deal with this problem
by adding more explicit information about how to checksum.  This patch
replicates that behavior by mirroring the state from newer kernels in
private OVS storage on the kernels that lack it.  On ingress and egress
we then map that state onto the appropriate location for the given
kernel and can consistently manipulate it within OVS.  Some of this was
already done for the checksum type, but this makes it more robust and
expands it to the checksum start and offset as well.

Signed-off-by: Jesse Gross <jesse@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>
---
 datapath/checksum.c           | 226 +++++++++++++++++++++++-----------
 datapath/checksum.h           |  94 +++++++++-----
 datapath/datapath.c           |   8 +-
 datapath/datapath.h           |   7 +-
 datapath/tunnel.c             |  33 +++--
 datapath/vport-internal_dev.c |   7 +-
 datapath/vport-netdev.c       |  15 +--
 7 files changed, 257 insertions(+), 133 deletions(-)

diff --git a/datapath/checksum.c b/datapath/checksum.c
index e3519491..3a131f4e 100644
--- a/datapath/checksum.c
+++ b/datapath/checksum.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2010 Nicira Networks.
+ * Copyright (c) 2010, 2011 Nicira Networks.
  * Distributed under the terms of the GNU GPL version 2.
  *
  * Significant portions of this file may be copied from parts of the Linux
@@ -16,7 +16,82 @@
 #include "checksum.h"
 #include "datapath.h"
 
-/* Types of checksums that we can receive (these all refer to L4 checksums):
+#ifdef NEED_CSUM_NORMALIZE
+
+#if defined(CONFIG_XEN) && defined(HAVE_PROTO_DATA_VALID)
+/* This code is based on skb_checksum_setup() from Xen's net/dev/core.c.  We
+ * can't call this function directly because it isn't exported in all
+ * versions. */
+static int vswitch_skb_checksum_setup(struct sk_buff *skb)
+{
+	struct iphdr *iph;
+	unsigned char *th;
+	int err = -EPROTO;
+	__u16 csum_start, csum_offset;
+
+	if (!skb->proto_csum_blank)
+		return 0;
+
+	if (skb->protocol != htons(ETH_P_IP))
+		goto out;
+
+	if (!pskb_may_pull(skb, skb_network_header(skb) + sizeof(struct iphdr) - skb->data))
+		goto out;
+
+	iph = ip_hdr(skb);
+	th = skb_network_header(skb) + 4 * iph->ihl;
+
+	csum_start = th - skb->head;
+	switch (iph->protocol) {
+	case IPPROTO_TCP:
+		csum_offset = offsetof(struct tcphdr, check);
+		break;
+	case IPPROTO_UDP:
+		csum_offset = offsetof(struct udphdr, check);
+		break;
+	default:
+		if (net_ratelimit())
+			pr_err("Attempting to checksum a non-TCP/UDP packet, "
+			       "dropping a protocol %d packet",
+			       iph->protocol);
+		goto out;
+	}
+
+	if (!pskb_may_pull(skb, th + csum_offset + 2 - skb->data))
+		goto out;
+
+	skb->proto_csum_blank = 0;
+	set_ip_summed(skb, OVS_CSUM_PARTIAL);
+	set_skb_csum_pointers(skb, csum_start, csum_offset);
+
+	err = 0;
+
+out:
+	return err;
+}
+#else
+static int vswitch_skb_checksum_setup(struct sk_buff *skb)
+{
+	return 0;
+}
+#endif /* not Xen old style checksums */
+
+/*
+ * compute_ip_summed - map external checksum state onto OVS representation
+ *
+ * @skb: Packet to manipulate.
+ * @xmit: Whether we were on the transmit path of the network stack.
+ *	For example, this is true for the internal dev vport because it
+ *	receives skbs that passed through dev_queue_xmit() but false for the
+ *	netdev vport because its packets come from netif_receive_skb().
+ *
+ * Older kernels (and various versions of Xen) were not explicit enough about
+ * checksum offload parameters and rely on a combination of context and
+ * non-standard fields.  This deals with all those variations so that we
+ * can internally manipulate checksum offloads without worrying about kernel
+ * version.
+ *
+ * Types of checksums that we can receive (these all refer to L4 checksums):
  * 1. CHECKSUM_NONE: Device that did not compute checksum, contains full
  *    (though not verified) checksum in packet but not in skb->csum.  Packets
  *    from the bridge local port will also have this type.
@@ -58,25 +133,24 @@
  * CHECKSUM_PARTIAL, it will be sent with the wrong checksum.  However, there
  * shouldn't be any devices that do this with bridging. */
-#ifdef NEED_CSUM_NORMALIZE
-void compute_ip_summed(struct sk_buff *skb, bool xmit)
+int compute_ip_summed(struct sk_buff *skb, bool xmit)
 {
 	/* For our convenience these defines change repeatedly between kernel
 	 * versions, so we can't just copy them over... */
 	switch (skb->ip_summed) {
 	case CHECKSUM_NONE:
-		OVS_CB(skb)->ip_summed = OVS_CSUM_NONE;
+		set_ip_summed(skb, OVS_CSUM_NONE);
 		break;
 	case CHECKSUM_UNNECESSARY:
-		OVS_CB(skb)->ip_summed = OVS_CSUM_UNNECESSARY;
+		set_ip_summed(skb, OVS_CSUM_UNNECESSARY);
 		break;
 #ifdef CHECKSUM_HW
 	/* In theory this could be either CHECKSUM_PARTIAL or CHECKSUM_COMPLETE.
 	 * However, on the receive side we should only get CHECKSUM_PARTIAL
 	 * packets from Xen, which uses some special fields to represent this
-	 * (see below).  Since we can only make one type work, pick the one
-	 * that actually happens in practice.
+	 * (see vswitch_skb_checksum_setup()).  Since we can only make one type
+	 * work, pick the one that actually happens in practice.
 	 *
 	 * On the transmit side (basically after skb_checksum_setup()
 	 * has been run or on internal dev transmit), packets with
@@ -84,87 +158,101 @@ void compute_ip_summed(struct sk_buff *skb, bool xmit)
 	 */
 	case CHECKSUM_HW:
 		if (!xmit)
-			OVS_CB(skb)->ip_summed = OVS_CSUM_COMPLETE;
+			set_ip_summed(skb, OVS_CSUM_COMPLETE);
 		else
-			OVS_CB(skb)->ip_summed = OVS_CSUM_PARTIAL;
-
+			set_ip_summed(skb, OVS_CSUM_PARTIAL);
 		break;
 #else
 	case CHECKSUM_COMPLETE:
-		OVS_CB(skb)->ip_summed = OVS_CSUM_COMPLETE;
+		set_ip_summed(skb, OVS_CSUM_COMPLETE);
 		break;
 	case CHECKSUM_PARTIAL:
-		OVS_CB(skb)->ip_summed = OVS_CSUM_PARTIAL;
+		set_ip_summed(skb, OVS_CSUM_PARTIAL);
 		break;
 #endif
 	}
 
-#if defined(CONFIG_XEN) && defined(HAVE_PROTO_DATA_VALID)
-	/* Xen has a special way of representing CHECKSUM_PARTIAL on older
-	 * kernels. It should not be set on the transmit path though.
-	 */
-	if (skb->proto_csum_blank)
-		OVS_CB(skb)->ip_summed = OVS_CSUM_PARTIAL;
+	OVS_CB(skb)->csum_start = skb_headroom(skb) + skb_transport_offset(skb);
 
-	WARN_ON_ONCE(skb->proto_csum_blank && xmit);
-#endif
+	return vswitch_skb_checksum_setup(skb);
 }
 
-u8 get_ip_summed(struct sk_buff *skb)
+/*
+ * forward_ip_summed - map internal checksum state back onto native kernel fields
+ *
+ * @skb: Packet to manipulate.
+ * @xmit: Whether we are about to send on the transmit path of the network
+ *	stack.  This follows the same logic as the @xmit field in
+ *	compute_ip_summed().  Generally, a given vport will have opposite
+ *	values for @xmit passed to these two functions.
+ *
+ * When a packet is about to egress from OVS, take our internal fields
+ * (including any modifications we have made) and recreate the correct
+ * representation for this kernel.  This may do things like change the
+ * transport header offset.
+ */
+void forward_ip_summed(struct sk_buff *skb, bool xmit)
 {
-	return OVS_CB(skb)->ip_summed;
-}
-#endif /* NEED_CSUM_NORMALIZE */
-
+	switch (get_ip_summed(skb)) {
+	case OVS_CSUM_NONE:
+		skb->ip_summed = CHECKSUM_NONE;
+		break;
+	case OVS_CSUM_UNNECESSARY:
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
 #if defined(CONFIG_XEN) && defined(HAVE_PROTO_DATA_VALID)
-/* This code is based on skb_checksum_setup() from Xen's net/dev/core.c.  We
- * can't call this function directly because it isn't exported in all
- * versions. */
-int vswitch_skb_checksum_setup(struct sk_buff *skb)
-{
-	struct iphdr *iph;
-	unsigned char *th;
-	int err = -EPROTO;
-	__u16 csum_start, csum_offset;
-
-	if (!skb->proto_csum_blank)
-		return 0;
-
-	if (skb->protocol != htons(ETH_P_IP))
-		goto out;
-
-	if (!pskb_may_pull(skb, skb_network_header(skb) + sizeof(struct iphdr) - skb->data))
-		goto out;
-
-	iph = ip_hdr(skb);
-	th = skb_network_header(skb) + 4 * iph->ihl;
-
-	csum_start = th - skb->head;
-	switch (iph->protocol) {
-	case IPPROTO_TCP:
-		csum_offset = offsetof(struct tcphdr, check);
+		skb->proto_data_valid = 1;
+#endif
 		break;
-	case IPPROTO_UDP:
-		csum_offset = offsetof(struct udphdr, check);
+#ifdef CHECKSUM_HW
+	case OVS_CSUM_COMPLETE:
+		if (!xmit)
+			skb->ip_summed = CHECKSUM_HW;
+		else
+			skb->ip_summed = CHECKSUM_NONE;
 		break;
-	default:
-		if (net_ratelimit())
-			pr_err("Attempting to checksum a non-TCP/UDP packet, "
-			       "dropping a protocol %d packet",
-			       iph->protocol);
-		goto out;
+	case OVS_CSUM_PARTIAL:
+		if (!xmit) {
+			skb->ip_summed = CHECKSUM_UNNECESSARY;
+#if defined(CONFIG_XEN) && defined(HAVE_PROTO_DATA_VALID)
+			skb->proto_csum_blank = 1;
+#endif
+		} else {
+			skb->ip_summed = CHECKSUM_HW;
+		}
+		break;
+#else
+	case OVS_CSUM_COMPLETE:
+		skb->ip_summed = CHECKSUM_COMPLETE;
+		break;
+	case OVS_CSUM_PARTIAL:
+		skb->ip_summed = CHECKSUM_PARTIAL;
+		break;
+#endif
 	}
 
-	if (!pskb_may_pull(skb, th + csum_offset + 2 - skb->data))
-		goto out;
+	if (get_ip_summed(skb) == OVS_CSUM_PARTIAL)
+		skb_set_transport_header(skb, OVS_CB(skb)->csum_start - skb_headroom(skb));
+}
 
-	skb->ip_summed = CHECKSUM_PARTIAL;
-	skb->proto_csum_blank = 0;
-	set_skb_csum_pointers(skb, csum_start, csum_offset);
+u8 get_ip_summed(struct sk_buff *skb)
+{
+	return OVS_CB(skb)->ip_summed;
+}
 
-	err = 0;
+void set_ip_summed(struct sk_buff *skb, u8 ip_summed)
+{
+	OVS_CB(skb)->ip_summed = ip_summed;
+}
 
-out:
-	return err;
+void get_skb_csum_pointers(const struct sk_buff *skb, u16 *csum_start,
+			   u16 *csum_offset)
+{
+	*csum_start = OVS_CB(skb)->csum_start;
+	*csum_offset = skb->csum;
 }
-#endif /* CONFIG_XEN && HAVE_PROTO_DATA_VALID */
+
+void set_skb_csum_pointers(struct sk_buff *skb, u16 csum_start, u16 csum_offset)
+{
+	OVS_CB(skb)->csum_start = csum_start;
+	skb->csum = csum_offset;
+}
+#endif /* NEED_CSUM_NORMALIZE */
diff --git a/datapath/checksum.h b/datapath/checksum.h
index abd46150..4f85104a 100644
--- a/datapath/checksum.h
+++ b/datapath/checksum.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2010 Nicira Networks.
+ * Copyright (c) 2010, 2011 Nicira Networks.
  * Distributed under the terms of the GNU GPL version 2.
  *
  * Significant portions of this file may be copied from parts of the Linux
@@ -28,66 +28,50 @@ enum csum_type {
 };
 
 #ifdef NEED_CSUM_NORMALIZE
-void compute_ip_summed(struct sk_buff *skb, bool xmit);
+int compute_ip_summed(struct sk_buff *skb, bool xmit);
+void forward_ip_summed(struct sk_buff *skb, bool xmit);
 u8 get_ip_summed(struct sk_buff *skb);
+void set_ip_summed(struct sk_buff *skb, u8 ip_summed);
+void get_skb_csum_pointers(const struct sk_buff *skb, u16 *csum_start,
+			   u16 *csum_offset);
+void set_skb_csum_pointers(struct sk_buff *skb, u16 csum_start, u16 csum_offset);
 #else
-static inline void compute_ip_summed(struct sk_buff *skb, bool xmit) { }
-static inline u8 get_ip_summed(struct sk_buff *skb)
+static inline int compute_ip_summed(struct sk_buff *skb, bool xmit)
 {
-	return skb->ip_summed;
+	return 0;
 }
-#endif
 
-/* This function closely resembles skb_forward_csum() used by the bridge.  It
- * is slightly different because we are only concerned with bridging and not
- * other types of forwarding and can get away with slightly more optimal
- * behavior.
- */
-static inline void forward_ip_summed(struct sk_buff *skb)
+static inline void forward_ip_summed(struct sk_buff *skb, bool xmit) { }
+
+static inline u8 get_ip_summed(struct sk_buff *skb)
 {
-#ifdef CHECKSUM_HW
-	if (get_ip_summed(skb) == OVS_CSUM_COMPLETE)
-		skb->ip_summed = CHECKSUM_NONE;
-#endif
+	return skb->ip_summed;
 }
 
-#if defined(CONFIG_XEN) && defined(HAVE_PROTO_DATA_VALID)
-int vswitch_skb_checksum_setup(struct sk_buff *skb);
-#else
-static inline int vswitch_skb_checksum_setup(struct sk_buff *skb)
+static inline void set_ip_summed(struct sk_buff *skb, u8 ip_summed)
 {
-	return 0;
+	skb->ip_summed = ip_summed;
 }
-#endif
 
 static inline void get_skb_csum_pointers(const struct sk_buff *skb,
					 u16 *csum_start, u16 *csum_offset)
 {
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22)
 	*csum_start = skb->csum_start;
 	*csum_offset = skb->csum_offset;
-#else
-	*csum_start = skb_headroom(skb) + skb_transport_offset(skb);
-	*csum_offset = skb->csum;
-#endif
 }
 
 static inline void set_skb_csum_pointers(struct sk_buff *skb, u16 csum_start,
					 u16 csum_offset)
 {
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22)
 	skb->csum_start = csum_start;
 	skb->csum_offset = csum_offset;
-#else
-	skb_set_transport_header(skb, csum_start - skb_headroom(skb));
-	skb->csum = csum_offset;
-#endif
 }
+#endif
 
-#if defined(NEED_CSUM_NORMALIZE) || LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25)
 /* This is really compatibility code that belongs in the compat directory.
  * However, it needs access to our normalized checksum values, so put it here.
  */
+#if defined(NEED_CSUM_NORMALIZE) || LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25)
 #define inet_proto_csum_replace4 rpl_inet_proto_csum_replace4
 static inline void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb,
					    __be32 from, __be32 to,
@@ -107,4 +91,48 @@ static inline void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb,
 }
 #endif
 
+#ifdef NEED_CSUM_NORMALIZE
+static inline void update_csum_start(struct sk_buff *skb, int delta)
+{
+	if (get_ip_summed(skb) == OVS_CSUM_PARTIAL) {
+		u16 csum_start, csum_offset;
+
+		get_skb_csum_pointers(skb, &csum_start, &csum_offset);
+		set_skb_csum_pointers(skb, csum_start + delta, csum_offset);
+	}
+}
+
+static inline int rpl_pskb_expand_head(struct sk_buff *skb, int nhead,
+				       int ntail, gfp_t gfp_mask)
+{
+	int err;
+	int old_headroom = skb_headroom(skb);
+
+	err = pskb_expand_head(skb, nhead, ntail, gfp_mask);
+	if (unlikely(err))
+		return err;
+
+	update_csum_start(skb, skb_headroom(skb) - old_headroom);
+
+	return 0;
+}
+#define pskb_expand_head rpl_pskb_expand_head
+
+static inline unsigned char *rpl__pskb_pull_tail(struct sk_buff *skb,
+						 int delta)
+{
+	unsigned char *ret;
+	int old_headroom = skb_headroom(skb);
+
+	ret = __pskb_pull_tail(skb, delta);
+	if (unlikely(!ret))
+		return ret;
+
+	update_csum_start(skb, skb_headroom(skb) - old_headroom);
+
+	return ret;
+}
+#define __pskb_pull_tail rpl__pskb_pull_tail
+#endif
+
 #endif /* checksum.h */
diff --git a/datapath/datapath.c b/datapath/datapath.c
index 534ab914..566aeed6 100644
--- a/datapath/datapath.c
+++ b/datapath/datapath.c
@@ -399,11 +399,7 @@ int dp_upcall(struct datapath *dp, struct sk_buff *skb, const struct dp_upcall_i
 
 	WARN_ON_ONCE(skb_shared(skb));
 
-	forward_ip_summed(skb);
-
-	err = vswitch_skb_checksum_setup(skb);
-	if (err)
-		goto err_kfree_skb;
+	forward_ip_summed(skb, true);
 
 	/* Break apart GSO packets into their component pieces.  Otherwise
 	 * userspace may try to stuff a 64kB packet into a 1500-byte MTU. */
@@ -424,8 +420,6 @@ int dp_upcall(struct datapath *dp, struct sk_buff *skb, const struct dp_upcall_i
 
 	return 0;
 
-err_kfree_skb:
-	kfree_skb(skb);
 err:
 	local_bh_disable();
 	stats = per_cpu_ptr(dp->stats_percpu, smp_processor_id());
diff --git a/datapath/datapath.h b/datapath/datapath.h
index a0649064..15a98989 100644
--- a/datapath/datapath.h
+++ b/datapath/datapath.h
@@ -102,9 +102,11 @@ struct datapath {
  * struct ovs_skb_cb - OVS data in skb CB
  * @vport: The datapath port on which the skb entered the switch.
  * @flow: The flow associated with this packet.  May be %NULL if no flow.
+ * @tun_id: ID of the tunnel that encapsulated this packet.  It is 0 if the
+ *	packet was not received on a tunnel.
  * @ip_summed: Consistently stores L4 checksumming status across different
  *	kernel versions.
- * @tun_id: ID of the tunnel that encapsulated this packet.  It is 0 if the
- *	packet was not received on a tunnel.
+ * @csum_start: Stores the offset from which to start checksumming independent
+ *	of the transport header on all kernel versions.
  * @vlan_tci: Provides a substitute for the skb->vlan_tci field on kernels
  *	before 2.6.27.
@@ -112,10 +114,11 @@ struct datapath {
 struct ovs_skb_cb {
 	struct vport		*vport;
 	struct sw_flow		*flow;
+	__be64			tun_id;
 #ifdef NEED_CSUM_NORMALIZE
 	enum csum_type		ip_summed;
+	u16			csum_start;
 #endif
-	__be64			tun_id;
 #ifdef NEED_VLAN_FIELD
 	u16			vlan_tci;
 #endif
diff --git a/datapath/tunnel.c b/datapath/tunnel.c
index 2bf61597..5c05c49e 100644
--- a/datapath/tunnel.c
+++ b/datapath/tunnel.c
@@ -450,9 +450,13 @@ void tnl_rcv(struct vport *vport, struct sk_buff *skb, u8 tos)
 	secpath_reset(skb);
 
 	ecn_decapsulate(skb, tos);
-	compute_ip_summed(skb, false);
 	vlan_set_tci(skb, 0);
 
+	if (unlikely(compute_ip_summed(skb, false))) {
+		kfree_skb(skb);
+		return;
+	}
+
 	vport_receive(vport, skb);
 }
 
@@ -718,7 +722,11 @@ bool tnl_frag_needed(struct vport *vport, const struct tnl_mutable_config *mutab
 	    (TNL_F_IN_KEY_MATCH | TNL_F_OUT_KEY_ACTION))
 		OVS_CB(nskb)->tun_id = flow_key;
 
-	compute_ip_summed(nskb, false);
+	if (unlikely(compute_ip_summed(nskb, false))) {
+		kfree_skb(nskb);
+		return false;
+	}
+
 	vport_receive(vport, nskb);
 
 	return true;
@@ -1053,12 +1061,6 @@ static struct sk_buff *handle_offloads(struct sk_buff *skb,
 	int min_headroom;
 	int err;
 
-	forward_ip_summed(skb);
-
-	err = vswitch_skb_checksum_setup(skb);
-	if (unlikely(err))
-		goto error_free;
-
 	min_headroom = LL_RESERVED_SPACE(rt_dst(rt).dev) + rt_dst(rt).header_len
			+ mutable->tunnel_hlen
			+ (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0);
@@ -1073,6 +1075,8 @@ static struct sk_buff *handle_offloads(struct sk_buff *skb,
 		goto error_free;
 	}
 
+	forward_ip_summed(skb, true);
+
 	if (skb_is_gso(skb)) {
 		struct sk_buff *nskb;
 
@@ -1084,7 +1088,7 @@ static struct sk_buff *handle_offloads(struct sk_buff *skb,
 		}
 
 		skb = nskb;
-	} else if (skb->ip_summed == CHECKSUM_PARTIAL) {
+	} else if (get_ip_summed(skb) == OVS_CSUM_PARTIAL) {
 		/* Pages aren't locked and could change at any time.
 		 * If this happens after we compute the checksum, the
 		 * checksum will be wrong.  We linearize now to avoid
@@ -1099,8 +1103,9 @@ static struct sk_buff *handle_offloads(struct sk_buff *skb,
 		err = skb_checksum_help(skb);
 		if (unlikely(err))
 			goto error_free;
-	} else if (skb->ip_summed == CHECKSUM_COMPLETE)
-		skb->ip_summed = CHECKSUM_NONE;
+	}
+
+	set_ip_summed(skb, OVS_CSUM_NONE);
 
 	return skb;
 
@@ -1295,8 +1300,12 @@ int tnl_send(struct vport *vport, struct sk_buff *skb)
 			ip_send_check(iph);
 
 			if (cache_vport) {
+				if (unlikely(compute_ip_summed(skb, true))) {
+					kfree_skb(skb);
+					goto next;
+				}
+
 				OVS_CB(skb)->flow = cache->flow;
-				compute_ip_summed(skb, true);
 				vport_receive(cache_vport, skb);
 				sent_len += orig_len;
 			} else {
diff --git a/datapath/vport-internal_dev.c b/datapath/vport-internal_dev.c
index fff4f4e5..b7bcbce6 100644
--- a/datapath/vport-internal_dev.c
+++ b/datapath/vport-internal_dev.c
@@ -73,7 +73,11 @@ static int internal_dev_mac_addr(struct net_device *dev, void *p)
 
 /* Called with rcu_read_lock and bottom-halves disabled. */
 static int internal_dev_xmit(struct sk_buff *skb, struct net_device *netdev)
 {
-	compute_ip_summed(skb, true);
+	if (unlikely(compute_ip_summed(skb, true))) {
+		kfree_skb(skb);
+		return 0;
+	}
+
 	vlan_copy_skb_tci(skb);
 
 	OVS_CB(skb)->flow = NULL;
@@ -252,6 +256,7 @@ static int internal_dev_recv(struct vport *vport, struct sk_buff *skb)
 	skb->dev = netdev;
 	skb->pkt_type = PACKET_HOST;
 	skb->protocol = eth_type_trans(skb, netdev);
+	forward_ip_summed(skb, false);
 
 	if (in_interrupt())
 		netif_rx(skb);
diff --git a/datapath/vport-netdev.c b/datapath/vport-netdev.c
index 3bab666c..bc3108b9 100644
--- a/datapath/vport-netdev.c
+++ b/datapath/vport-netdev.c
@@ -284,7 +284,11 @@ static void netdev_port_receive(struct vport *vport, struct sk_buff *skb)
 	skb_warn_if_lro(skb);
 
 	skb_push(skb, ETH_HLEN);
-	compute_ip_summed(skb, false);
+
+	if (unlikely(compute_ip_summed(skb, false))) {
+		kfree_skb(skb);
+		return;
+	}
 
 	vlan_copy_skb_tci(skb);
 
 	vport_receive(vport, skb);
@@ -309,22 +313,15 @@ static int netdev_send(struct vport *vport, struct sk_buff *skb)
 	int len;
 
 	skb->dev = netdev_vport->dev;
-	forward_ip_summed(skb);
+	forward_ip_summed(skb, true);
 
 	if (vlan_tx_tag_present(skb) && !dev_supports_vlan_tx(skb->dev)) {
-		int err;
 		int features = 0;
 
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,26)
 		features = skb->dev->features & skb->dev->vlan_features;
 #endif
 
-		err = vswitch_skb_checksum_setup(skb);
-		if (unlikely(err)) {
-			kfree_skb(skb);
-			return 0;
-		}
-
 		if (!vlan_tso)
 			features &= ~(NETIF_F_TSO | NETIF_F_TSO6 |
				      NETIF_F_UFO | NETIF_F_FSO);
-- 
2.30.2
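
A note for readers tracing the control flow: compute_ip_summed() and
forward_ip_summed() now form a symmetric pair around the switch.  The former
folds the kernel's (possibly implicit) checksum state into OVS_CB(skb) on
ingress; the latter recreates the kernel's native representation on egress.
The subtle piece is csum_start, which is mirrored as an offset from skb->head,
so any operation that changes the headroom must shift the stored value by the
same delta (that is what the wrapped pskb_expand_head() above does).  The
standalone userspace sketch below models only that invariant; it is
illustrative, not OVS code, and struct packet, capture_csum_state(), and
expand_head() are hypothetical stand-ins for struct sk_buff and the real
helpers.

/* Userspace model of the csum_start bookkeeping in this patch.
 * mirror_csum_start plays the role of OVS_CB(skb)->csum_start: an
 * offset measured from the start of the buffer (skb->head). */
#include <assert.h>
#include <stdio.h>

struct packet {
	int headroom;          /* bytes between buffer start and data */
	int transport_offset;  /* transport header offset within data */
	int mirror_csum_start; /* mirrored csum start, from buffer start */
};

/* Analogue of compute_ip_summed(): record where checksumming starts,
 * relative to the buffer start, as headroom + transport offset. */
static void capture_csum_state(struct packet *p)
{
	p->mirror_csum_start = p->headroom + p->transport_offset;
}

/* Analogue of the wrapped pskb_expand_head(): growing the head changes
 * the headroom, so the buffer-relative offset must shift by the same
 * delta or it would point into the newly added slack. */
static void expand_head(struct packet *p, int nhead)
{
	int old_headroom = p->headroom;

	p->headroom += nhead;
	p->mirror_csum_start += p->headroom - old_headroom;
}

int main(void)
{
	struct packet p = { .headroom = 16, .transport_offset = 34 };

	capture_csum_state(&p);
	expand_head(&p, 32); /* e.g. making room for a tunnel header */

	/* Analogue of forward_ip_summed(): the transport offset is
	 * recovered by subtracting the *current* headroom. */
	assert(p.mirror_csum_start - p.headroom == p.transport_offset);
	printf("checksum starts %d bytes into the packet data\n",
	       p.mirror_csum_start - p.headroom);
	return 0;
}

Built with a plain cc, the assertion holds before and after the head
expansion, which is exactly the property update_csum_start() preserves
inside the datapath.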