From f4267e344a9373a4efefff8f8f5b85f532d223e1 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Thu, 22 Apr 2010 08:11:50 -0400 Subject: [PATCH] datapath: Enable offloading on internal devices. Enables checksum offloading, scatter/gather, and TSO on internal devices. While these optimizations were not previously enabled on internal ports we already could receive these types of packets from Xen guests. This has the obvious performance benefits when these packets can be passed directly to hardware. There is also a more subtle benefit for GRE on Xen. GRE packets pass through OVS twice - once before encapsulation and once after encapsulation, moving through an internal device in the process. If it is a SG packet (as is common on Xen), a copy was necessary to linearize for the internal device. However, Xen uses the memory allocator to track packets so when the original packet is freed after the copy netback notifies the guest that the packet has been sent, despite the fact that it is actually sitting in the transmit queue. The guest then sends packets as fast as the CPU can handle, overflowing the transmit queue. By enabling SG on the internal device, we avoid the copy and keep the accounting correct. In certain circumstances this patch can decrease performance for TCP. TCP has its own mechanism for tracking in-flight packets and therefore does not benefit from the corrected socket accounting. However, certain NICs do not like SG when it is not being used for TSO (these packets can no longer be handled by TSO after GRE encapsulation). These NICs presumably enable SG even though they can't handle it well because TSO requires SG. Tested controllers (all 1G): Marvell 88E8053 (large performance hit) Broadcom BCM5721 (small performance hit) Intel 82571EB (no change) --- datapath/datapath.c | 33 ++++++++++++++------------------- datapath/vport-gre.c | 3 +++ datapath/vport-internal_dev.c | 18 ++++++++++++------ datapath/vport-netdev.c | 1 + datapath/vport.c | 4 +++- 5 files changed, 33 insertions(+), 26 deletions(-) diff --git a/datapath/datapath.c b/datapath/datapath.c index 71a44e6f..79a1f632 100644 --- a/datapath/datapath.c +++ b/datapath/datapath.c @@ -522,7 +522,6 @@ void dp_process_received_packet(struct dp_port *p, struct sk_buff *skb) skb_warn_if_lro(skb); OVS_CB(skb)->dp_port = p; - compute_ip_summed(skb, false); /* BHs are off so we don't have to use get_cpu()/put_cpu() here. */ stats = percpu_ptr(dp->stats_percpu, smp_processor_id()); @@ -625,8 +624,7 @@ out: * be computed if it is sent off box. Unfortunately on earlier kernels, * this case is impossible to distinguish from #2, despite having opposite * meanings. Xen adds an extra field on earlier kernels (see #4) in order - * to distinguish the different states. The only real user of this type - * with bridging is Xen (on later kernels). + * to distinguish the different states. * 4. CHECKSUM_UNNECESSARY (with proto_csum_blank true): This packet was * generated locally by a Xen DomU and has a partial checksum. If it is * handled on this machine (Dom0 or DomU), then the checksum will not be @@ -650,12 +648,7 @@ out: * packet is processed by the local IP stack, in which case it will need to * be reverified). If we receive a packet with CHECKSUM_HW that really means * CHECKSUM_PARTIAL, it will be sent with the wrong checksum. However, there - * shouldn't be any devices that do this with bridging. - * - * The bridge has similar behavior and this function closely resembles - * skb_forward_csum(). It is slightly different because we are only concerned - * with bridging and not other types of forwarding and can get away with - * slightly more optimal behavior.*/ + * shouldn't be any devices that do this with bridging. */ void compute_ip_summed(struct sk_buff *skb, bool xmit) { @@ -670,14 +663,14 @@ compute_ip_summed(struct sk_buff *skb, bool xmit) break; #ifdef CHECKSUM_HW /* In theory this could be either CHECKSUM_PARTIAL or CHECKSUM_COMPLETE. - * However, we should only get CHECKSUM_PARTIAL packets from Xen, which - * uses some special fields to represent this (see below). Since we - * can only make one type work, pick the one that actually happens in - * practice. + * However, on the receive side we should only get CHECKSUM_PARTIAL + * packets from Xen, which uses some special fields to represent this + * (see below). Since we can only make one type work, pick the one + * that actually happens in practice. * - * The one exception to this is if we are on the transmit path - * (basically after skb_checksum_setup() has been run) the type has - * already been converted, so we should stay with that. */ + * On the transmit side (basically after skb_checksum_setup() + * has been run or on internal dev transmit), packets with + * CHECKSUM_COMPLETE aren't generated, so assume CHECKSUM_PARTIAL. */ case CHECKSUM_HW: if (!xmit) OVS_CB(skb)->ip_summed = OVS_CSUM_COMPLETE; @@ -710,6 +703,10 @@ compute_ip_summed(struct sk_buff *skb, bool xmit) #endif } +/* This function closely resembles skb_forward_csum() used by the bridge. It + * is slightly different because we are only concerned with bridging and not + * other types of forwarding and can get away with slightly more optimal + * behavior.*/ void forward_ip_summed(struct sk_buff *skb) { @@ -741,9 +738,7 @@ queue_control_packets(struct sk_buff *skb, struct sk_buff_head *queue, skb->next = NULL; /* If a checksum-deferred packet is forwarded to the - * controller, correct the pointers and checksum. This happens - * on a regular basis only on Xen, on which VMs can pass up - * packets that do not have their checksum computed. + * controller, correct the pointers and checksum. */ err = vswitch_skb_checksum_setup(skb); if (err) diff --git a/datapath/vport-gre.c b/datapath/vport-gre.c index fdf526e2..b7afd8f7 100644 --- a/datapath/vport-gre.c +++ b/datapath/vport-gre.c @@ -534,6 +534,7 @@ send_frag_needed(struct vport *vport, const struct mutable_config *mutable, mutable->port_config.flags & GRE_F_OUT_KEY_ACTION) OVS_CB(nskb)->tun_id = flow_key; + compute_ip_summed(nskb, false); vport_receive(vport, nskb); return true; @@ -902,6 +903,8 @@ gre_rcv(struct sk_buff *skb) OVS_CB(skb)->tun_id = 0; skb_push(skb, ETH_HLEN); + compute_ip_summed(skb, false); + vport_receive(vport, skb); return 0; diff --git a/datapath/vport-internal_dev.c b/datapath/vport-internal_dev.c index 88b6cc19..d23b4c3e 100644 --- a/datapath/vport-internal_dev.c +++ b/datapath/vport-internal_dev.c @@ -99,6 +99,8 @@ static int internal_dev_xmit(struct sk_buff *skb, struct net_device *netdev) lb_stats->tx_bytes += skb->len; skb_reset_mac_header(skb); + compute_ip_summed(skb, true); + rcu_read_lock_bh(); vport_receive(vport, skb); rcu_read_unlock_bh(); @@ -129,11 +131,14 @@ static void internal_dev_getinfo(struct net_device *netdev, } static struct ethtool_ops internal_dev_ethtool_ops = { - .get_drvinfo = internal_dev_getinfo, - .get_link = ethtool_op_get_link, - .get_sg = ethtool_op_get_sg, - .get_tx_csum = ethtool_op_get_tx_csum, - .get_tso = ethtool_op_get_tso, + .get_drvinfo = internal_dev_getinfo, + .get_link = ethtool_op_get_link, + .get_sg = ethtool_op_get_sg, + .set_sg = ethtool_op_set_sg, + .get_tx_csum = ethtool_op_get_tx_csum, + .set_tx_csum = ethtool_op_set_tx_hw_csum, + .get_tso = ethtool_op_get_tso, + .set_tso = ethtool_op_set_tso, }; static int internal_dev_change_mtu(struct net_device *netdev, int new_mtu) @@ -230,7 +235,8 @@ do_setup(struct net_device *netdev) netdev->tx_queue_len = 0; netdev->flags = IFF_BROADCAST | IFF_MULTICAST; - netdev->features = NETIF_F_LLTX; /* XXX other features? */ + netdev->features = NETIF_F_LLTX | NETIF_F_SG | NETIF_F_HIGHDMA + | NETIF_F_HW_CSUM | NETIF_F_TSO; vport_gen_ether_addr(netdev->dev_addr); } diff --git a/datapath/vport-netdev.c b/datapath/vport-netdev.c index 980df01f..8cc44210 100644 --- a/datapath/vport-netdev.c +++ b/datapath/vport-netdev.c @@ -331,6 +331,7 @@ netdev_port_receive(struct net_bridge_port *p, struct sk_buff *skb) /* Push the Ethernet header back on. */ skb_push(skb, ETH_HLEN); skb_reset_mac_header(skb); + compute_ip_summed(skb, false); vport_receive(vport, skb); } diff --git a/datapath/vport.c b/datapath/vport.c index 9ed7cd16..5e164a91 100644 --- a/datapath/vport.c +++ b/datapath/vport.c @@ -1038,7 +1038,9 @@ vport_get_mtu(const struct vport *vport) * @skb: skb that was received * * Must be called with rcu_read_lock and bottom halves disabled. The packet - * cannot be shared and skb->data should point to the Ethernet header. + * cannot be shared and skb->data should point to the Ethernet header. The + * caller must have already called compute_ip_summed() to initialize the + * checksumming fields. */ void vport_receive(struct vport *vport, struct sk_buff *skb) -- 2.30.2