From: Ben Pfaff Date: Tue, 1 Feb 2011 19:32:06 +0000 (-0800) Subject: datapath: Consider tunnels to have no MTU, fixing jumbo frame support. X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=f915f1a8ca180828983ef22cf2fd21b8f010b972;p=openvswitch datapath: Consider tunnels to have no MTU, fixing jumbo frame support. Until now, tunnel vports have had a specific MTU, in the same way that ordinary network devices have an MTU, but treating them this way does not always make sense. For example, consider a datapath that has three ports: the local port, a GRE tunnel to another host, and a physical port. If the physical port is configured with a jumbo MTU, it should be possible to send jumbo packets across the tunnel: the tunnel can do fragmentation or the physical port traversed by the tunnel might have a jumbo MTU. However, until now, tunnels always had a 1500-byte MTU by default. It could be adjusted using ODP_VPORT_MTU_SET, but nothing actually did this. One alternative would be to make ovs-vswitchd able to set the vport's MTU. This commit, however, takes a different approach, of dropping the concept of MTU entirely for tunnel vports. This also solves the problem described above, without making any additional work for anyone. I tested that, without this change, I could not send 1600-byte "pings" between two machines whose NICs had 2000-byte MTUs that were connected to vswitches that were in turn connected over GRE tunnels with the default 1500-byte MTU. With this change, it worked OK, regardless of the MTU of the network traversed by the GRE tunnel. This patch also makes "patch" ports MTU-less. It might make sense to remove vport_set_mtu() and the associated callback now, since ordinary network devices are the only vports that support it now. Signed-off-by: Ben Pfaff Suggested-by: Jesse Gross Acked-by: Jesse Gross Bug #3728. --- diff --git a/datapath/datapath.c b/datapath/datapath.c index 74f276b5..ba32e37f 100644 --- a/datapath/datapath.c +++ b/datapath/datapath.c @@ -773,6 +773,8 @@ int dp_min_mtu(const struct datapath *dp) continue; dev_mtu = vport_get_mtu(p); + if (!dev_mtu) + continue; if (!mtu || dev_mtu < mtu) mtu = dev_mtu; } @@ -1582,6 +1584,7 @@ static int odp_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, struct odp_header *odp_header; struct nlattr *nla; int ifindex, iflink; + int mtu; int err; odp_header = genlmsg_put(skb, pid, seq, &dp_vport_genl_family, @@ -1603,7 +1606,9 @@ static int odp_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, NLA_PUT(skb, ODP_VPORT_ATTR_ADDRESS, ETH_ALEN, vport_get_addr(vport)); - NLA_PUT_U32(skb, ODP_VPORT_ATTR_MTU, vport_get_mtu(vport)); + mtu = vport_get_mtu(vport); + if (mtu) + NLA_PUT_U32(skb, ODP_VPORT_ATTR_MTU, mtu); err = vport_get_options(vport, skb); if (err == -EMSGSIZE) diff --git a/datapath/tunnel.c b/datapath/tunnel.c index bd3a9e0a..6d0e7b9c 100644 --- a/datapath/tunnel.c +++ b/datapath/tunnel.c @@ -663,7 +663,6 @@ bool tnl_frag_needed(struct vport *vport, const struct tnl_mutable_config *mutab } #endif - total_length = min(total_length, mutable->mtu); payload_length = total_length - header_length; nskb = dev_alloc_skb(NET_IP_ALIGN + eth_hdr_len + header_length + @@ -1428,7 +1427,6 @@ struct vport *tnl_create(const struct vport_parms *parms, } vport_gen_rand_ether_addr(mutable->eth_addr); - mutable->mtu = ETH_DATA_LEN; get_random_bytes(&initial_frag_id, sizeof(int)); atomic_set(&tnl_vport->frag_id, initial_frag_id); @@ -1477,7 +1475,6 @@ int tnl_set_options(struct vport *vport, struct nlattr *options) old_mutable = rtnl_dereference(tnl_vport->mutable); mutable->seq = old_mutable->seq + 1; memcpy(mutable->eth_addr, old_mutable->eth_addr, ETH_ALEN); - mutable->mtu = old_mutable->mtu; /* Parse the others configured by userspace. */ err = tnl_set_config(options, tnl_vport->tnl_ops, vport, mutable); @@ -1548,22 +1545,6 @@ int tnl_destroy(struct vport *vport) return 0; } -int tnl_set_mtu(struct vport *vport, int mtu) -{ - struct tnl_vport *tnl_vport = tnl_vport_priv(vport); - struct tnl_mutable_config *mutable; - - mutable = kmemdup(rtnl_dereference(tnl_vport->mutable), - sizeof(struct tnl_mutable_config), GFP_KERNEL); - if (!mutable) - return -ENOMEM; - - mutable->mtu = mtu; - assign_config_rcu(vport, mutable); - - return 0; -} - int tnl_set_addr(struct vport *vport, const unsigned char *addr) { struct tnl_vport *tnl_vport = tnl_vport_priv(vport); @@ -1592,12 +1573,6 @@ const unsigned char *tnl_get_addr(const struct vport *vport) return rcu_dereference_rtnl(tnl_vport->mutable)->eth_addr; } -int tnl_get_mtu(const struct vport *vport) -{ - const struct tnl_vport *tnl_vport = tnl_vport_priv(vport); - return rcu_dereference_rtnl(tnl_vport->mutable)->mtu; -} - void tnl_free_linked_skbs(struct sk_buff *skb) { if (unlikely(!skb)) diff --git a/datapath/tunnel.h b/datapath/tunnel.h index c3bc8ddc..6ce9c468 100644 --- a/datapath/tunnel.h +++ b/datapath/tunnel.h @@ -52,7 +52,6 @@ * @tunnel_hlen: Tunnel header length. * @eth_addr: Source address for packets generated by tunnel itself * (e.g. ICMP fragmentation needed messages). - * @mtu: MTU of tunnel. * @in_key: Key to match on input, 0 for wildcard. * @out_key: Key to use on output, 0 if this tunnel has no fixed output key. * @flags: TNL_F_* flags. @@ -70,7 +69,6 @@ struct tnl_mutable_config { unsigned tunnel_hlen; unsigned char eth_addr[ETH_ALEN]; - unsigned mtu; /* Configured via ODP_TUNNEL_ATTR_* attributes. */ __be64 in_key; @@ -223,11 +221,9 @@ int tnl_destroy(struct vport *); int tnl_set_options(struct vport *, struct nlattr *); int tnl_get_options(const struct vport *, struct sk_buff *); -int tnl_set_mtu(struct vport *vport, int mtu); int tnl_set_addr(struct vport *vport, const unsigned char *addr); const char *tnl_get_name(const struct vport *vport); const unsigned char *tnl_get_addr(const struct vport *vport); -int tnl_get_mtu(const struct vport *vport); int tnl_send(struct vport *vport, struct sk_buff *skb); void tnl_rcv(struct vport *vport, struct sk_buff *skb); diff --git a/datapath/vport-capwap.c b/datapath/vport-capwap.c index 4fb43933..65f1f1bd 100644 --- a/datapath/vport-capwap.c +++ b/datapath/vport-capwap.c @@ -654,7 +654,6 @@ const struct vport_ops capwap_vport_ops = { .exit = capwap_exit, .create = capwap_create, .destroy = tnl_destroy, - .set_mtu = tnl_set_mtu, .set_addr = tnl_set_addr, .get_name = tnl_get_name, .get_addr = tnl_get_addr, @@ -663,7 +662,6 @@ const struct vport_ops capwap_vport_ops = { .get_dev_flags = vport_gen_get_dev_flags, .is_running = vport_gen_is_running, .get_operstate = vport_gen_get_operstate, - .get_mtu = tnl_get_mtu, .send = tnl_send, }; diff --git a/datapath/vport-gre.c b/datapath/vport-gre.c index fc489e92..0f45f8f7 100644 --- a/datapath/vport-gre.c +++ b/datapath/vport-gre.c @@ -391,7 +391,6 @@ const struct vport_ops gre_vport_ops = { .exit = gre_exit, .create = gre_create, .destroy = tnl_destroy, - .set_mtu = tnl_set_mtu, .set_addr = tnl_set_addr, .get_name = tnl_get_name, .get_addr = tnl_get_addr, @@ -400,6 +399,5 @@ const struct vport_ops gre_vport_ops = { .get_dev_flags = vport_gen_get_dev_flags, .is_running = vport_gen_is_running, .get_operstate = vport_gen_get_operstate, - .get_mtu = tnl_get_mtu, .send = tnl_send, }; diff --git a/datapath/vport-patch.c b/datapath/vport-patch.c index 7dfea6ba..1c4d2c5e 100644 --- a/datapath/vport-patch.c +++ b/datapath/vport-patch.c @@ -20,7 +20,6 @@ struct patch_config { char peer_name[IFNAMSIZ]; unsigned char eth_addr[ETH_ALEN]; - unsigned int mtu; }; struct patch_vport { @@ -148,10 +147,6 @@ static struct vport *patch_create(const struct vport_parms *parms) vport_gen_rand_ether_addr(patchconf->eth_addr); - /* Make the default MTU fairly large so that it doesn't become the - * bottleneck on systems using jumbo frames. */ - patchconf->mtu = 65535; - rcu_assign_pointer(patch_vport->patchconf, patchconf); peer_name = patchconf->peer_name; @@ -236,22 +231,6 @@ static void update_peers(const char *name, struct vport *vport) } } -static int patch_set_mtu(struct vport *vport, int mtu) -{ - struct patch_vport *patch_vport = patch_vport_priv(vport); - struct patch_config *patchconf; - - patchconf = kmemdup(rtnl_dereference(patch_vport->patchconf), - sizeof(struct patch_config), GFP_KERNEL); - if (!patchconf) - return -ENOMEM; - - patchconf->mtu = mtu; - assign_config_rcu(vport, patchconf); - - return 0; -} - static int patch_set_addr(struct vport *vport, const unsigned char *addr) { struct patch_vport *patch_vport = patch_vport_priv(vport); @@ -289,12 +268,6 @@ static int patch_get_options(const struct vport *vport, struct sk_buff *skb) return nla_put_string(skb, ODP_PATCH_ATTR_PEER, patchconf->peer_name); } -static int patch_get_mtu(const struct vport *vport) -{ - const struct patch_vport *patch_vport = patch_vport_priv(vport); - return rcu_dereference_rtnl(patch_vport->patchconf)->mtu; -} - static int patch_send(struct vport *vport, struct sk_buff *skb) { struct patch_vport *patch_vport = patch_vport_priv(vport); @@ -319,7 +292,6 @@ const struct vport_ops patch_vport_ops = { .exit = patch_exit, .create = patch_create, .destroy = patch_destroy, - .set_mtu = patch_set_mtu, .set_addr = patch_set_addr, .get_name = patch_get_name, .get_addr = patch_get_addr, @@ -328,6 +300,5 @@ const struct vport_ops patch_vport_ops = { .get_dev_flags = vport_gen_get_dev_flags, .is_running = vport_gen_is_running, .get_operstate = vport_gen_get_operstate, - .get_mtu = patch_get_mtu, .send = patch_send, }; diff --git a/datapath/vport.c b/datapath/vport.c index 852a3c69..8ef96f73 100644 --- a/datapath/vport.c +++ b/datapath/vport.c @@ -605,12 +605,14 @@ int vport_get_iflink(const struct vport *vport) * * @vport: vport from which to retrieve MTU * - * Retrieves the MTU of the given device. - * - * Must be called with RTNL lock or rcu_read_lock. + * Retrieves the MTU of the given device. Returns 0 if @vport does not have an + * MTU (as e.g. some tunnels do not). Either RTNL lock or rcu_read_lock must + * be held. */ int vport_get_mtu(const struct vport *vport) { + if (!vport->ops->get_mtu) + return 0; return vport->ops->get_mtu(vport); } @@ -710,7 +712,7 @@ int vport_send(struct vport *vport, struct sk_buff *skb) int sent; mtu = vport_get_mtu(vport); - if (unlikely(packet_length(skb) > mtu && !skb_is_gso(skb))) { + if (mtu && unlikely(packet_length(skb) > mtu && !skb_is_gso(skb))) { if (net_ratelimit()) pr_warn("%s: dropped over-mtu packet: %d > %d\n", dp_name(vport->dp), packet_length(skb), mtu); diff --git a/datapath/vport.h b/datapath/vport.h index ee3b127d..8e707628 100644 --- a/datapath/vport.h +++ b/datapath/vport.h @@ -175,7 +175,8 @@ struct vport_parms { * @get_iflink: Get the system interface index associated with the device that * will be used to send packets (may be different than ifindex for tunnels). * May be null if the device does not have an iflink. - * @get_mtu: Get the device's MTU. + * @get_mtu: Get the device's MTU. May be %NULL if the device does not have an + * MTU (as e.g. some tunnels do not). * @send: Send a packet on the device. Returns the length of the packet sent. */ struct vport_ops { diff --git a/include/openvswitch/datapath-protocol.h b/include/openvswitch/datapath-protocol.h index 8645096a..eafc8003 100644 --- a/include/openvswitch/datapath-protocol.h +++ b/include/openvswitch/datapath-protocol.h @@ -237,7 +237,8 @@ enum odp_vport_cmd { * @ODP_VPORT_ATTR_STATS: A &struct rtnl_link_stats64 giving statistics for * packets sent or received through the vport. * @ODP_VPORT_ATTR_ADDRESS: A 6-byte Ethernet address for the vport. - * @ODP_VPORT_ATTR_MTU: MTU for the vport. + * @ODP_VPORT_ATTR_MTU: MTU for the vport. Omitted if the vport does not have + * an MTU as, e.g., some tunnels do not. * @ODP_VPORT_ATTR_IFINDEX: ifindex of the underlying network device, if any. * @ODP_VPORT_ATTR_IFLINK: ifindex of the device on which packets are sent (for * tunnels), if any. diff --git a/lib/dhcp-client.c b/lib/dhcp-client.c index b9ebefe0..e59dc34c 100644 --- a/lib/dhcp-client.c +++ b/lib/dhcp-client.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008, 2009, 2010 Nicira Networks. + * Copyright (c) 2008, 2009, 2010, 2011 Nicira Networks. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -934,10 +934,8 @@ do_receive_msg(struct dhclient *cli, struct dhcp_msg *msg) const char *cli_name = dhclient_get_name(cli); uint8_t cli_mac[ETH_ADDR_LEN]; struct ofpbuf b; - int mtu; - netdev_get_mtu(cli->netdev, &mtu); - ofpbuf_init(&b, mtu + VLAN_ETH_HEADER_LEN); + ofpbuf_init(&b, ETH_TOTAL_MAX + VLAN_ETH_HEADER_LEN); netdev_get_etheraddr(cli->netdev, cli_mac); for (; cli->received < 50; cli->received++) { const struct ip_header *ip; diff --git a/lib/dpif-linux.c b/lib/dpif-linux.c index f1d42dc2..509174c8 100644 --- a/lib/dpif-linux.c +++ b/lib/dpif-linux.c @@ -1108,6 +1108,8 @@ dpif_linux_vport_from_ofpbuf(struct dpif_linux_vport *vport, } if (a[ODP_VPORT_ATTR_MTU]) { vport->mtu = nl_attr_get_u32(a[ODP_VPORT_ATTR_MTU]); + } else { + vport->mtu = INT_MAX; } if (a[ODP_VPORT_ATTR_OPTIONS]) { vport->options = nl_attr_get(a[ODP_VPORT_ATTR_OPTIONS]); @@ -1158,7 +1160,7 @@ dpif_linux_vport_to_ofpbuf(const struct dpif_linux_vport *vport, vport->address, ETH_ADDR_LEN); } - if (vport->mtu) { + if (vport->mtu && vport->mtu != INT_MAX) { nl_msg_put_u32(buf, ODP_VPORT_ATTR_MTU, vport->mtu); } diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index acc14a8b..93dd1cde 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -375,7 +375,7 @@ do_add_port(struct dp_netdev *dp, const char *devname, const char *type, port->internal = internal; netdev_get_mtu(netdev, &mtu); - if (mtu > max_mtu) { + if (mtu != INT_MAX && mtu > max_mtu) { max_mtu = mtu; } diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c index 9deea57d..1428ce66 100644 --- a/lib/netdev-linux.c +++ b/lib/netdev-linux.c @@ -2365,6 +2365,11 @@ htb_setup_class__(struct netdev *netdev, unsigned int handle, int mtu; netdev_get_mtu(netdev, &mtu); + if (mtu == INT_MAX) { + VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU", + netdev_get_name(netdev)); + return EINVAL; + } memset(&opt, 0, sizeof opt); tc_fill_rate(&opt.rate, class->min_rate, mtu); @@ -2484,6 +2489,13 @@ htb_parse_class_details__(struct netdev *netdev, const char *priority_s = shash_find_data(details, "priority"); int mtu; + netdev_get_mtu(netdev, &mtu); + if (mtu == INT_MAX) { + VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU", + netdev_get_name(netdev)); + return EINVAL; + } + /* min-rate. Don't allow a min-rate below 1500 bytes/s. */ if (!min_rate_s) { /* min-rate is required. */ @@ -2509,7 +2521,6 @@ htb_parse_class_details__(struct netdev *netdev, * doesn't include the Ethernet header, we need to add at least 14 (18?) to * the MTU. We actually add 64, instead of 14, as a guard against * additional headers get tacked on somewhere that we're not aware of. */ - netdev_get_mtu(netdev, &mtu); hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0; hc->burst = MAX(hc->burst, mtu + 64); diff --git a/lib/netdev-provider.h b/lib/netdev-provider.h index 4181758f..be51c48a 100644 --- a/lib/netdev-provider.h +++ b/lib/netdev-provider.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009, 2010 Nicira Networks. + * Copyright (c) 2009, 2010, 2011 Nicira Networks. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -237,7 +237,10 @@ struct netdev_class { * * The MTU is the maximum size of transmitted (and received) packets, in * bytes, not including the hardware header; thus, this is typically 1500 - * bytes for Ethernet devices.*/ + * bytes for Ethernet devices. + * + * If 'netdev' does not have an MTU (e.g. as some tunnels do not), then + * this function should set '*mtup' to INT_MAX. */ int (*get_mtu)(const struct netdev *netdev, int *mtup); /* Returns the ifindex of 'netdev', if successful, as a positive number. diff --git a/lib/netdev.c b/lib/netdev.c index 24b616a3..f06742ae 100644 --- a/lib/netdev.c +++ b/lib/netdev.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008, 2009, 2010 Nicira Networks. + * Copyright (c) 2008, 2009, 2010, 2011 Nicira Networks. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -521,8 +521,9 @@ netdev_get_name(const struct netdev *netdev) * (and received) packets, in bytes, not including the hardware header; thus, * this is typically 1500 bytes for Ethernet devices. * - * If successful, returns 0 and stores the MTU size in '*mtup'. On failure, - * returns a positive errno value and stores ETH_PAYLOAD_MAX (1500) in + * If successful, returns 0 and stores the MTU size in '*mtup'. Stores INT_MAX + * in '*mtup' if 'netdev' does not have an MTU (as e.g. some tunnels do not).On + * failure, returns a positive errno value and stores ETH_PAYLOAD_MAX (1500) in * '*mtup'. */ int netdev_get_mtu(const struct netdev *netdev, int *mtup) diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c index 54830dcd..bb67e17f 100644 --- a/vswitchd/bridge.c +++ b/vswitchd/bridge.c @@ -1207,7 +1207,7 @@ iface_refresh_status(struct iface *iface) ? "up" : "down"); error = netdev_get_mtu(iface->netdev, &mtu); - if (!error) { + if (!error && mtu != INT_MAX) { mtu_64 = mtu; ovsrec_interface_set_mtu(iface->cfg, &mtu_64, 1); } diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml index bb7afe88..f4515f4f 100644 --- a/vswitchd/vswitch.xml +++ b/vswitchd/vswitch.xml @@ -1065,6 +1065,10 @@ and many kinds of virtual interfaces can be configured with higher MTUs.

+

+ This column will be empty for an interface that does not + have an MTU as, for example, some kinds of tunnels do not. +