From f915f1a8ca180828983ef22cf2fd21b8f010b972 Mon Sep 17 00:00:00 2001
From: Ben Pfaff
Date: Tue, 1 Feb 2011 11:32:06 -0800
Subject: [PATCH] datapath: Consider tunnels to have no MTU, fixing jumbo frame
support.
Until now, tunnel vports have had a specific MTU, in the same way that
ordinary network devices have an MTU, but treating them this way does not
always make sense. For example, consider a datapath that has three ports:
the local port, a GRE tunnel to another host, and a physical port. If
the physical port is configured with a jumbo MTU, it should be possible to
send jumbo packets across the tunnel: the tunnel can do fragmentation or
the physical port traversed by the tunnel might have a jumbo MTU.
However, until now, tunnels always had a 1500-byte MTU by default. It
could be adjusted using ODP_VPORT_MTU_SET, but nothing actually did this.
One alternative would be to make ovs-vswitchd able to set the vport's MTU.
This commit, however, takes a different approach, of dropping the concept
of MTU entirely for tunnel vports. This also solves the problem described
above, without making any additional work for anyone.
I tested that, without this change, I could not send 1600-byte "pings"
between two machines whose NICs had 2000-byte MTUs that were connected to
vswitches that were in turn connected over GRE tunnels with the default
1500-byte MTU. With this change, it worked OK, regardless of the MTU of
the network traversed by the GRE tunnel.
This patch also makes "patch" ports MTU-less.
It might make sense to remove vport_set_mtu() and the associated callback
now, since ordinary network devices are the only vports that support it
now.
Signed-off-by: Ben Pfaff
Suggested-by: Jesse Gross
Acked-by: Jesse Gross
Bug #3728.
---
datapath/datapath.c | 7 +++++-
datapath/tunnel.c | 25 ---------------------
datapath/tunnel.h | 4 ----
datapath/vport-capwap.c | 2 --
datapath/vport-gre.c | 2 --
datapath/vport-patch.c | 29 -------------------------
datapath/vport.c | 10 +++++----
datapath/vport.h | 3 ++-
include/openvswitch/datapath-protocol.h | 3 ++-
lib/dhcp-client.c | 6 ++---
lib/dpif-linux.c | 4 +++-
lib/dpif-netdev.c | 2 +-
lib/netdev-linux.c | 13 ++++++++++-
lib/netdev-provider.h | 7 ++++--
lib/netdev.c | 7 +++---
vswitchd/bridge.c | 2 +-
vswitchd/vswitch.xml | 4 ++++
17 files changed, 48 insertions(+), 82 deletions(-)
diff --git a/datapath/datapath.c b/datapath/datapath.c
index 74f276b5..ba32e37f 100644
--- a/datapath/datapath.c
+++ b/datapath/datapath.c
@@ -773,6 +773,8 @@ int dp_min_mtu(const struct datapath *dp)
continue;
dev_mtu = vport_get_mtu(p);
+ if (!dev_mtu)
+ continue;
if (!mtu || dev_mtu < mtu)
mtu = dev_mtu;
}
@@ -1582,6 +1584,7 @@ static int odp_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb,
struct odp_header *odp_header;
struct nlattr *nla;
int ifindex, iflink;
+ int mtu;
int err;
odp_header = genlmsg_put(skb, pid, seq, &dp_vport_genl_family,
@@ -1603,7 +1606,9 @@ static int odp_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb,
NLA_PUT(skb, ODP_VPORT_ATTR_ADDRESS, ETH_ALEN, vport_get_addr(vport));
- NLA_PUT_U32(skb, ODP_VPORT_ATTR_MTU, vport_get_mtu(vport));
+ mtu = vport_get_mtu(vport);
+ if (mtu)
+ NLA_PUT_U32(skb, ODP_VPORT_ATTR_MTU, mtu);
err = vport_get_options(vport, skb);
if (err == -EMSGSIZE)
diff --git a/datapath/tunnel.c b/datapath/tunnel.c
index bd3a9e0a..6d0e7b9c 100644
--- a/datapath/tunnel.c
+++ b/datapath/tunnel.c
@@ -663,7 +663,6 @@ bool tnl_frag_needed(struct vport *vport, const struct tnl_mutable_config *mutab
}
#endif
- total_length = min(total_length, mutable->mtu);
payload_length = total_length - header_length;
nskb = dev_alloc_skb(NET_IP_ALIGN + eth_hdr_len + header_length +
@@ -1428,7 +1427,6 @@ struct vport *tnl_create(const struct vport_parms *parms,
}
vport_gen_rand_ether_addr(mutable->eth_addr);
- mutable->mtu = ETH_DATA_LEN;
get_random_bytes(&initial_frag_id, sizeof(int));
atomic_set(&tnl_vport->frag_id, initial_frag_id);
@@ -1477,7 +1475,6 @@ int tnl_set_options(struct vport *vport, struct nlattr *options)
old_mutable = rtnl_dereference(tnl_vport->mutable);
mutable->seq = old_mutable->seq + 1;
memcpy(mutable->eth_addr, old_mutable->eth_addr, ETH_ALEN);
- mutable->mtu = old_mutable->mtu;
/* Parse the others configured by userspace. */
err = tnl_set_config(options, tnl_vport->tnl_ops, vport, mutable);
@@ -1548,22 +1545,6 @@ int tnl_destroy(struct vport *vport)
return 0;
}
-int tnl_set_mtu(struct vport *vport, int mtu)
-{
- struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
- struct tnl_mutable_config *mutable;
-
- mutable = kmemdup(rtnl_dereference(tnl_vport->mutable),
- sizeof(struct tnl_mutable_config), GFP_KERNEL);
- if (!mutable)
- return -ENOMEM;
-
- mutable->mtu = mtu;
- assign_config_rcu(vport, mutable);
-
- return 0;
-}
-
int tnl_set_addr(struct vport *vport, const unsigned char *addr)
{
struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
@@ -1592,12 +1573,6 @@ const unsigned char *tnl_get_addr(const struct vport *vport)
return rcu_dereference_rtnl(tnl_vport->mutable)->eth_addr;
}
-int tnl_get_mtu(const struct vport *vport)
-{
- const struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
- return rcu_dereference_rtnl(tnl_vport->mutable)->mtu;
-}
-
void tnl_free_linked_skbs(struct sk_buff *skb)
{
if (unlikely(!skb))
diff --git a/datapath/tunnel.h b/datapath/tunnel.h
index c3bc8ddc..6ce9c468 100644
--- a/datapath/tunnel.h
+++ b/datapath/tunnel.h
@@ -52,7 +52,6 @@
* @tunnel_hlen: Tunnel header length.
* @eth_addr: Source address for packets generated by tunnel itself
* (e.g. ICMP fragmentation needed messages).
- * @mtu: MTU of tunnel.
* @in_key: Key to match on input, 0 for wildcard.
* @out_key: Key to use on output, 0 if this tunnel has no fixed output key.
* @flags: TNL_F_* flags.
@@ -70,7 +69,6 @@ struct tnl_mutable_config {
unsigned tunnel_hlen;
unsigned char eth_addr[ETH_ALEN];
- unsigned mtu;
/* Configured via ODP_TUNNEL_ATTR_* attributes. */
__be64 in_key;
@@ -223,11 +221,9 @@ int tnl_destroy(struct vport *);
int tnl_set_options(struct vport *, struct nlattr *);
int tnl_get_options(const struct vport *, struct sk_buff *);
-int tnl_set_mtu(struct vport *vport, int mtu);
int tnl_set_addr(struct vport *vport, const unsigned char *addr);
const char *tnl_get_name(const struct vport *vport);
const unsigned char *tnl_get_addr(const struct vport *vport);
-int tnl_get_mtu(const struct vport *vport);
int tnl_send(struct vport *vport, struct sk_buff *skb);
void tnl_rcv(struct vport *vport, struct sk_buff *skb);
diff --git a/datapath/vport-capwap.c b/datapath/vport-capwap.c
index 4fb43933..65f1f1bd 100644
--- a/datapath/vport-capwap.c
+++ b/datapath/vport-capwap.c
@@ -654,7 +654,6 @@ const struct vport_ops capwap_vport_ops = {
.exit = capwap_exit,
.create = capwap_create,
.destroy = tnl_destroy,
- .set_mtu = tnl_set_mtu,
.set_addr = tnl_set_addr,
.get_name = tnl_get_name,
.get_addr = tnl_get_addr,
@@ -663,7 +662,6 @@ const struct vport_ops capwap_vport_ops = {
.get_dev_flags = vport_gen_get_dev_flags,
.is_running = vport_gen_is_running,
.get_operstate = vport_gen_get_operstate,
- .get_mtu = tnl_get_mtu,
.send = tnl_send,
};
diff --git a/datapath/vport-gre.c b/datapath/vport-gre.c
index fc489e92..0f45f8f7 100644
--- a/datapath/vport-gre.c
+++ b/datapath/vport-gre.c
@@ -391,7 +391,6 @@ const struct vport_ops gre_vport_ops = {
.exit = gre_exit,
.create = gre_create,
.destroy = tnl_destroy,
- .set_mtu = tnl_set_mtu,
.set_addr = tnl_set_addr,
.get_name = tnl_get_name,
.get_addr = tnl_get_addr,
@@ -400,6 +399,5 @@ const struct vport_ops gre_vport_ops = {
.get_dev_flags = vport_gen_get_dev_flags,
.is_running = vport_gen_is_running,
.get_operstate = vport_gen_get_operstate,
- .get_mtu = tnl_get_mtu,
.send = tnl_send,
};
diff --git a/datapath/vport-patch.c b/datapath/vport-patch.c
index 7dfea6ba..1c4d2c5e 100644
--- a/datapath/vport-patch.c
+++ b/datapath/vport-patch.c
@@ -20,7 +20,6 @@ struct patch_config {
char peer_name[IFNAMSIZ];
unsigned char eth_addr[ETH_ALEN];
- unsigned int mtu;
};
struct patch_vport {
@@ -148,10 +147,6 @@ static struct vport *patch_create(const struct vport_parms *parms)
vport_gen_rand_ether_addr(patchconf->eth_addr);
- /* Make the default MTU fairly large so that it doesn't become the
- * bottleneck on systems using jumbo frames. */
- patchconf->mtu = 65535;
-
rcu_assign_pointer(patch_vport->patchconf, patchconf);
peer_name = patchconf->peer_name;
@@ -236,22 +231,6 @@ static void update_peers(const char *name, struct vport *vport)
}
}
-static int patch_set_mtu(struct vport *vport, int mtu)
-{
- struct patch_vport *patch_vport = patch_vport_priv(vport);
- struct patch_config *patchconf;
-
- patchconf = kmemdup(rtnl_dereference(patch_vport->patchconf),
- sizeof(struct patch_config), GFP_KERNEL);
- if (!patchconf)
- return -ENOMEM;
-
- patchconf->mtu = mtu;
- assign_config_rcu(vport, patchconf);
-
- return 0;
-}
-
static int patch_set_addr(struct vport *vport, const unsigned char *addr)
{
struct patch_vport *patch_vport = patch_vport_priv(vport);
@@ -289,12 +268,6 @@ static int patch_get_options(const struct vport *vport, struct sk_buff *skb)
return nla_put_string(skb, ODP_PATCH_ATTR_PEER, patchconf->peer_name);
}
-static int patch_get_mtu(const struct vport *vport)
-{
- const struct patch_vport *patch_vport = patch_vport_priv(vport);
- return rcu_dereference_rtnl(patch_vport->patchconf)->mtu;
-}
-
static int patch_send(struct vport *vport, struct sk_buff *skb)
{
struct patch_vport *patch_vport = patch_vport_priv(vport);
@@ -319,7 +292,6 @@ const struct vport_ops patch_vport_ops = {
.exit = patch_exit,
.create = patch_create,
.destroy = patch_destroy,
- .set_mtu = patch_set_mtu,
.set_addr = patch_set_addr,
.get_name = patch_get_name,
.get_addr = patch_get_addr,
@@ -328,6 +300,5 @@ const struct vport_ops patch_vport_ops = {
.get_dev_flags = vport_gen_get_dev_flags,
.is_running = vport_gen_is_running,
.get_operstate = vport_gen_get_operstate,
- .get_mtu = patch_get_mtu,
.send = patch_send,
};
diff --git a/datapath/vport.c b/datapath/vport.c
index 852a3c69..8ef96f73 100644
--- a/datapath/vport.c
+++ b/datapath/vport.c
@@ -605,12 +605,14 @@ int vport_get_iflink(const struct vport *vport)
*
* @vport: vport from which to retrieve MTU
*
- * Retrieves the MTU of the given device.
- *
- * Must be called with RTNL lock or rcu_read_lock.
+ * Retrieves the MTU of the given device. Returns 0 if @vport does not have an
+ * MTU (as e.g. some tunnels do not). Either RTNL lock or rcu_read_lock must
+ * be held.
*/
int vport_get_mtu(const struct vport *vport)
{
+ if (!vport->ops->get_mtu)
+ return 0;
return vport->ops->get_mtu(vport);
}
@@ -710,7 +712,7 @@ int vport_send(struct vport *vport, struct sk_buff *skb)
int sent;
mtu = vport_get_mtu(vport);
- if (unlikely(packet_length(skb) > mtu && !skb_is_gso(skb))) {
+ if (mtu && unlikely(packet_length(skb) > mtu && !skb_is_gso(skb))) {
if (net_ratelimit())
pr_warn("%s: dropped over-mtu packet: %d > %d\n",
dp_name(vport->dp), packet_length(skb), mtu);
diff --git a/datapath/vport.h b/datapath/vport.h
index ee3b127d..8e707628 100644
--- a/datapath/vport.h
+++ b/datapath/vport.h
@@ -175,7 +175,8 @@ struct vport_parms {
* @get_iflink: Get the system interface index associated with the device that
* will be used to send packets (may be different than ifindex for tunnels).
* May be null if the device does not have an iflink.
- * @get_mtu: Get the device's MTU.
+ * @get_mtu: Get the device's MTU. May be %NULL if the device does not have an
+ * MTU (as e.g. some tunnels do not).
* @send: Send a packet on the device. Returns the length of the packet sent.
*/
struct vport_ops {
diff --git a/include/openvswitch/datapath-protocol.h b/include/openvswitch/datapath-protocol.h
index 8645096a..eafc8003 100644
--- a/include/openvswitch/datapath-protocol.h
+++ b/include/openvswitch/datapath-protocol.h
@@ -237,7 +237,8 @@ enum odp_vport_cmd {
* @ODP_VPORT_ATTR_STATS: A &struct rtnl_link_stats64 giving statistics for
* packets sent or received through the vport.
* @ODP_VPORT_ATTR_ADDRESS: A 6-byte Ethernet address for the vport.
- * @ODP_VPORT_ATTR_MTU: MTU for the vport.
+ * @ODP_VPORT_ATTR_MTU: MTU for the vport. Omitted if the vport does not have
+ * an MTU as, e.g., some tunnels do not.
* @ODP_VPORT_ATTR_IFINDEX: ifindex of the underlying network device, if any.
* @ODP_VPORT_ATTR_IFLINK: ifindex of the device on which packets are sent (for
* tunnels), if any.
diff --git a/lib/dhcp-client.c b/lib/dhcp-client.c
index b9ebefe0..e59dc34c 100644
--- a/lib/dhcp-client.c
+++ b/lib/dhcp-client.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2008, 2009, 2010 Nicira Networks.
+ * Copyright (c) 2008, 2009, 2010, 2011 Nicira Networks.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -934,10 +934,8 @@ do_receive_msg(struct dhclient *cli, struct dhcp_msg *msg)
const char *cli_name = dhclient_get_name(cli);
uint8_t cli_mac[ETH_ADDR_LEN];
struct ofpbuf b;
- int mtu;
- netdev_get_mtu(cli->netdev, &mtu);
- ofpbuf_init(&b, mtu + VLAN_ETH_HEADER_LEN);
+ ofpbuf_init(&b, ETH_TOTAL_MAX + VLAN_ETH_HEADER_LEN);
netdev_get_etheraddr(cli->netdev, cli_mac);
for (; cli->received < 50; cli->received++) {
const struct ip_header *ip;
diff --git a/lib/dpif-linux.c b/lib/dpif-linux.c
index f1d42dc2..509174c8 100644
--- a/lib/dpif-linux.c
+++ b/lib/dpif-linux.c
@@ -1108,6 +1108,8 @@ dpif_linux_vport_from_ofpbuf(struct dpif_linux_vport *vport,
}
if (a[ODP_VPORT_ATTR_MTU]) {
vport->mtu = nl_attr_get_u32(a[ODP_VPORT_ATTR_MTU]);
+ } else {
+ vport->mtu = INT_MAX;
}
if (a[ODP_VPORT_ATTR_OPTIONS]) {
vport->options = nl_attr_get(a[ODP_VPORT_ATTR_OPTIONS]);
@@ -1158,7 +1160,7 @@ dpif_linux_vport_to_ofpbuf(const struct dpif_linux_vport *vport,
vport->address, ETH_ADDR_LEN);
}
- if (vport->mtu) {
+ if (vport->mtu && vport->mtu != INT_MAX) {
nl_msg_put_u32(buf, ODP_VPORT_ATTR_MTU, vport->mtu);
}
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index acc14a8b..93dd1cde 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -375,7 +375,7 @@ do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
port->internal = internal;
netdev_get_mtu(netdev, &mtu);
- if (mtu > max_mtu) {
+ if (mtu != INT_MAX && mtu > max_mtu) {
max_mtu = mtu;
}
diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
index 9deea57d..1428ce66 100644
--- a/lib/netdev-linux.c
+++ b/lib/netdev-linux.c
@@ -2365,6 +2365,11 @@ htb_setup_class__(struct netdev *netdev, unsigned int handle,
int mtu;
netdev_get_mtu(netdev, &mtu);
+ if (mtu == INT_MAX) {
+ VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
+ netdev_get_name(netdev));
+ return EINVAL;
+ }
memset(&opt, 0, sizeof opt);
tc_fill_rate(&opt.rate, class->min_rate, mtu);
@@ -2484,6 +2489,13 @@ htb_parse_class_details__(struct netdev *netdev,
const char *priority_s = shash_find_data(details, "priority");
int mtu;
+ netdev_get_mtu(netdev, &mtu);
+ if (mtu == INT_MAX) {
+ VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
+ netdev_get_name(netdev));
+ return EINVAL;
+ }
+
/* min-rate. Don't allow a min-rate below 1500 bytes/s. */
if (!min_rate_s) {
/* min-rate is required. */
@@ -2509,7 +2521,6 @@ htb_parse_class_details__(struct netdev *netdev,
* doesn't include the Ethernet header, we need to add at least 14 (18?) to
* the MTU. We actually add 64, instead of 14, as a guard against
* additional headers get tacked on somewhere that we're not aware of. */
- netdev_get_mtu(netdev, &mtu);
hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
hc->burst = MAX(hc->burst, mtu + 64);
diff --git a/lib/netdev-provider.h b/lib/netdev-provider.h
index 4181758f..be51c48a 100644
--- a/lib/netdev-provider.h
+++ b/lib/netdev-provider.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2009, 2010 Nicira Networks.
+ * Copyright (c) 2009, 2010, 2011 Nicira Networks.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -237,7 +237,10 @@ struct netdev_class {
*
* The MTU is the maximum size of transmitted (and received) packets, in
* bytes, not including the hardware header; thus, this is typically 1500
- * bytes for Ethernet devices.*/
+ * bytes for Ethernet devices.
+ *
+ * If 'netdev' does not have an MTU (e.g. as some tunnels do not), then
+ * this function should set '*mtup' to INT_MAX. */
int (*get_mtu)(const struct netdev *netdev, int *mtup);
/* Returns the ifindex of 'netdev', if successful, as a positive number.
diff --git a/lib/netdev.c b/lib/netdev.c
index 24b616a3..f06742ae 100644
--- a/lib/netdev.c
+++ b/lib/netdev.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2008, 2009, 2010 Nicira Networks.
+ * Copyright (c) 2008, 2009, 2010, 2011 Nicira Networks.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -521,8 +521,9 @@ netdev_get_name(const struct netdev *netdev)
* (and received) packets, in bytes, not including the hardware header; thus,
* this is typically 1500 bytes for Ethernet devices.
*
- * If successful, returns 0 and stores the MTU size in '*mtup'. On failure,
- * returns a positive errno value and stores ETH_PAYLOAD_MAX (1500) in
+ * If successful, returns 0 and stores the MTU size in '*mtup'. Stores INT_MAX
+ * in '*mtup' if 'netdev' does not have an MTU (as e.g. some tunnels do not).On
+ * failure, returns a positive errno value and stores ETH_PAYLOAD_MAX (1500) in
* '*mtup'. */
int
netdev_get_mtu(const struct netdev *netdev, int *mtup)
diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c
index 54830dcd..bb67e17f 100644
--- a/vswitchd/bridge.c
+++ b/vswitchd/bridge.c
@@ -1207,7 +1207,7 @@ iface_refresh_status(struct iface *iface)
? "up" : "down");
error = netdev_get_mtu(iface->netdev, &mtu);
- if (!error) {
+ if (!error && mtu != INT_MAX) {
mtu_64 = mtu;
ovsrec_interface_set_mtu(iface->cfg, &mtu_64, 1);
}
diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
index bb7afe88..f4515f4f 100644
--- a/vswitchd/vswitch.xml
+++ b/vswitchd/vswitch.xml
@@ -1065,6 +1065,10 @@
and many kinds of virtual interfaces can be configured with
higher MTUs.
+
+ This column will be empty for an interface that does not
+ have an MTU as, for example, some kinds of tunnels do not.
+
--
2.30.2