From: Jesse Gross Date: Wed, 11 Aug 2010 00:11:48 +0000 (-0400) Subject: datapath: Abstract tunneling implementation from GRE. X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=d1eb60ccff0c2dbef2300cdfc5fead3c2d394beb;p=openvswitch datapath: Abstract tunneling implementation from GRE. Much of the code in the GRE implementation is not specific to the GRE protocol but is actually common to all types of tunnels. In order to support future types of tunnels, move this code into a common library. Signed-off-by: Jesse Gross --- diff --git a/datapath/Modules.mk b/datapath/Modules.mk index 7ae1383c..c158d39e 100644 --- a/datapath/Modules.mk +++ b/datapath/Modules.mk @@ -17,6 +17,7 @@ openvswitch_sources = \ dp_sysfs_if.c \ flow.c \ table.c \ + tunnel.c \ vport.c \ vport-generic.c \ vport-gre.c \ @@ -32,6 +33,7 @@ openvswitch_headers = \ flow.h \ odp-compat.h \ table.h \ + tunnel.h \ vport.h \ vport-generic.h \ vport-internal_dev.h \ diff --git a/datapath/linux-2.6/.gitignore b/datapath/linux-2.6/.gitignore index bd99f240..b1d44b87 100644 --- a/datapath/linux-2.6/.gitignore +++ b/datapath/linux-2.6/.gitignore @@ -27,7 +27,7 @@ /table.c /time.c /tmp -/veth.c +/tunnel.c /vport-generic.c /vport-gre.c /vport-internal_dev.c diff --git a/datapath/tunnel.c b/datapath/tunnel.c new file mode 100644 index 00000000..3f25c9b4 --- /dev/null +++ b/datapath/tunnel.c @@ -0,0 +1,1084 @@ +/* + * Copyright (c) 2010 Nicira Networks. + * Distributed under the terms of the GNU GPL version 2. + * + * Significant portions of this file may be copied from parts of the Linux + * kernel, by Linus Torvalds and others. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#include +#endif +#include +#include + +#include "actions.h" +#include "datapath.h" +#include "table.h" +#include "tunnel.h" +#include "vport.h" +#include "vport-generic.h" + +/* Protected by RCU. */ +static struct tbl *port_table; + +/* + * These are just used as an optimization: they don't require any kind of + * synchronization because we could have just as easily read the value before + * the port change happened. + */ +static unsigned int key_local_remote_ports; +static unsigned int key_remote_ports; +static unsigned int local_remote_ports; +static unsigned int remote_ports; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36) +#define rt_dst(rt) (rt->dst) +#else +#define rt_dst(rt) (rt->u.dst) +#endif + +static inline struct vport *tnl_vport_to_vport(const struct tnl_vport *tnl_vport) +{ + return vport_from_priv(tnl_vport); +} + +static inline struct tnl_vport *tnl_vport_table_cast(const struct tbl_node *node) +{ + return container_of(node, struct tnl_vport, tbl_node); +} + +/* RCU callback. 
*/ +static void free_config(struct rcu_head *rcu) +{ + struct tnl_mutable_config *c = container_of(rcu, struct tnl_mutable_config, rcu); + kfree(c); +} + +static void assign_config_rcu(struct vport *vport, + struct tnl_mutable_config *new_config) +{ + struct tnl_vport *tnl_vport = tnl_vport_priv(vport); + struct tnl_mutable_config *old_config; + + old_config = rcu_dereference(tnl_vport->mutable); + rcu_assign_pointer(tnl_vport->mutable, new_config); + call_rcu(&old_config->rcu, free_config); +} + +static unsigned int *find_port_pool(const struct tnl_mutable_config *mutable) +{ + if (mutable->port_config.flags & TNL_F_IN_KEY_MATCH) { + if (mutable->port_config.saddr) + return &local_remote_ports; + else + return &remote_ports; + } else { + if (mutable->port_config.saddr) + return &key_local_remote_ports; + else + return &key_remote_ports; + } +} + +enum lookup_key { + LOOKUP_TUNNEL_TYPE = 0, + LOOKUP_SADDR = 1, + LOOKUP_DADDR = 2, + LOOKUP_KEY = 3, +}; + +struct port_lookup_key { + u32 vals[4]; /* Contains enum lookup_key keys. */ + const struct tnl_mutable_config *mutable; +}; + +/* + * Modifies 'target' to store the rcu_dereferenced pointer that was used to do + * the comparison. 
+ */ +static int port_cmp(const struct tbl_node *node, void *target) +{ + const struct tnl_vport *tnl_vport = tnl_vport_table_cast(node); + struct port_lookup_key *lookup = target; + + lookup->mutable = rcu_dereference(tnl_vport->mutable); + + return (lookup->mutable->tunnel_type == lookup->vals[LOOKUP_TUNNEL_TYPE]) && + lookup->mutable->port_config.daddr == lookup->vals[LOOKUP_DADDR] && + lookup->mutable->port_config.in_key == lookup->vals[LOOKUP_KEY] && + lookup->mutable->port_config.saddr == lookup->vals[LOOKUP_SADDR]; +} + +static u32 port_hash(struct port_lookup_key *lookup) +{ + return jhash2(lookup->vals, ARRAY_SIZE(lookup->vals), 0); +} + +static int add_port(struct vport *vport) +{ + struct tnl_vport *tnl_vport = tnl_vport_priv(vport); + struct port_lookup_key lookup; + int err; + + if (!port_table) { + struct tbl *new_table; + + new_table = tbl_create(0); + if (!new_table) + return -ENOMEM; + + rcu_assign_pointer(port_table, new_table); + + } else if (tbl_count(port_table) > tbl_n_buckets(port_table)) { + struct tbl *old_table = port_table; + struct tbl *new_table; + + new_table = tbl_expand(old_table); + if (IS_ERR(new_table)) + return PTR_ERR(new_table); + + rcu_assign_pointer(port_table, new_table); + tbl_deferred_destroy(old_table, NULL); + } + + lookup.vals[LOOKUP_SADDR] = tnl_vport->mutable->port_config.saddr; + lookup.vals[LOOKUP_DADDR] = tnl_vport->mutable->port_config.daddr; + lookup.vals[LOOKUP_KEY] = tnl_vport->mutable->port_config.in_key; + lookup.vals[LOOKUP_TUNNEL_TYPE] = tnl_vport->mutable->tunnel_type; + + err = tbl_insert(port_table, &tnl_vport->tbl_node, port_hash(&lookup)); + if (err) + return err; + + (*find_port_pool(tnl_vport->mutable))++; + + return 0; +} + +static int del_port(struct vport *vport) +{ + struct tnl_vport *tnl_vport = tnl_vport_priv(vport); + int err; + + err = tbl_remove(port_table, &tnl_vport->tbl_node); + if (err) + return err; + + (*find_port_pool(tnl_vport->mutable))--; + + return 0; +} + +struct vport 
*tnl_find_port(__be32 saddr, __be32 daddr, __be32 key, + int tunnel_type, + const struct tnl_mutable_config **mutable) +{ + struct port_lookup_key lookup; + struct tbl *table = rcu_dereference(port_table); + struct tbl_node *tbl_node; + + if (!table) + return NULL; + + lookup.vals[LOOKUP_SADDR] = saddr; + lookup.vals[LOOKUP_DADDR] = daddr; + + if (tunnel_type & TNL_T_KEY_EXACT) { + lookup.vals[LOOKUP_KEY] = key; + lookup.vals[LOOKUP_TUNNEL_TYPE] = tunnel_type & ~TNL_T_KEY_MATCH; + + if (key_local_remote_ports) { + tbl_node = tbl_lookup(table, &lookup, port_hash(&lookup), port_cmp); + if (tbl_node) + goto found; + } + + if (key_remote_ports) { + lookup.vals[LOOKUP_SADDR] = 0; + + tbl_node = tbl_lookup(table, &lookup, port_hash(&lookup), port_cmp); + if (tbl_node) + goto found; + + lookup.vals[LOOKUP_SADDR] = saddr; + } + } + + if (tunnel_type & TNL_T_KEY_MATCH) { + lookup.vals[LOOKUP_KEY] = 0; + lookup.vals[LOOKUP_TUNNEL_TYPE] = tunnel_type & ~TNL_T_KEY_EXACT; + + if (local_remote_ports) { + tbl_node = tbl_lookup(table, &lookup, port_hash(&lookup), port_cmp); + if (tbl_node) + goto found; + } + + if (remote_ports) { + lookup.vals[LOOKUP_SADDR] = 0; + + tbl_node = tbl_lookup(table, &lookup, port_hash(&lookup), port_cmp); + if (tbl_node) + goto found; + } + } + + return NULL; + +found: + *mutable = lookup.mutable; + return tnl_vport_to_vport(tnl_vport_table_cast(tbl_node)); +} + +static bool check_ipv4_address(__be32 addr) +{ + if (ipv4_is_multicast(addr) || ipv4_is_lbcast(addr) + || ipv4_is_loopback(addr) || ipv4_is_zeronet(addr)) + return false; + + return true; +} + +static bool ipv4_should_icmp(struct sk_buff *skb) +{ + struct iphdr *old_iph = ip_hdr(skb); + + /* Don't respond to L2 broadcast. */ + if (is_multicast_ether_addr(eth_hdr(skb)->h_dest)) + return false; + + /* Don't respond to L3 broadcast or invalid addresses. 
*/ + if (!check_ipv4_address(old_iph->daddr) || + !check_ipv4_address(old_iph->saddr)) + return false; + + /* Only respond to the first fragment. */ + if (old_iph->frag_off & htons(IP_OFFSET)) + return false; + + /* Don't respond to ICMP error messages. */ + if (old_iph->protocol == IPPROTO_ICMP) { + u8 icmp_type, *icmp_typep; + + icmp_typep = skb_header_pointer(skb, (u8 *)old_iph + + (old_iph->ihl << 2) + + offsetof(struct icmphdr, type) - + skb->data, sizeof(icmp_type), + &icmp_type); + + if (!icmp_typep) + return false; + + if (*icmp_typep > NR_ICMP_TYPES + || (*icmp_typep <= ICMP_PARAMETERPROB + && *icmp_typep != ICMP_ECHOREPLY + && *icmp_typep != ICMP_ECHO)) + return false; + } + + return true; +} + +static void ipv4_build_icmp(struct sk_buff *skb, struct sk_buff *nskb, + unsigned int mtu, unsigned int payload_length) +{ + struct iphdr *iph, *old_iph = ip_hdr(skb); + struct icmphdr *icmph; + u8 *payload; + + iph = (struct iphdr *)skb_put(nskb, sizeof(struct iphdr)); + icmph = (struct icmphdr *)skb_put(nskb, sizeof(struct icmphdr)); + payload = skb_put(nskb, payload_length); + + /* IP */ + iph->version = 4; + iph->ihl = sizeof(struct iphdr) >> 2; + iph->tos = (old_iph->tos & IPTOS_TOS_MASK) | + IPTOS_PREC_INTERNETCONTROL; + iph->tot_len = htons(sizeof(struct iphdr) + + sizeof(struct icmphdr) + + payload_length); + get_random_bytes(&iph->id, sizeof(iph->id)); + iph->frag_off = 0; + iph->ttl = IPDEFTTL; + iph->protocol = IPPROTO_ICMP; + iph->daddr = old_iph->saddr; + iph->saddr = old_iph->daddr; + + ip_send_check(iph); + + /* ICMP */ + icmph->type = ICMP_DEST_UNREACH; + icmph->code = ICMP_FRAG_NEEDED; + icmph->un.gateway = htonl(mtu); + icmph->checksum = 0; + + nskb->csum = csum_partial((u8 *)icmph, sizeof(struct icmphdr), 0); + nskb->csum = skb_copy_and_csum_bits(skb, (u8 *)old_iph - skb->data, + payload, payload_length, + nskb->csum); + icmph->checksum = csum_fold(nskb->csum); +} + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static bool 
ipv6_should_icmp(struct sk_buff *skb) +{ + struct ipv6hdr *old_ipv6h = ipv6_hdr(skb); + int addr_type; + int payload_off = (u8 *)(old_ipv6h + 1) - skb->data; + u8 nexthdr = ipv6_hdr(skb)->nexthdr; + + /* Check source address is valid. */ + addr_type = ipv6_addr_type(&old_ipv6h->saddr); + if (addr_type & IPV6_ADDR_MULTICAST || addr_type == IPV6_ADDR_ANY) + return false; + + /* Don't reply to unspecified addresses. */ + if (ipv6_addr_type(&old_ipv6h->daddr) == IPV6_ADDR_ANY) + return false; + + /* Don't respond to ICMP error messages. */ + payload_off = ipv6_skip_exthdr(skb, payload_off, &nexthdr); + if (payload_off < 0) + return false; + + if (nexthdr == NEXTHDR_ICMP) { + u8 icmp_type, *icmp_typep; + + icmp_typep = skb_header_pointer(skb, payload_off + + offsetof(struct icmp6hdr, + icmp6_type), + sizeof(icmp_type), &icmp_type); + + if (!icmp_typep || !(*icmp_typep & ICMPV6_INFOMSG_MASK)) + return false; + } + + return true; +} + +static void ipv6_build_icmp(struct sk_buff *skb, struct sk_buff *nskb, + unsigned int mtu, unsigned int payload_length) +{ + struct ipv6hdr *ipv6h, *old_ipv6h = ipv6_hdr(skb); + struct icmp6hdr *icmp6h; + u8 *payload; + + ipv6h = (struct ipv6hdr *)skb_put(nskb, sizeof(struct ipv6hdr)); + icmp6h = (struct icmp6hdr *)skb_put(nskb, sizeof(struct icmp6hdr)); + payload = skb_put(nskb, payload_length); + + /* IPv6 */ + ipv6h->version = 6; + ipv6h->priority = 0; + memset(&ipv6h->flow_lbl, 0, sizeof(ipv6h->flow_lbl)); + ipv6h->payload_len = htons(sizeof(struct icmp6hdr) + + payload_length); + ipv6h->nexthdr = NEXTHDR_ICMP; + ipv6h->hop_limit = IPV6_DEFAULT_HOPLIMIT; + ipv6_addr_copy(&ipv6h->daddr, &old_ipv6h->saddr); + ipv6_addr_copy(&ipv6h->saddr, &old_ipv6h->daddr); + + /* ICMPv6 */ + icmp6h->icmp6_type = ICMPV6_PKT_TOOBIG; + icmp6h->icmp6_code = 0; + icmp6h->icmp6_cksum = 0; + icmp6h->icmp6_mtu = htonl(mtu); + + nskb->csum = csum_partial((u8 *)icmp6h, sizeof(struct icmp6hdr), 0); + nskb->csum = skb_copy_and_csum_bits(skb, (u8 *)old_ipv6h - 
skb->data, + payload, payload_length, + nskb->csum); + icmp6h->icmp6_cksum = csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, + sizeof(struct icmp6hdr) + + payload_length, + ipv6h->nexthdr, nskb->csum); +} +#endif /* IPv6 */ + +bool tnl_frag_needed(struct vport *vport, const struct tnl_mutable_config *mutable, + struct sk_buff *skb, unsigned int mtu, __be32 flow_key) +{ + unsigned int eth_hdr_len = ETH_HLEN; + unsigned int total_length = 0, header_length = 0, payload_length; + struct ethhdr *eh, *old_eh = eth_hdr(skb); + struct sk_buff *nskb; + + /* Sanity check */ + if (skb->protocol == htons(ETH_P_IP)) { + if (mtu < IP_MIN_MTU) + return false; + + if (!ipv4_should_icmp(skb)) + return true; + } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + else if (skb->protocol == htons(ETH_P_IPV6)) { + if (mtu < IPV6_MIN_MTU) + return false; + + /* + * In theory we should do PMTUD on IPv6 multicast messages but + * we don't have an address to send from so just fragment. + */ + if (ipv6_addr_type(&ipv6_hdr(skb)->daddr) & IPV6_ADDR_MULTICAST) + return false; + + if (!ipv6_should_icmp(skb)) + return true; + } +#endif + else + return false; + + /* Allocate */ + if (old_eh->h_proto == htons(ETH_P_8021Q)) + eth_hdr_len = VLAN_ETH_HLEN; + + payload_length = skb->len - eth_hdr_len; + if (skb->protocol == htons(ETH_P_IP)) { + header_length = sizeof(struct iphdr) + sizeof(struct icmphdr); + total_length = min_t(unsigned int, header_length + + payload_length, 576); + } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + else { + header_length = sizeof(struct ipv6hdr) + + sizeof(struct icmp6hdr); + total_length = min_t(unsigned int, header_length + + payload_length, IPV6_MIN_MTU); + } +#endif + + total_length = min(total_length, mutable->mtu); + payload_length = total_length - header_length; + + nskb = dev_alloc_skb(NET_IP_ALIGN + eth_hdr_len + header_length + + payload_length); + if (!nskb) + return false; + + skb_reserve(nskb, NET_IP_ALIGN); + + /* Ethernet / VLAN */ + eh 
= (struct ethhdr *)skb_put(nskb, eth_hdr_len); + memcpy(eh->h_dest, old_eh->h_source, ETH_ALEN); + memcpy(eh->h_source, mutable->eth_addr, ETH_ALEN); + nskb->protocol = eh->h_proto = old_eh->h_proto; + if (old_eh->h_proto == htons(ETH_P_8021Q)) { + struct vlan_ethhdr *vh = (struct vlan_ethhdr *)eh; + + vh->h_vlan_TCI = vlan_eth_hdr(skb)->h_vlan_TCI; + vh->h_vlan_encapsulated_proto = skb->protocol; + } + skb_reset_mac_header(nskb); + + /* Protocol */ + if (skb->protocol == htons(ETH_P_IP)) + ipv4_build_icmp(skb, nskb, mtu, payload_length); +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + else + ipv6_build_icmp(skb, nskb, mtu, payload_length); +#endif + + /* + * Assume that flow based keys are symmetric with respect to input + * and output and use the key that we were going to put on the + * outgoing packet for the fake received packet. If the keys are + * not symmetric then PMTUD needs to be disabled since we won't have + * any way of synthesizing packets. + */ + if ((mutable->port_config.flags & (TNL_F_IN_KEY_MATCH | TNL_F_OUT_KEY_ACTION)) == + (TNL_F_IN_KEY_MATCH | TNL_F_OUT_KEY_ACTION)) + OVS_CB(nskb)->tun_id = flow_key; + + compute_ip_summed(nskb, false); + vport_receive(vport, nskb); + + return true; +} + +static struct sk_buff *check_headroom(struct sk_buff *skb, int headroom) +{ + if (skb_headroom(skb) < headroom || skb_header_cloned(skb)) { + struct sk_buff *nskb = skb_realloc_headroom(skb, headroom + 16); + if (unlikely(!nskb)) { + kfree_skb(skb); + return ERR_PTR(-ENOMEM); + } + + set_skb_csum_bits(skb, nskb); + + if (skb->sk) + skb_set_owner_w(nskb, skb->sk); + + dev_kfree_skb(skb); + return nskb; + } + + return skb; +} + +static inline u8 ecn_encapsulate(u8 tos, struct sk_buff *skb) +{ + u8 inner; + + if (skb->protocol == htons(ETH_P_IP)) + inner = ((struct iphdr *)skb_network_header(skb))->tos; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + else if (skb->protocol == htons(ETH_P_IPV6)) + inner = ipv6_get_dsfield((struct ipv6hdr 
*)skb_network_header(skb)); +#endif + else + inner = 0; + + return INET_ECN_encapsulate(tos, inner); +} + +static inline void ecn_decapsulate(struct sk_buff *skb) +{ + u8 tos = ip_hdr(skb)->tos; + + if (INET_ECN_is_ce(tos)) { + __be16 protocol = skb->protocol; + unsigned int nw_header = skb_network_header(skb) - skb->data; + + if (skb->protocol == htons(ETH_P_8021Q)) { + if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN))) + return; + + protocol = vlan_eth_hdr(skb)->h_vlan_encapsulated_proto; + nw_header += VLAN_HLEN; + } + + if (protocol == htons(ETH_P_IP)) { + if (unlikely(!pskb_may_pull(skb, nw_header + + sizeof(struct iphdr)))) + return; + + IP_ECN_set_ce((struct iphdr *)(nw_header + skb->data)); + } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + else if (protocol == htons(ETH_P_IPV6)) { + if (unlikely(!pskb_may_pull(skb, nw_header + + sizeof(struct ipv6hdr)))) + return; + + IP6_ECN_set_ce((struct ipv6hdr *)(nw_header + + skb->data)); + } +#endif + } +} + +static struct sk_buff *handle_gso(struct sk_buff *skb) +{ + if (skb_is_gso(skb)) { + struct sk_buff *nskb = skb_gso_segment(skb, 0); + + dev_kfree_skb(skb); + return nskb; + } + + return skb; +} + +static int handle_csum_offload(struct sk_buff *skb) +{ + if (skb->ip_summed == CHECKSUM_PARTIAL) + return skb_checksum_help(skb); + else { + skb->ip_summed = CHECKSUM_NONE; + return 0; + } +} + +/* Called with rcu_read_lock. 
*/ +void tnl_rcv(struct vport *vport, struct sk_buff *skb) +{ + skb->pkt_type = PACKET_HOST; + skb->protocol = eth_type_trans(skb, skb->dev); + + skb_dst_drop(skb); + nf_reset(skb); + secpath_reset(skb); + skb_reset_network_header(skb); + + ecn_decapsulate(skb); + + skb_push(skb, ETH_HLEN); + compute_ip_summed(skb, false); + + vport_receive(vport, skb); +} + +static int build_packet(struct vport *vport, const struct tnl_mutable_config *mutable, + struct iphdr *iph, struct rtable *rt, int max_headroom, + int mtu, struct sk_buff *skb) +{ + struct tnl_vport *tnl_vport = tnl_vport_priv(vport); + int err; + struct iphdr *new_iph; + int orig_len = skb->len; + __be16 frag_off = iph->frag_off; + + skb = check_headroom(skb, max_headroom); + if (unlikely(IS_ERR(skb))) + goto error; + + err = handle_csum_offload(skb); + if (unlikely(err)) + goto error_free; + + if (skb->protocol == htons(ETH_P_IP)) { + struct iphdr *old_iph = ip_hdr(skb); + + if ((old_iph->frag_off & htons(IP_DF)) && + mtu < ntohs(old_iph->tot_len)) { + if (tnl_frag_needed(vport, mutable, skb, mtu, OVS_CB(skb)->tun_id)) + goto error_free; + } + + } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + else if (skb->protocol == htons(ETH_P_IPV6)) { + unsigned int packet_length = skb->len - ETH_HLEN + - (eth_hdr(skb)->h_proto == htons(ETH_P_8021Q) ? VLAN_HLEN : 0); + + /* IPv6 requires PMTUD if the packet is above the minimum MTU. 
*/ + if (packet_length > IPV6_MIN_MTU) + frag_off = htons(IP_DF); + + if (mtu < packet_length) { + if (tnl_frag_needed(vport, mutable, skb, mtu, OVS_CB(skb)->tun_id)) + goto error_free; + } + } +#endif + + new_iph = (struct iphdr *)skb_push(skb, mutable->tunnel_hlen); + skb_reset_network_header(skb); + skb_set_transport_header(skb, sizeof(struct iphdr)); + + memcpy(new_iph, iph, sizeof(struct iphdr)); + new_iph->frag_off = frag_off; + ip_select_ident(new_iph, &rt_dst(rt), NULL); + + tnl_vport->tnl_ops->build_header(skb, vport, mutable); + + /* Allow our local IP stack to fragment the outer packet even if the + * DF bit is set as a last resort. */ + skb->local_df = 1; + + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + IPCB(skb)->flags = 0; + + err = ip_local_out(skb); + if (likely(net_xmit_eval(err) == 0)) + return orig_len; + else { + vport_record_error(vport, VPORT_E_TX_ERROR); + return 0; + } + +error_free: + kfree_skb(skb); +error: + vport_record_error(vport, VPORT_E_TX_DROPPED); + + return 0; +} + +int tnl_send(struct vport *vport, struct sk_buff *skb) +{ + struct tnl_vport *tnl_vport = tnl_vport_priv(vport); + const struct tnl_mutable_config *mutable = rcu_dereference(tnl_vport->mutable); + + struct iphdr *old_iph; + int orig_len; + struct iphdr iph; + struct rtable *rt; + int max_headroom; + int mtu; + + /* Validate the protocol headers before we try to use them. 
*/ + if (skb->protocol == htons(ETH_P_8021Q)) { + if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN))) + goto error_free; + + skb->protocol = vlan_eth_hdr(skb)->h_vlan_encapsulated_proto; + skb_set_network_header(skb, VLAN_ETH_HLEN); + } + + if (skb->protocol == htons(ETH_P_IP)) { + if (unlikely(!pskb_may_pull(skb, skb_network_header(skb) + + sizeof(struct iphdr) - skb->data))) + skb->protocol = 0; + } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + else if (skb->protocol == htons(ETH_P_IPV6)) { + if (unlikely(!pskb_may_pull(skb, skb_network_header(skb) + + sizeof(struct ipv6hdr) - skb->data))) + skb->protocol = 0; + } +#endif + old_iph = ip_hdr(skb); + + iph.tos = mutable->port_config.tos; + if (mutable->port_config.flags & TNL_F_TOS_INHERIT) { + if (skb->protocol == htons(ETH_P_IP)) + iph.tos = old_iph->tos; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + else if (skb->protocol == htons(ETH_P_IPV6)) + iph.tos = ipv6_get_dsfield(ipv6_hdr(skb)); +#endif + } + iph.tos = ecn_encapsulate(iph.tos, skb); + + { + struct flowi fl = { .nl_u = { .ip4_u = + { .daddr = mutable->port_config.daddr, + .saddr = mutable->port_config.saddr, + .tos = RT_TOS(iph.tos) } }, + .proto = tnl_vport->tnl_ops->ipproto }; + + if (unlikely(ip_route_output_key(&init_net, &rt, &fl))) + goto error_free; + } + + iph.ttl = mutable->port_config.ttl; + if (mutable->port_config.flags & TNL_F_TTL_INHERIT) { + if (skb->protocol == htons(ETH_P_IP)) + iph.ttl = old_iph->ttl; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + else if (skb->protocol == htons(ETH_P_IPV6)) + iph.ttl = ipv6_hdr(skb)->hop_limit; +#endif + } + if (!iph.ttl) + iph.ttl = dst_metric(&rt_dst(rt), RTAX_HOPLIMIT); + + iph.frag_off = (mutable->port_config.flags & TNL_F_PMTUD) ? htons(IP_DF) : 0; + if (iph.frag_off) + mtu = dst_mtu(&rt_dst(rt)) + - ETH_HLEN + - mutable->tunnel_hlen + - (eth_hdr(skb)->h_proto == htons(ETH_P_8021Q) ? 
VLAN_HLEN : 0); + else + mtu = mutable->mtu; + + if (skb->protocol == htons(ETH_P_IP)) { + iph.frag_off |= old_iph->frag_off & htons(IP_DF); + mtu = max(mtu, IP_MIN_MTU); + } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + else if (skb->protocol == htons(ETH_P_IPV6)) + mtu = max(mtu, IPV6_MIN_MTU); +#endif + + iph.version = 4; + iph.ihl = sizeof(struct iphdr) >> 2; + iph.protocol = tnl_vport->tnl_ops->ipproto; + iph.daddr = rt->rt_dst; + iph.saddr = rt->rt_src; + + nf_reset(skb); + secpath_reset(skb); + skb_dst_drop(skb); + skb_dst_set(skb, &rt_dst(rt)); + + /* + * If we are doing GSO on a pskb it is better to make sure that the + * headroom is correct now. We will only have to copy the portion in + * the linear data area and GSO will preserve headroom when it creates + * the segments. This is particularly beneficial on Xen where we get + * lots of GSO pskbs. Conversely, we delay copying if it is just to + * get our own writable clone because GSO may do the copy for us. + */ + max_headroom = LL_RESERVED_SPACE(rt_dst(rt).dev) + rt_dst(rt).header_len + + mutable->tunnel_hlen; + + if (skb_headroom(skb) < max_headroom) { + skb = check_headroom(skb, max_headroom); + if (unlikely(IS_ERR(skb))) { + vport_record_error(vport, VPORT_E_TX_DROPPED); + goto error; + } + } + + forward_ip_summed(skb); + + if (unlikely(vswitch_skb_checksum_setup(skb))) + goto error_free; + + skb = handle_gso(skb); + if (unlikely(IS_ERR(skb))) { + vport_record_error(vport, VPORT_E_TX_DROPPED); + goto error; + } + + /* + * Process GSO segments. Try to do any work for the entire packet that + * doesn't involve actually writing to it before this point. 
+ */ + orig_len = 0; + do { + struct sk_buff *next_skb = skb->next; + skb->next = NULL; + + orig_len += build_packet(vport, mutable, &iph, rt, max_headroom, mtu, skb); + + skb = next_skb; + } while (skb); + + return orig_len; + +error_free: + kfree_skb(skb); + vport_record_error(vport, VPORT_E_TX_ERROR); +error: + return 0; +} + +int tnl_init(void) +{ + return 0; +} + +void tnl_exit(void) +{ + tbl_destroy(port_table, NULL); + port_table = NULL; +} + +static int set_config(const void __user *uconfig, const struct tnl_ops *tnl_ops, + const struct vport *cur_vport, + struct tnl_mutable_config *mutable) +{ + const struct vport *old_vport; + const struct tnl_mutable_config *old_mutable; + + if (copy_from_user(&mutable->port_config, uconfig, sizeof(struct tnl_port_config))) + return -EFAULT; + + mutable->tunnel_hlen = tnl_ops->hdr_len(&mutable->port_config); + if (mutable->tunnel_hlen < 0) + return mutable->tunnel_hlen; + + mutable->tunnel_hlen += sizeof(struct iphdr); + + if (mutable->port_config.daddr == 0) + return -EINVAL; + + mutable->tunnel_type = tnl_ops->tunnel_type; + if (mutable->port_config.flags & TNL_F_IN_KEY_MATCH) { + mutable->tunnel_type |= TNL_T_KEY_MATCH; + mutable->port_config.in_key = 0; + } else + mutable->tunnel_type |= TNL_T_KEY_EXACT; + + old_vport = tnl_find_port(mutable->port_config.saddr, + mutable->port_config.daddr, + mutable->port_config.in_key, + mutable->tunnel_type, + &old_mutable); + + if (old_vport && old_vport != cur_vport) + return -EEXIST; + + if (mutable->port_config.flags & TNL_F_OUT_KEY_ACTION) + mutable->port_config.out_key = 0; + + return 0; +} + +struct vport *tnl_create(const char *name, const void __user *config, + const struct vport_ops *vport_ops, + const struct tnl_ops *tnl_ops) +{ + struct vport *vport; + struct tnl_vport *tnl_vport; + int err; + + vport = vport_alloc(sizeof(struct tnl_vport), vport_ops); + if (IS_ERR(vport)) { + err = PTR_ERR(vport); + goto error; + } + + tnl_vport = tnl_vport_priv(vport); + + 
strcpy(tnl_vport->name, name); + tnl_vport->tnl_ops = tnl_ops; + + tnl_vport->mutable = kmalloc(sizeof(struct tnl_mutable_config), GFP_KERNEL); + if (!tnl_vport->mutable) { + err = -ENOMEM; + goto error_free_vport; + } + + vport_gen_rand_ether_addr(tnl_vport->mutable->eth_addr); + tnl_vport->mutable->mtu = ETH_DATA_LEN; + + err = set_config(config, tnl_ops, NULL, tnl_vport->mutable); + if (err) + goto error_free_mutable; + + err = add_port(vport); + if (err) + goto error_free_mutable; + + return vport; + +error_free_mutable: + kfree(tnl_vport->mutable); +error_free_vport: + vport_free(vport); +error: + return ERR_PTR(err); +} + +int tnl_modify(struct vport *vport, const void __user *config) +{ + struct tnl_vport *tnl_vport = tnl_vport_priv(vport); + struct tnl_mutable_config *mutable; + int err; + bool update_hash = false; + + mutable = kmemdup(tnl_vport->mutable, sizeof(struct tnl_mutable_config), GFP_KERNEL); + if (!mutable) { + err = -ENOMEM; + goto error; + } + + err = set_config(config, tnl_vport->tnl_ops, vport, mutable); + if (err) + goto error_free; + + /* + * Only remove the port from the hash table if something that would + * affect the lookup has changed. + */ + if (tnl_vport->mutable->port_config.saddr != mutable->port_config.saddr || + tnl_vport->mutable->port_config.daddr != mutable->port_config.daddr || + tnl_vport->mutable->port_config.in_key != mutable->port_config.in_key || + (tnl_vport->mutable->port_config.flags & TNL_F_IN_KEY_MATCH) != + (mutable->port_config.flags & TNL_F_IN_KEY_MATCH)) + update_hash = true; + + + /* + * This update is not atomic but the lookup uses the config, which + * serves as an inherent double check. 
+ */ + if (update_hash) { + err = del_port(vport); + if (err) + goto error_free; + } + + assign_config_rcu(vport, mutable); + + if (update_hash) { + err = add_port(vport); + if (err) + return err; /* 'mutable' is published; must not kfree. */ + } + + return 0; + +error_free: + kfree(mutable); +error: + return err; +} + +static void free_port(struct rcu_head *rcu) +{ + struct tnl_vport *tnl_vport = container_of(rcu, struct tnl_vport, rcu); + + kfree(tnl_vport->mutable); + vport_free(tnl_vport_to_vport(tnl_vport)); +} + +int tnl_destroy(struct vport *vport) +{ + struct tnl_vport *tnl_vport = tnl_vport_priv(vport); + const struct tnl_mutable_config *old_mutable; + + if (vport == tnl_find_port(tnl_vport->mutable->port_config.saddr, + tnl_vport->mutable->port_config.daddr, + tnl_vport->mutable->port_config.in_key, + tnl_vport->mutable->tunnel_type, + &old_mutable)) + del_port(vport); + + call_rcu(&tnl_vport->rcu, free_port); + + return 0; +} + +int tnl_set_mtu(struct vport *vport, int mtu) +{ + struct tnl_vport *tnl_vport = tnl_vport_priv(vport); + struct tnl_mutable_config *mutable; + + mutable = kmemdup(tnl_vport->mutable, sizeof(struct tnl_mutable_config), GFP_KERNEL); + if (!mutable) + return -ENOMEM; + + mutable->mtu = mtu; + assign_config_rcu(vport, mutable); + + return 0; +} + +int tnl_set_addr(struct vport *vport, const unsigned char *addr) +{ + struct tnl_vport *tnl_vport = tnl_vport_priv(vport); + struct tnl_mutable_config *mutable; + + mutable = kmemdup(tnl_vport->mutable, sizeof(struct tnl_mutable_config), GFP_KERNEL); + if (!mutable) + return -ENOMEM; + + memcpy(mutable->eth_addr, addr, ETH_ALEN); + assign_config_rcu(vport, mutable); + + return 0; +} + + +const char *tnl_get_name(const struct vport *vport) +{ + const struct tnl_vport *tnl_vport = tnl_vport_priv(vport); + return tnl_vport->name; +} + +const unsigned char *tnl_get_addr(const struct vport *vport) +{ + const struct tnl_vport *tnl_vport = tnl_vport_priv(vport); + return rcu_dereference(tnl_vport->mutable)->eth_addr; +} + +int 
tnl_get_mtu(const struct vport *vport) +{ + const struct tnl_vport *tnl_vport = tnl_vport_priv(vport); + return rcu_dereference(tnl_vport->mutable)->mtu; +} diff --git a/datapath/tunnel.h b/datapath/tunnel.h new file mode 100644 index 00000000..89e73bac --- /dev/null +++ b/datapath/tunnel.h @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2010 Nicira Networks. + * Distributed under the terms of the GNU GPL version 2. + * + * Significant portions of this file may be copied from parts of the Linux + * kernel, by Linus Torvalds and others. + */ + +#ifndef TUNNEL_H +#define TUNNEL_H 1 + +#include "openvswitch/tunnel.h" +#include "table.h" +#include "vport.h" + +/* The absolute minimum fragment size. Note that there are many other + * definitions of the minimum MTU. */ +#define IP_MIN_MTU 68 + +/* + * One of these goes in your struct tnl_ops and in tnl_find_port(). + * These values are in the same namespace as other TNL_T_* values, so + * you have only the first 10 bits to define protocol identifiers. + */ +#define TNL_T_PROTO_GRE 0 + +/* You only need these flags when you are calling tnl_find_port(). */ +#define TNL_T_KEY_EXACT (1 << 10) +#define TNL_T_KEY_MATCH (1 << 11) +#define TNL_T_KEY_EITHER (TNL_T_KEY_EXACT | TNL_T_KEY_MATCH) + +struct tnl_mutable_config { + struct rcu_head rcu; + + unsigned char eth_addr[ETH_ALEN]; + unsigned int mtu; + struct tnl_port_config port_config; + + /* Set of TNL_T_* flags that define the category for lookup. */ + u32 tunnel_type; + + int tunnel_hlen; /* Tunnel header length. */ +}; + +struct tnl_ops { + /* Put your TNL_T_PROTO_* type in here. */ + u32 tunnel_type; + u8 ipproto; + + int (*hdr_len)(const struct tnl_port_config *); + void (*build_header)(struct sk_buff *, const struct vport *, + const struct tnl_mutable_config *); +}; + +struct tnl_vport { + struct rcu_head rcu; + struct tbl_node tbl_node; + + char name[IFNAMSIZ]; + const struct tnl_ops *tnl_ops; + + /* Protected by RCU. 
*/ + struct tnl_mutable_config *mutable; +}; + +int tnl_init(void); +void tnl_exit(void); +struct vport *tnl_create(const char *name, const void __user *config, + const struct vport_ops *, + const struct tnl_ops *); +int tnl_modify(struct vport *, const void __user *config); +int tnl_destroy(struct vport *); +int tnl_set_mtu(struct vport *vport, int mtu); +int tnl_set_addr(struct vport *vport, const unsigned char *addr); +const char *tnl_get_name(const struct vport *vport); +const unsigned char *tnl_get_addr(const struct vport *vport); +int tnl_get_mtu(const struct vport *vport); +int tnl_send(struct vport *vport, struct sk_buff *skb); +void tnl_rcv(struct vport *vport, struct sk_buff *skb); + +struct vport *tnl_find_port(__be32 saddr, __be32 daddr, __be32 key, + int tunnel_type, + const struct tnl_mutable_config **mutable); +bool tnl_frag_needed(struct vport *vport, + const struct tnl_mutable_config *mutable, + struct sk_buff *skb, unsigned int mtu, __be32 flow_key); + +static inline struct tnl_vport *tnl_vport_priv(const struct vport *vport) +{ + return vport_priv(vport); +} + +#endif /* tunnel.h */ diff --git a/datapath/vport-gre.c b/datapath/vport-gre.c index 2ab08f11..31d2d4f6 100644 --- a/datapath/vport-gre.c +++ b/datapath/vport-gre.c @@ -6,580 +6,54 @@ * kernel, by Linus Torvalds and others. */ -#include -#include +#include +#include #include #include #include #include -#include -#include -#include -#include -#include -#include #include -#include #include -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -#include -#endif #include -#include -#include -#include "actions.h" -#include "datapath.h" -#include "openvswitch/gre.h" -#include "table.h" +#include "tunnel.h" #include "vport.h" #include "vport-generic.h" -/* The absolute minimum fragment size. Note that there are many other - * definitions of the minimum MTU. */ -#define IP_MIN_MTU 68 - -/* The GRE header is composed of a series of sections: a base and then a variable - * number of options. 
*/ +/* + * The GRE header is composed of a series of sections: a base and then a variable + * number of options. + */ #define GRE_HEADER_SECTION 4 -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36) -#define rt_dst(rt) (rt->dst) -#else -#define rt_dst(rt) (rt->u.dst) -#endif - struct gre_base_hdr { __be16 flags; __be16 protocol; }; -struct mutable_config { - struct rcu_head rcu; - - unsigned char eth_addr[ETH_ALEN]; - unsigned int mtu; - struct gre_port_config port_config; - - int tunnel_hlen; /* Tunnel header length. */ -}; - -struct gre_vport { - struct rcu_head rcu; - struct tbl_node tbl_node; - - char name[IFNAMSIZ]; - - /* Protected by RCU. */ - struct mutable_config *mutable; -}; - -/* Protected by RCU. */ -static struct tbl *port_table; - -/* These are just used as an optimization: they don't require any kind of - * synchronization because we could have just as easily read the value before - * the port change happened. */ -static unsigned int key_local_remote_ports; -static unsigned int key_remote_ports; -static unsigned int local_remote_ports; -static unsigned int remote_ports; - -static inline struct gre_vport *gre_vport_priv(const struct vport *vport) -{ - return vport_priv(vport); -} - -static inline struct vport *gre_vport_to_vport(const struct gre_vport *gre_vport) -{ - return vport_from_priv(gre_vport); -} - -static inline struct gre_vport *gre_vport_table_cast(const struct tbl_node *node) -{ - return container_of(node, struct gre_vport, tbl_node); -} - -/* RCU callback. 
*/ -static void free_config(struct rcu_head *rcu) -{ - struct mutable_config *c = container_of(rcu, struct mutable_config, rcu); - kfree(c); -} - -static void assign_config_rcu(struct vport *vport, - struct mutable_config *new_config) +static int gre_hdr_len(const struct tnl_port_config *port_config) { - struct gre_vport *gre_vport = gre_vport_priv(vport); - struct mutable_config *old_config; - - old_config = rcu_dereference(gre_vport->mutable); - rcu_assign_pointer(gre_vport->mutable, new_config); - call_rcu(&old_config->rcu, free_config); -} - -static unsigned int *find_port_pool(const struct mutable_config *mutable) -{ - if (mutable->port_config.flags & GRE_F_IN_KEY_MATCH) { - if (mutable->port_config.saddr) - return &local_remote_ports; - else - return &remote_ports; - } else { - if (mutable->port_config.saddr) - return &key_local_remote_ports; - else - return &key_remote_ports; - } -} + int len; -enum lookup_key { - LOOKUP_SADDR = 0, - LOOKUP_DADDR = 1, - LOOKUP_KEY = 2, - LOOKUP_KEY_MATCH = 3 -}; + len = GRE_HEADER_SECTION; -struct port_lookup_key { - u32 vals[4]; /* Contains enum lookup_key keys. */ - const struct mutable_config *mutable; -}; - -/* Modifies 'target' to store the rcu_dereferenced pointer that was used to do - * the comparision. 
*/ -static int port_cmp(const struct tbl_node *node, void *target) -{ - const struct gre_vport *gre_vport = gre_vport_table_cast(node); - struct port_lookup_key *lookup = target; + if (port_config->flags & TNL_F_CSUM) + len += GRE_HEADER_SECTION; - lookup->mutable = rcu_dereference(gre_vport->mutable); + if (port_config->out_key || + port_config->flags & TNL_F_OUT_KEY_ACTION) + len += GRE_HEADER_SECTION; - return ((lookup->mutable->port_config.flags & GRE_F_IN_KEY_MATCH) == - lookup->vals[LOOKUP_KEY_MATCH]) && - lookup->mutable->port_config.daddr == lookup->vals[LOOKUP_DADDR] && - lookup->mutable->port_config.in_key == lookup->vals[LOOKUP_KEY] && - lookup->mutable->port_config.saddr == lookup->vals[LOOKUP_SADDR]; + return len; } -static u32 port_hash(struct port_lookup_key *lookup) +static void gre_build_header(struct sk_buff *skb, + const struct vport *vport, + const struct tnl_mutable_config *mutable) { - return jhash2(lookup->vals, ARRAY_SIZE(lookup->vals), 0); -} - -static int add_port(struct vport *vport) -{ - struct gre_vport *gre_vport = gre_vport_priv(vport); - struct port_lookup_key lookup; - int err; - - if (!port_table) { - struct tbl *new_table; - - new_table = tbl_create(0); - if (!new_table) - return -ENOMEM; - - rcu_assign_pointer(port_table, new_table); - - } else if (tbl_count(port_table) > tbl_n_buckets(port_table)) { - struct tbl *old_table = port_table; - struct tbl *new_table; - - new_table = tbl_expand(old_table); - if (IS_ERR(new_table)) - return PTR_ERR(new_table); - - rcu_assign_pointer(port_table, new_table); - tbl_deferred_destroy(old_table, NULL); - } - - lookup.vals[LOOKUP_SADDR] = gre_vport->mutable->port_config.saddr; - lookup.vals[LOOKUP_DADDR] = gre_vport->mutable->port_config.daddr; - lookup.vals[LOOKUP_KEY] = gre_vport->mutable->port_config.in_key; - lookup.vals[LOOKUP_KEY_MATCH] = gre_vport->mutable->port_config.flags & GRE_F_IN_KEY_MATCH; - - err = tbl_insert(port_table, &gre_vport->tbl_node, port_hash(&lookup)); - if (err) - 
return err; - - (*find_port_pool(gre_vport->mutable))++; - - return 0; -} - -static int del_port(struct vport *vport) -{ - struct gre_vport *gre_vport = gre_vport_priv(vport); - int err; - - err = tbl_remove(port_table, &gre_vport->tbl_node); - if (err) - return err; - - (*find_port_pool(gre_vport->mutable))--; - - return 0; -} - -#define FIND_PORT_KEY (1 << 0) -#define FIND_PORT_MATCH (1 << 1) -#define FIND_PORT_ANY (FIND_PORT_KEY | FIND_PORT_MATCH) - -static struct vport *find_port(__be32 saddr, __be32 daddr, __be32 key, - int port_type, - const struct mutable_config **mutable) -{ - struct port_lookup_key lookup; - struct tbl *table = rcu_dereference(port_table); - struct tbl_node *tbl_node; - - if (!table) - return NULL; - - lookup.vals[LOOKUP_SADDR] = saddr; - lookup.vals[LOOKUP_DADDR] = daddr; - - if (port_type & FIND_PORT_KEY) { - lookup.vals[LOOKUP_KEY] = key; - lookup.vals[LOOKUP_KEY_MATCH] = 0; - - if (key_local_remote_ports) { - tbl_node = tbl_lookup(table, &lookup, port_hash(&lookup), port_cmp); - if (tbl_node) - goto found; - } - - if (key_remote_ports) { - lookup.vals[LOOKUP_SADDR] = 0; - - tbl_node = tbl_lookup(table, &lookup, port_hash(&lookup), port_cmp); - if (tbl_node) - goto found; - - lookup.vals[LOOKUP_SADDR] = saddr; - } - } - - if (port_type & FIND_PORT_MATCH) { - lookup.vals[LOOKUP_KEY] = 0; - lookup.vals[LOOKUP_KEY_MATCH] = GRE_F_IN_KEY_MATCH; - - if (local_remote_ports) { - tbl_node = tbl_lookup(table, &lookup, port_hash(&lookup), port_cmp); - if (tbl_node) - goto found; - } - - if (remote_ports) { - lookup.vals[LOOKUP_SADDR] = 0; - - tbl_node = tbl_lookup(table, &lookup, port_hash(&lookup), port_cmp); - if (tbl_node) - goto found; - } - } - - return NULL; - -found: - *mutable = lookup.mutable; - return gre_vport_to_vport(gre_vport_table_cast(tbl_node)); -} - -static bool check_ipv4_address(__be32 addr) -{ - if (ipv4_is_multicast(addr) || ipv4_is_lbcast(addr) - || ipv4_is_loopback(addr) || ipv4_is_zeronet(addr)) - return false; - - return 
true; -} - -static bool ipv4_should_icmp(struct sk_buff *skb) -{ - struct iphdr *old_iph = ip_hdr(skb); - - /* Don't respond to L2 broadcast. */ - if (is_multicast_ether_addr(eth_hdr(skb)->h_dest)) - return false; - - /* Don't respond to L3 broadcast or invalid addresses. */ - if (!check_ipv4_address(old_iph->daddr) || - !check_ipv4_address(old_iph->saddr)) - return false; - - /* Only respond to the first fragment. */ - if (old_iph->frag_off & htons(IP_OFFSET)) - return false; - - /* Don't respond to ICMP error messages. */ - if (old_iph->protocol == IPPROTO_ICMP) { - u8 icmp_type, *icmp_typep; - - icmp_typep = skb_header_pointer(skb, (u8 *)old_iph + - (old_iph->ihl << 2) + - offsetof(struct icmphdr, type) - - skb->data, sizeof(icmp_type), - &icmp_type); - - if (!icmp_typep) - return false; - - if (*icmp_typep > NR_ICMP_TYPES - || (*icmp_typep <= ICMP_PARAMETERPROB - && *icmp_typep != ICMP_ECHOREPLY - && *icmp_typep != ICMP_ECHO)) - return false; - } - - return true; -} - -static void ipv4_build_icmp(struct sk_buff *skb, struct sk_buff *nskb, - unsigned int mtu, unsigned int payload_length) -{ - struct iphdr *iph, *old_iph = ip_hdr(skb); - struct icmphdr *icmph; - u8 *payload; - - iph = (struct iphdr *)skb_put(nskb, sizeof(struct iphdr)); - icmph = (struct icmphdr *)skb_put(nskb, sizeof(struct icmphdr)); - payload = skb_put(nskb, payload_length); - - /* IP */ - iph->version = 4; - iph->ihl = sizeof(struct iphdr) >> 2; - iph->tos = (old_iph->tos & IPTOS_TOS_MASK) | - IPTOS_PREC_INTERNETCONTROL; - iph->tot_len = htons(sizeof(struct iphdr) - + sizeof(struct icmphdr) - + payload_length); - get_random_bytes(&iph->id, sizeof(iph->id)); - iph->frag_off = 0; - iph->ttl = IPDEFTTL; - iph->protocol = IPPROTO_ICMP; - iph->daddr = old_iph->saddr; - iph->saddr = old_iph->daddr; - - ip_send_check(iph); - - /* ICMP */ - icmph->type = ICMP_DEST_UNREACH; - icmph->code = ICMP_FRAG_NEEDED; - icmph->un.gateway = htonl(mtu); - icmph->checksum = 0; - - nskb->csum = csum_partial((u8 
*)icmph, sizeof(struct icmphdr), 0); - nskb->csum = skb_copy_and_csum_bits(skb, (u8 *)old_iph - skb->data, - payload, payload_length, - nskb->csum); - icmph->checksum = csum_fold(nskb->csum); -} - -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -static bool ipv6_should_icmp(struct sk_buff *skb) -{ - struct ipv6hdr *old_ipv6h = ipv6_hdr(skb); - int addr_type; - int payload_off = (u8 *)(old_ipv6h + 1) - skb->data; - u8 nexthdr = ipv6_hdr(skb)->nexthdr; - - /* Check source address is valid. */ - addr_type = ipv6_addr_type(&old_ipv6h->saddr); - if (addr_type & IPV6_ADDR_MULTICAST || addr_type == IPV6_ADDR_ANY) - return false; - - /* Don't reply to unspecified addresses. */ - if (ipv6_addr_type(&old_ipv6h->daddr) == IPV6_ADDR_ANY) - return false; - - /* Don't respond to ICMP error messages. */ - payload_off = ipv6_skip_exthdr(skb, payload_off, &nexthdr); - if (payload_off < 0) - return false; - - if (nexthdr == NEXTHDR_ICMP) { - u8 icmp_type, *icmp_typep; - - icmp_typep = skb_header_pointer(skb, payload_off + - offsetof(struct icmp6hdr, - icmp6_type), - sizeof(icmp_type), &icmp_type); - - if (!icmp_typep || !(*icmp_typep & ICMPV6_INFOMSG_MASK)) - return false; - } - - return true; -} - -static void ipv6_build_icmp(struct sk_buff *skb, struct sk_buff *nskb, - unsigned int mtu, unsigned int payload_length) -{ - struct ipv6hdr *ipv6h, *old_ipv6h = ipv6_hdr(skb); - struct icmp6hdr *icmp6h; - u8 *payload; - - ipv6h = (struct ipv6hdr *)skb_put(nskb, sizeof(struct ipv6hdr)); - icmp6h = (struct icmp6hdr *)skb_put(nskb, sizeof(struct icmp6hdr)); - payload = skb_put(nskb, payload_length); - - /* IPv6 */ - ipv6h->version = 6; - ipv6h->priority = 0; - memset(&ipv6h->flow_lbl, 0, sizeof(ipv6h->flow_lbl)); - ipv6h->payload_len = htons(sizeof(struct icmp6hdr) - + payload_length); - ipv6h->nexthdr = NEXTHDR_ICMP; - ipv6h->hop_limit = IPV6_DEFAULT_HOPLIMIT; - ipv6_addr_copy(&ipv6h->daddr, &old_ipv6h->saddr); - ipv6_addr_copy(&ipv6h->saddr, &old_ipv6h->daddr); - - /* ICMPv6 */ - 
icmp6h->icmp6_type = ICMPV6_PKT_TOOBIG; - icmp6h->icmp6_code = 0; - icmp6h->icmp6_cksum = 0; - icmp6h->icmp6_mtu = htonl(mtu); - - nskb->csum = csum_partial((u8 *)icmp6h, sizeof(struct icmp6hdr), 0); - nskb->csum = skb_copy_and_csum_bits(skb, (u8 *)old_ipv6h - skb->data, - payload, payload_length, - nskb->csum); - icmp6h->icmp6_cksum = csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, - sizeof(struct icmp6hdr) - + payload_length, - ipv6h->nexthdr, nskb->csum); -} -#endif /* IPv6 */ - -static bool send_frag_needed(struct vport *vport, - const struct mutable_config *mutable, - struct sk_buff *skb, unsigned int mtu, - __be32 flow_key) -{ - unsigned int eth_hdr_len = ETH_HLEN; - unsigned int total_length = 0, header_length = 0, payload_length; - struct ethhdr *eh, *old_eh = eth_hdr(skb); - struct sk_buff *nskb; - - /* Sanity check */ - if (skb->protocol == htons(ETH_P_IP)) { - if (mtu < IP_MIN_MTU) - return false; - - if (!ipv4_should_icmp(skb)) - return true; - } -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - else if (skb->protocol == htons(ETH_P_IPV6)) { - if (mtu < IPV6_MIN_MTU) - return false; - - /* In theory we should do PMTUD on IPv6 multicast messages but - * we don't have an address to send from so just fragment. 
*/ - if (ipv6_addr_type(&ipv6_hdr(skb)->daddr) & IPV6_ADDR_MULTICAST) - return false; - - if (!ipv6_should_icmp(skb)) - return true; - } -#endif - else - return false; - - /* Allocate */ - if (old_eh->h_proto == htons(ETH_P_8021Q)) - eth_hdr_len = VLAN_ETH_HLEN; - - payload_length = skb->len - eth_hdr_len; - if (skb->protocol == htons(ETH_P_IP)) { - header_length = sizeof(struct iphdr) + sizeof(struct icmphdr); - total_length = min_t(unsigned int, header_length + - payload_length, 576); - } -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - else { - header_length = sizeof(struct ipv6hdr) + - sizeof(struct icmp6hdr); - total_length = min_t(unsigned int, header_length + - payload_length, IPV6_MIN_MTU); - } -#endif - - total_length = min(total_length, mutable->mtu); - payload_length = total_length - header_length; - - nskb = dev_alloc_skb(NET_IP_ALIGN + eth_hdr_len + header_length + - payload_length); - if (!nskb) - return false; - - skb_reserve(nskb, NET_IP_ALIGN); - - /* Ethernet / VLAN */ - eh = (struct ethhdr *)skb_put(nskb, eth_hdr_len); - memcpy(eh->h_dest, old_eh->h_source, ETH_ALEN); - memcpy(eh->h_source, mutable->eth_addr, ETH_ALEN); - nskb->protocol = eh->h_proto = old_eh->h_proto; - if (old_eh->h_proto == htons(ETH_P_8021Q)) { - struct vlan_ethhdr *vh = (struct vlan_ethhdr *)eh; - - vh->h_vlan_TCI = vlan_eth_hdr(skb)->h_vlan_TCI; - vh->h_vlan_encapsulated_proto = skb->protocol; - } - skb_reset_mac_header(nskb); - - /* Protocol */ - if (skb->protocol == htons(ETH_P_IP)) - ipv4_build_icmp(skb, nskb, mtu, payload_length); -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - else - ipv6_build_icmp(skb, nskb, mtu, payload_length); -#endif - - /* Assume that flow based keys are symmetric with respect to input - * and output and use the key that we were going to put on the - * outgoing packet for the fake received packet. If the keys are - * not symmetric then PMTUD needs to be disabled since we won't have - * any way of synthesizing packets. 
*/ - if (mutable->port_config.flags & GRE_F_IN_KEY_MATCH && - mutable->port_config.flags & GRE_F_OUT_KEY_ACTION) - OVS_CB(nskb)->tun_id = flow_key; - - compute_ip_summed(nskb, false); - vport_receive(vport, nskb); - - return true; -} - -static struct sk_buff *check_headroom(struct sk_buff *skb, int headroom) -{ - if (skb_headroom(skb) < headroom || skb_header_cloned(skb)) { - struct sk_buff *nskb = skb_realloc_headroom(skb, headroom + 16); - if (!nskb) { - kfree_skb(skb); - return ERR_PTR(-ENOMEM); - } - - set_skb_csum_bits(skb, nskb); - - if (skb->sk) - skb_set_owner_w(nskb, skb->sk); - - dev_kfree_skb(skb); - return nskb; - } - - return skb; -} - -static void create_gre_header(struct sk_buff *skb, - const struct mutable_config *mutable) -{ - struct iphdr *iph = ip_hdr(skb); - struct gre_base_hdr *greh = (struct gre_base_hdr *)(iph + 1); - __be32 *options = (__be32 *)((u8 *)iph + mutable->tunnel_hlen + struct gre_base_hdr *greh = (struct gre_base_hdr *)skb_transport_header(skb); + __be32 *options = (__be32 *)(skb_network_header(skb) + mutable->tunnel_hlen - GRE_HEADER_SECTION); greh->protocol = htons(ETH_P_TEB); @@ -587,10 +61,10 @@ static void create_gre_header(struct sk_buff *skb, /* Work backwards over the options so the checksum is last. 
*/ if (mutable->port_config.out_key || - mutable->port_config.flags & GRE_F_OUT_KEY_ACTION) { + mutable->port_config.flags & TNL_F_OUT_KEY_ACTION) { greh->flags |= GRE_KEY; - if (mutable->port_config.flags & GRE_F_OUT_KEY_ACTION) + if (mutable->port_config.flags & TNL_F_OUT_KEY_ACTION) *options = OVS_CB(skb)->tun_id; else *options = mutable->port_config.out_key; @@ -598,7 +72,7 @@ static void create_gre_header(struct sk_buff *skb, options--; } - if (mutable->port_config.flags & GRE_F_CSUM) { + if (mutable->port_config.flags & TNL_F_CSUM) { greh->flags |= GRE_CSUM; *options = 0; @@ -609,33 +83,7 @@ static void create_gre_header(struct sk_buff *skb, } } -static int check_checksum(struct sk_buff *skb) -{ - struct iphdr *iph = ip_hdr(skb); - __be16 flags = *(__be16 *)(iph + 1); - __sum16 csum = 0; - - if (flags & GRE_CSUM) { - switch (skb->ip_summed) { - case CHECKSUM_COMPLETE: - csum = csum_fold(skb->csum); - - if (!csum) - break; - /* Fall through. */ - - case CHECKSUM_NONE: - skb->csum = 0; - csum = __skb_checksum_complete(skb); - skb->ip_summed = CHECKSUM_COMPLETE; - break; - } - } - - return (csum == 0); -} - -static int parse_gre_header(struct iphdr *iph, __be16 *flags, __be32 *key) +static int parse_header(struct iphdr *iph, __be16 *flags, __be32 *key) { /* IP and ICMP protocol handlers check that the IHL is valid. 
*/ struct gre_base_hdr *greh = (struct gre_base_hdr *)((u8 *)iph + (iph->ihl << 2)); @@ -644,10 +92,10 @@ static int parse_gre_header(struct iphdr *iph, __be16 *flags, __be32 *key) *flags = greh->flags; - if (greh->flags & (GRE_VERSION | GRE_ROUTING)) + if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING))) return -EINVAL; - if (greh->protocol != htons(ETH_P_TEB)) + if (unlikely(greh->protocol != htons(ETH_P_TEB))) return -EINVAL; hdr_len = GRE_HEADER_SECTION; @@ -665,89 +113,17 @@ static int parse_gre_header(struct iphdr *iph, __be16 *flags, __be32 *key) } else *key = 0; - if (greh->flags & GRE_SEQ) + if (unlikely(greh->flags & GRE_SEQ)) hdr_len += GRE_HEADER_SECTION; return hdr_len; } -static inline u8 ecn_encapsulate(u8 tos, struct sk_buff *skb) -{ - u8 inner; - - if (skb->protocol == htons(ETH_P_IP)) - inner = ((struct iphdr *)skb_network_header(skb))->tos; -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - else if (skb->protocol == htons(ETH_P_IPV6)) - inner = ipv6_get_dsfield((struct ipv6hdr *)skb_network_header(skb)); -#endif - else - inner = 0; - - return INET_ECN_encapsulate(tos, inner); -} - -static inline void ecn_decapsulate(u8 tos, struct sk_buff *skb) -{ - if (INET_ECN_is_ce(tos)) { - __be16 protocol = skb->protocol; - unsigned int nw_header = skb_network_header(skb) - skb->data; - - if (skb->protocol == htons(ETH_P_8021Q)) { - if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN))) - return; - - protocol = vlan_eth_hdr(skb)->h_vlan_encapsulated_proto; - nw_header += VLAN_HLEN; - } - - if (protocol == htons(ETH_P_IP)) { - if (unlikely(!pskb_may_pull(skb, nw_header - + sizeof(struct iphdr)))) - return; - - IP_ECN_set_ce((struct iphdr *)(nw_header + skb->data)); - } -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - else if (protocol == htons(ETH_P_IPV6)) { - if (unlikely(!pskb_may_pull(skb, nw_header - + sizeof(struct ipv6hdr)))) - return; - - IP6_ECN_set_ce((struct ipv6hdr *)(nw_header - + skb->data)); - } -#endif - } -} - -static struct 
sk_buff *handle_gso(struct sk_buff *skb) -{ - if (skb_is_gso(skb)) { - struct sk_buff *nskb = skb_gso_segment(skb, 0); - - dev_kfree_skb(skb); - return nskb; - } - - return skb; -} - -static int handle_csum_offload(struct sk_buff *skb) -{ - if (skb->ip_summed == CHECKSUM_PARTIAL) - return skb_checksum_help(skb); - else { - skb->ip_summed = CHECKSUM_NONE; - return 0; - } -} - -/* Called with rcu_read_lock. */ +/* Called with rcu_read_lock and BH disabled. */ static void gre_err(struct sk_buff *skb, u32 info) { struct vport *vport; - const struct mutable_config *mutable; + const struct tnl_mutable_config *mutable; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; int mtu = ntohs(icmp_hdr(skb)->un.frag.mtu); @@ -762,38 +138,43 @@ static void gre_err(struct sk_buff *skb, u32 info) if (type != ICMP_DEST_UNREACH || code != ICMP_FRAG_NEEDED) return; - /* The mimimum size packet that we would actually be able to process: + /* + * The mimimum size packet that we would actually be able to process: * encapsulating IP header, minimum GRE header, Ethernet header, - * inner IPv4 header. */ + * inner IPv4 header. + */ if (!pskb_may_pull(skb, sizeof(struct iphdr) + GRE_HEADER_SECTION + ETH_HLEN + sizeof(struct iphdr))) return; iph = (struct iphdr *)skb->data; - tunnel_hdr_len = parse_gre_header(iph, &flags, &key); + tunnel_hdr_len = parse_header(iph, &flags, &key); if (tunnel_hdr_len < 0) return; - vport = find_port(iph->saddr, iph->daddr, key, FIND_PORT_ANY, &mutable); + vport = tnl_find_port(iph->saddr, iph->daddr, key, + TNL_T_PROTO_GRE | TNL_T_KEY_EITHER, &mutable); if (!vport) return; - /* Packets received by this function were previously sent by us, so + /* + * Packets received by this function were previously sent by us, so * any comparisons should be to the output values, not the input. 
* However, it's not really worth it to have a hash table based on * output keys (especially since ICMP error handling of tunneled packets * isn't that reliable anyways). Therefore, we do a lookup based on the * out key as if it were the in key and then check to see if the input - * and output keys are the same. */ + * and output keys are the same. + */ if (mutable->port_config.in_key != mutable->port_config.out_key) return; - if (!!(mutable->port_config.flags & GRE_F_IN_KEY_MATCH) != - !!(mutable->port_config.flags & GRE_F_OUT_KEY_ACTION)) + if (!!(mutable->port_config.flags & TNL_F_IN_KEY_MATCH) != + !!(mutable->port_config.flags & TNL_F_OUT_KEY_ACTION)) return; - if ((mutable->port_config.flags & GRE_F_CSUM) && !(flags & GRE_CSUM)) + if ((mutable->port_config.flags & TNL_F_CSUM) && !(flags & GRE_CSUM)) return; tunnel_hdr_len += iph->ihl << 2; @@ -849,9 +230,9 @@ static void gre_err(struct sk_buff *skb, u32 info) } #endif - __pskb_pull(skb, tunnel_hdr_len); - send_frag_needed(vport, mutable, skb, mtu, key); - skb_push(skb, tunnel_hdr_len); + __skb_pull(skb, tunnel_hdr_len); + tnl_frag_needed(vport, mutable, skb, mtu, key); + __skb_push(skb, tunnel_hdr_len); out: skb_set_mac_header(skb, orig_mac_header); @@ -859,60 +240,72 @@ out: skb->protocol = htons(ETH_P_IP); } -/* Called with rcu_read_lock. */ +static bool check_checksum(struct sk_buff *skb) +{ + struct iphdr *iph = ip_hdr(skb); + struct gre_base_hdr *greh = (struct gre_base_hdr *)(iph + 1); + __sum16 csum = 0; + + if (greh->flags & GRE_CSUM) { + switch (skb->ip_summed) { + case CHECKSUM_COMPLETE: + csum = csum_fold(skb->csum); + + if (!csum) + break; + /* Fall through. */ + + case CHECKSUM_NONE: + skb->csum = 0; + csum = __skb_checksum_complete(skb); + skb->ip_summed = CHECKSUM_COMPLETE; + break; + } + } + + return (csum == 0); +} + +/* Called with rcu_read_lock and BH disabled. 
*/ static int gre_rcv(struct sk_buff *skb) { struct vport *vport; - const struct mutable_config *mutable; + const struct tnl_mutable_config *mutable; int hdr_len; struct iphdr *iph; __be16 flags; __be32 key; - if (!pskb_may_pull(skb, GRE_HEADER_SECTION + ETH_HLEN)) + if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr) + ETH_HLEN))) goto error; - if (!check_checksum(skb)) + if (unlikely(!check_checksum(skb))) goto error; - iph = ip_hdr(skb); - - hdr_len = parse_gre_header(iph, &flags, &key); - if (hdr_len < 0) + hdr_len = parse_header(ip_hdr(skb), &flags, &key); + if (unlikely(hdr_len < 0)) goto error; - vport = find_port(iph->daddr, iph->saddr, key, FIND_PORT_ANY, &mutable); - if (!vport) { - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + if (unlikely(!pskb_may_pull(skb, hdr_len + ETH_HLEN))) goto error; - } - if (!pskb_pull(skb, hdr_len) || !pskb_may_pull(skb, ETH_HLEN)) { - vport_record_error(vport, VPORT_E_RX_ERROR); + iph = ip_hdr(skb); + vport = tnl_find_port(iph->daddr, iph->saddr, key, + TNL_T_PROTO_GRE | TNL_T_KEY_EITHER, &mutable); + if (unlikely(!vport)) { + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); goto error; } - skb->pkt_type = PACKET_HOST; - skb->protocol = eth_type_trans(skb, skb->dev); - skb_postpull_rcsum(skb, skb_transport_header(skb), hdr_len + ETH_HLEN); - - skb_dst_drop(skb); - nf_reset(skb); - secpath_reset(skb); - skb_reset_network_header(skb); - - ecn_decapsulate(iph->tos, skb); - - if (mutable->port_config.flags & GRE_F_IN_KEY_MATCH) + if (mutable->port_config.flags & TNL_F_IN_KEY_MATCH) OVS_CB(skb)->tun_id = key; else OVS_CB(skb)->tun_id = 0; - skb_push(skb, ETH_HLEN); - compute_ip_summed(skb, false); - - vport_receive(vport, skb); + __skb_pull(skb, hdr_len); + skb_postpull_rcsum(skb, skb_transport_header(skb), hdr_len + ETH_HLEN); + tnl_rcv(vport, skb); return 0; error: @@ -920,227 +313,16 @@ error: return 0; } -static int build_packet(struct vport *vport, const struct mutable_config *mutable, - struct 
iphdr *iph, struct rtable *rt, int max_headroom, - int mtu, struct sk_buff *skb) -{ - int err; - struct iphdr *new_iph; - int orig_len = skb->len; - __be16 frag_off = iph->frag_off; - - skb = check_headroom(skb, max_headroom); - if (unlikely(IS_ERR(skb))) - goto error; - - err = handle_csum_offload(skb); - if (err) - goto error_free; - - if (skb->protocol == htons(ETH_P_IP)) { - struct iphdr *old_iph = ip_hdr(skb); - - if ((old_iph->frag_off & htons(IP_DF)) && - mtu < ntohs(old_iph->tot_len)) { - if (send_frag_needed(vport, mutable, skb, mtu, OVS_CB(skb)->tun_id)) - goto error_free; - } - - } -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - else if (skb->protocol == htons(ETH_P_IPV6)) { - unsigned int packet_length = skb->len - ETH_HLEN - - (eth_hdr(skb)->h_proto == htons(ETH_P_8021Q) ? VLAN_HLEN : 0); - - /* IPv6 requires PMTUD if the packet is above the minimum MTU. */ - if (packet_length > IPV6_MIN_MTU) - frag_off = htons(IP_DF); - - if (mtu < packet_length) { - if (send_frag_needed(vport, mutable, skb, mtu, OVS_CB(skb)->tun_id)) - goto error_free; - } - } -#endif - - skb_reset_transport_header(skb); - new_iph = (struct iphdr *)skb_push(skb, mutable->tunnel_hlen); - skb_reset_network_header(skb); - - memcpy(new_iph, iph, sizeof(struct iphdr)); - new_iph->frag_off = frag_off; - ip_select_ident(new_iph, &rt_dst(rt), NULL); - - create_gre_header(skb, mutable); - - /* Allow our local IP stack to fragment the outer packet even if the - * DF bit is set as a last resort. 
*/ - skb->local_df = 1; - - memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - IPCB(skb)->flags = 0; - - err = ip_local_out(skb); - if (likely(net_xmit_eval(err) == 0)) - return orig_len; - else { - vport_record_error(vport, VPORT_E_TX_ERROR); - return 0; - } - -error_free: - kfree_skb(skb); -error: - vport_record_error(vport, VPORT_E_TX_DROPPED); - - return 0; -} +struct tnl_ops gre_tnl_ops = { + .tunnel_type = TNL_T_PROTO_GRE, + .ipproto = IPPROTO_GRE, + .hdr_len = gre_hdr_len, + .build_header = gre_build_header, +}; -static int gre_send(struct vport *vport, struct sk_buff *skb) +static struct vport *gre_create(const char *name, const void __user *config) { - struct gre_vport *gre_vport = gre_vport_priv(vport); - const struct mutable_config *mutable = rcu_dereference(gre_vport->mutable); - - struct iphdr *old_iph; - int orig_len; - struct iphdr iph; - struct rtable *rt; - int max_headroom; - int mtu; - - /* Validate the protocol headers before we try to use them. */ - if (skb->protocol == htons(ETH_P_8021Q)) { - if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN))) - goto error_free; - - skb->protocol = vlan_eth_hdr(skb)->h_vlan_encapsulated_proto; - skb_set_network_header(skb, VLAN_ETH_HLEN); - } - - if (skb->protocol == htons(ETH_P_IP)) { - if (unlikely(!pskb_may_pull(skb, skb_network_header(skb) - + sizeof(struct iphdr) - skb->data))) - skb->protocol = 0; - } -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - else if (skb->protocol == htons(ETH_P_IPV6)) { - if (unlikely(!pskb_may_pull(skb, skb_network_header(skb) - + sizeof(struct ipv6hdr) - skb->data))) - skb->protocol = 0; - } -#endif - old_iph = ip_hdr(skb); - - iph.tos = mutable->port_config.tos; - if (mutable->port_config.flags & GRE_F_TOS_INHERIT) { - if (skb->protocol == htons(ETH_P_IP)) - iph.tos = old_iph->tos; -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - else if (skb->protocol == htons(ETH_P_IPV6)) - iph.tos = ipv6_get_dsfield(ipv6_hdr(skb)); -#endif - } - iph.tos = 
ecn_encapsulate(iph.tos, skb); - - { - struct flowi fl = { .nl_u = { .ip4_u = - { .daddr = mutable->port_config.daddr, - .saddr = mutable->port_config.saddr, - .tos = RT_TOS(iph.tos) } }, - .proto = IPPROTO_GRE }; - - if (ip_route_output_key(&init_net, &rt, &fl)) - goto error_free; - } - - iph.ttl = mutable->port_config.ttl; - if (mutable->port_config.flags & GRE_F_TTL_INHERIT) { - if (skb->protocol == htons(ETH_P_IP)) - iph.ttl = old_iph->ttl; -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - else if (skb->protocol == htons(ETH_P_IPV6)) - iph.ttl = ipv6_hdr(skb)->hop_limit; -#endif - } - if (!iph.ttl) - iph.ttl = dst_metric(&rt_dst(rt), RTAX_HOPLIMIT); - - iph.frag_off = (mutable->port_config.flags & GRE_F_PMTUD) ? htons(IP_DF) : 0; - if (iph.frag_off) - mtu = dst_mtu(&rt_dst(rt)) - - ETH_HLEN - - mutable->tunnel_hlen - - (eth_hdr(skb)->h_proto == htons(ETH_P_8021Q) ? VLAN_HLEN : 0); - else - mtu = mutable->mtu; - - if (skb->protocol == htons(ETH_P_IP)) { - iph.frag_off |= old_iph->frag_off & htons(IP_DF); - mtu = max(mtu, IP_MIN_MTU); - } -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - else if (skb->protocol == htons(ETH_P_IPV6)) - mtu = max(mtu, IPV6_MIN_MTU); -#endif - - iph.version = 4; - iph.ihl = sizeof(struct iphdr) >> 2; - iph.protocol = IPPROTO_GRE; - iph.daddr = rt->rt_dst; - iph.saddr = rt->rt_src; - - nf_reset(skb); - secpath_reset(skb); - skb_dst_drop(skb); - skb_dst_set(skb, &rt_dst(rt)); - - /* If we are doing GSO on a pskb it is better to make sure that the - * headroom is correct now. We will only have to copy the portion in - * the linear data area and GSO will preserve headroom when it creates - * the segments. This is particularly beneficial on Xen where we get - * lots of GSO pskbs. Conversely, we delay copying if it is just to - * get our own writable clone because GSO may do the copy for us. 
*/ - max_headroom = LL_RESERVED_SPACE(rt_dst(rt).dev) + rt_dst(rt).header_len - + mutable->tunnel_hlen; - - if (skb_headroom(skb) < max_headroom) { - skb = check_headroom(skb, max_headroom); - if (unlikely(IS_ERR(skb))) { - vport_record_error(vport, VPORT_E_TX_DROPPED); - goto error; - } - } - - forward_ip_summed(skb); - - if (unlikely(vswitch_skb_checksum_setup(skb))) - goto error_free; - - skb = handle_gso(skb); - if (unlikely(IS_ERR(skb))) { - vport_record_error(vport, VPORT_E_TX_DROPPED); - goto error; - } - - /* Process GSO segments. Try to do any work for the entire packet that - * doesn't involve actually writing to it before this point. */ - orig_len = 0; - do { - struct sk_buff *next_skb = skb->next; - skb->next = NULL; - - orig_len += build_packet(vport, mutable, &iph, rt, max_headroom, mtu, skb); - - skb = next_skb; - } while (skb); - - return orig_len; - -error_free: - kfree_skb(skb); - vport_record_error(vport, VPORT_E_TX_ERROR); -error: - return 0; + return tnl_create(name, config, &gre_vport_ops, &gre_tnl_ops); } static struct net_protocol gre_protocol_handlers = { @@ -1153,232 +335,21 @@ static int gre_init(void) int err; err = inet_add_protocol(&gre_protocol_handlers, IPPROTO_GRE); - if (err) + if (err) { printk(KERN_WARNING "openvswitch: cannot register gre protocol handler\n"); - - return err; -} - -static void gre_exit(void) -{ - tbl_destroy(port_table, NULL); - inet_del_protocol(&gre_protocol_handlers, IPPROTO_GRE); -} - -static int set_config(const struct vport *cur_vport, - struct mutable_config *mutable, const void __user *uconfig) -{ - const struct vport *old_vport; - const struct mutable_config *old_mutable; - int port_type; - - if (copy_from_user(&mutable->port_config, uconfig, sizeof(struct gre_port_config))) - return -EFAULT; - - if (mutable->port_config.daddr == 0) - return -EINVAL; - - if (mutable->port_config.flags & GRE_F_IN_KEY_MATCH) { - port_type = FIND_PORT_MATCH; - mutable->port_config.in_key = 0; - } else - port_type = 
FIND_PORT_KEY; - - old_vport = find_port(mutable->port_config.saddr, - mutable->port_config.daddr, - mutable->port_config.in_key, port_type, - &old_mutable); - - if (old_vport && old_vport != cur_vport) - return -EEXIST; - - if (mutable->port_config.flags & GRE_F_OUT_KEY_ACTION) - mutable->port_config.out_key = 0; - - mutable->tunnel_hlen = sizeof(struct iphdr) + GRE_HEADER_SECTION; - - if (mutable->port_config.flags & GRE_F_CSUM) - mutable->tunnel_hlen += GRE_HEADER_SECTION; - - if (mutable->port_config.out_key || - mutable->port_config.flags & GRE_F_OUT_KEY_ACTION) - mutable->tunnel_hlen += GRE_HEADER_SECTION; - - return 0; -} - -static struct vport *gre_create(const char *name, const void __user *config) -{ - struct vport *vport; - struct gre_vport *gre_vport; - int err; - - vport = vport_alloc(sizeof(struct gre_vport), &gre_vport_ops); - if (IS_ERR(vport)) { - err = PTR_ERR(vport); - goto error; - } - - gre_vport = gre_vport_priv(vport); - - strcpy(gre_vport->name, name); - - gre_vport->mutable = kmalloc(sizeof(struct mutable_config), GFP_KERNEL); - if (!gre_vport->mutable) { - err = -ENOMEM; - goto error_free_vport; - } - - vport_gen_rand_ether_addr(gre_vport->mutable->eth_addr); - gre_vport->mutable->mtu = ETH_DATA_LEN; - - err = set_config(NULL, gre_vport->mutable, config); - if (err) - goto error_free_mutable; - - err = add_port(vport); - if (err) - goto error_free_mutable; - - return vport; - -error_free_mutable: - kfree(gre_vport->mutable); -error_free_vport: - vport_free(vport); -error: - return ERR_PTR(err); -} - -static int gre_modify(struct vport *vport, const void __user *config) -{ - struct gre_vport *gre_vport = gre_vport_priv(vport); - struct mutable_config *mutable; - int err; - int update_hash = 0; - - mutable = kmemdup(gre_vport->mutable, sizeof(struct mutable_config), GFP_KERNEL); - if (!mutable) { - err = -ENOMEM; - goto error; - } - - err = set_config(vport, mutable, config); - if (err) - goto error_free; - - /* Only remove the port from the 
hash table if something that would - * affect the lookup has changed. */ - if (gre_vport->mutable->port_config.saddr != mutable->port_config.saddr || - gre_vport->mutable->port_config.daddr != mutable->port_config.daddr || - gre_vport->mutable->port_config.in_key != mutable->port_config.in_key || - (gre_vport->mutable->port_config.flags & GRE_F_IN_KEY_MATCH) != - (mutable->port_config.flags & GRE_F_IN_KEY_MATCH)) - update_hash = 1; - - - /* This update is not atomic but the lookup uses the config, which - * serves as an inherent double check. */ - if (update_hash) { - err = del_port(vport); - if (err) - goto error_free; - } - - assign_config_rcu(vport, mutable); - - if (update_hash) { - err = add_port(vport); - if (err) - goto error_free; + goto out; } - return 0; + err = tnl_init(); -error_free: - kfree(mutable); -error: +out: return err; } -static void free_port(struct rcu_head *rcu) -{ - struct gre_vport *gre_vport = container_of(rcu, struct gre_vport, rcu); - - kfree(gre_vport->mutable); - vport_free(gre_vport_to_vport(gre_vport)); -} - -static int gre_destroy(struct vport *vport) -{ - struct gre_vport *gre_vport = gre_vport_priv(vport); - int port_type; - const struct mutable_config *old_mutable; - - /* Do a hash table lookup to make sure that the port exists. It should - * exist but might not if a modify failed earlier. 
*/ - if (gre_vport->mutable->port_config.flags & GRE_F_IN_KEY_MATCH) - port_type = FIND_PORT_MATCH; - else - port_type = FIND_PORT_KEY; - - if (vport == find_port(gre_vport->mutable->port_config.saddr, - gre_vport->mutable->port_config.daddr, - gre_vport->mutable->port_config.in_key, port_type, &old_mutable)) - del_port(vport); - - call_rcu(&gre_vport->rcu, free_port); - - return 0; -} - -static int gre_set_mtu(struct vport *vport, int mtu) -{ - struct gre_vport *gre_vport = gre_vport_priv(vport); - struct mutable_config *mutable; - - mutable = kmemdup(gre_vport->mutable, sizeof(struct mutable_config), GFP_KERNEL); - if (!mutable) - return -ENOMEM; - - mutable->mtu = mtu; - assign_config_rcu(vport, mutable); - - return 0; -} - -static int gre_set_addr(struct vport *vport, const unsigned char *addr) -{ - struct gre_vport *gre_vport = gre_vport_priv(vport); - struct mutable_config *mutable; - - mutable = kmemdup(gre_vport->mutable, sizeof(struct mutable_config), GFP_KERNEL); - if (!mutable) - return -ENOMEM; - - memcpy(mutable->eth_addr, addr, ETH_ALEN); - assign_config_rcu(vport, mutable); - - return 0; -} - - -static const char *gre_get_name(const struct vport *vport) -{ - const struct gre_vport *gre_vport = gre_vport_priv(vport); - return gre_vport->name; -} - -static const unsigned char *gre_get_addr(const struct vport *vport) -{ - const struct gre_vport *gre_vport = gre_vport_priv(vport); - return rcu_dereference(gre_vport->mutable)->eth_addr; -} - -static int gre_get_mtu(const struct vport *vport) +static void gre_exit(void) { - const struct gre_vport *gre_vport = gre_vport_priv(vport); - return rcu_dereference(gre_vport->mutable)->mtu; + tnl_exit(); + inet_del_protocol(&gre_protocol_handlers, IPPROTO_GRE); } struct vport_ops gre_vport_ops = { @@ -1387,15 +358,15 @@ struct vport_ops gre_vport_ops = { .init = gre_init, .exit = gre_exit, .create = gre_create, - .modify = gre_modify, - .destroy = gre_destroy, - .set_mtu = gre_set_mtu, - .set_addr = gre_set_addr, - 
.get_name = gre_get_name, - .get_addr = gre_get_addr, + .modify = tnl_modify, + .destroy = tnl_destroy, + .set_mtu = tnl_set_mtu, + .set_addr = tnl_set_addr, + .get_name = tnl_get_name, + .get_addr = tnl_get_addr, .get_dev_flags = vport_gen_get_dev_flags, .is_running = vport_gen_is_running, .get_operstate = vport_gen_get_operstate, - .get_mtu = gre_get_mtu, - .send = gre_send, + .get_mtu = tnl_get_mtu, + .send = tnl_send, }; diff --git a/include/openvswitch/automake.mk b/include/openvswitch/automake.mk index 92e07188..f97c1b27 100644 --- a/include/openvswitch/automake.mk +++ b/include/openvswitch/automake.mk @@ -1,5 +1,5 @@ noinst_HEADERS += \ - include/openvswitch/gre.h \ include/openvswitch/brcompat-netlink.h \ - include/openvswitch/datapath-protocol.h + include/openvswitch/datapath-protocol.h \ + include/openvswitch/tunnel.h diff --git a/include/openvswitch/gre.h b/include/openvswitch/gre.h deleted file mode 100644 index a9ac1d98..00000000 --- a/include/openvswitch/gre.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (c) 2008, 2009, 2010 Nicira Networks. - * - * This file is offered under your choice of two licenses: Apache 2.0 or GNU - * GPL 2.0 or later. The permission statements for each of these licenses is - * given below. You may license your modifications to this file under either - * of these licenses or both. If you wish to license your modifications under - * only one of these licenses, delete the permission text for the other - * license. - * - * ---------------------------------------------------------------------- - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * ---------------------------------------------------------------------- - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - * ---------------------------------------------------------------------- - */ - -#ifndef OPENVSWITCH_GRE_H -#define OPENVSWITCH_GRE_H 1 - -#include - -#define GRE_F_CSUM (1 << 1) /* Checksum packets. */ -#define GRE_F_IN_KEY_MATCH (1 << 2) /* Store the key in tun_id to match in flow table. */ -#define GRE_F_OUT_KEY_ACTION (1 << 3) /* Get the key from a SET_TUNNEL action. */ -#define GRE_F_TOS_INHERIT (1 << 4) /* Inherit the ToS from the inner packet. */ -#define GRE_F_TTL_INHERIT (1 << 5) /* Inherit the TTL from the inner packet. */ -#define GRE_F_PMTUD (1 << 6) /* Enable path MTU discovery. 
*/ - -struct gre_port_config { - __u32 flags; - __be32 saddr; - __be32 daddr; - __be32 in_key; - __be32 out_key; - __u8 tos; - __u8 ttl; -}; - -#endif /* openvswitch/gre.h */ diff --git a/include/openvswitch/tunnel.h b/include/openvswitch/tunnel.h new file mode 100644 index 00000000..37379751 --- /dev/null +++ b/include/openvswitch/tunnel.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2008, 2009, 2010 Nicira Networks. + * + * This file is offered under your choice of two licenses: Apache 2.0 or GNU + * GPL 2.0 or later. The permission statements for each of these licenses is + * given below. You may license your modifications to this file under either + * of these licenses or both. If you wish to license your modifications under + * only one of these licenses, delete the permission text for the other + * license. + * + * ---------------------------------------------------------------------- + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * ---------------------------------------------------------------------- + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * ---------------------------------------------------------------------- + */ + +#ifndef OPENVSWITCH_TUNNEL_H +#define OPENVSWITCH_TUNNEL_H 1 + +#include + +#define TNL_F_CSUM (1 << 1) /* Checksum packets. */ +#define TNL_F_IN_KEY_MATCH (1 << 2) /* Store the key in tun_id to match in flow table. */ +#define TNL_F_OUT_KEY_ACTION (1 << 3) /* Get the key from a SET_TUNNEL action. */ +#define TNL_F_TOS_INHERIT (1 << 4) /* Inherit the ToS from the inner packet. */ +#define TNL_F_TTL_INHERIT (1 << 5) /* Inherit the TTL from the inner packet. */ +#define TNL_F_PMTUD (1 << 6) /* Enable path MTU discovery. */ + +struct tnl_port_config { + __u32 flags; + __be32 saddr; + __be32 daddr; + __be32 in_key; + __be32 out_key; + __u8 tos; + __u8 ttl; +}; + +#endif /* openvswitch/tunnel.h */ diff --git a/lib/netdev-gre.c b/lib/netdev-gre.c index 04a83327..9d9139fd 100644 --- a/lib/netdev-gre.c +++ b/lib/netdev-gre.c @@ -24,7 +24,7 @@ #include "netdev-vport.h" #include "openflow/openflow.h" #include "openvswitch/datapath-protocol.h" -#include "openvswitch/gre.h" +#include "openvswitch/tunnel.h" #include "packets.h" #include "socket-util.h" #include "vlog.h" @@ -55,13 +55,13 @@ netdev_gre_cast(const struct netdev *netdev) static int parse_config(const char *name, const struct shash *args, - struct gre_port_config *config) + struct tnl_port_config *config) { struct shash_node *node; memset(config, 0, sizeof *config); - config->flags |= GRE_F_PMTUD; + config->flags |= TNL_F_PMTUD; SHASH_FOR_EACH (node, args) { if (!strcmp(node->name, "remote_ip")) { @@ -80,42 +80,42 @@ parse_config(const char *name, const struct shash *args, } } else if (!strcmp(node->name, "key")) { if (!strcmp(node->data, "flow")) { - 
config->flags |= GRE_F_IN_KEY_MATCH; - config->flags |= GRE_F_OUT_KEY_ACTION; + config->flags |= TNL_F_IN_KEY_MATCH; + config->flags |= TNL_F_OUT_KEY_ACTION; } else { config->out_key = config->in_key = htonl(atoi(node->data)); } } else if (!strcmp(node->name, "in_key")) { if (!strcmp(node->data, "flow")) { - config->flags |= GRE_F_IN_KEY_MATCH; + config->flags |= TNL_F_IN_KEY_MATCH; } else { config->in_key = htonl(atoi(node->data)); } } else if (!strcmp(node->name, "out_key")) { if (!strcmp(node->data, "flow")) { - config->flags |= GRE_F_OUT_KEY_ACTION; + config->flags |= TNL_F_OUT_KEY_ACTION; } else { config->out_key = htonl(atoi(node->data)); } } else if (!strcmp(node->name, "tos")) { if (!strcmp(node->data, "inherit")) { - config->flags |= GRE_F_TOS_INHERIT; + config->flags |= TNL_F_TOS_INHERIT; } else { config->tos = atoi(node->data); } } else if (!strcmp(node->name, "ttl")) { if (!strcmp(node->data, "inherit")) { - config->flags |= GRE_F_TTL_INHERIT; + config->flags |= TNL_F_TTL_INHERIT; } else { config->ttl = atoi(node->data); } } else if (!strcmp(node->name, "csum")) { if (!strcmp(node->data, "true")) { - config->flags |= GRE_F_CSUM; + config->flags |= TNL_F_CSUM; } } else if (!strcmp(node->name, "pmtud")) { if (!strcmp(node->data, "false")) { - config->flags &= ~GRE_F_PMTUD; + config->flags &= ~TNL_F_PMTUD; } } else { VLOG_WARN("%s: unknown gre argument '%s'", name, node->name); @@ -136,7 +136,7 @@ netdev_gre_create(const char *name, const char *type OVS_UNUSED, { int err; struct odp_vport_add ova; - struct gre_port_config port_config; + struct tnl_port_config port_config; struct netdev_dev_gre *netdev_dev; ovs_strlcpy(ova.port_type, "gre", sizeof ova.port_type); @@ -176,7 +176,7 @@ netdev_gre_reconfigure(struct netdev_dev *netdev_dev_, const struct shash *args) { const char *name = netdev_dev_get_name(netdev_dev_); struct odp_vport_mod ovm; - struct gre_port_config port_config; + struct tnl_port_config port_config; int err; ovs_strlcpy(ovm.devname, name, 
sizeof ovm.devname); diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c index a20a1c4d..f7d9cd3b 100644 --- a/lib/netdev-linux.c +++ b/lib/netdev-linux.c @@ -53,7 +53,6 @@ #include "netlink.h" #include "ofpbuf.h" #include "openflow/openflow.h" -#include "openvswitch/gre.h" #include "packets.h" #include "poll-loop.h" #include "port-array.h"