2 * Copyright (c) 2010 Nicira Networks.
3 * Distributed under the terms of the GNU GPL version 2.
5 * Significant portions of this file may be copied from parts of the Linux
6 * kernel, by Linus Torvalds and others.
9 #include <linux/if_arp.h>
10 #include <linux/if_ether.h>
12 #include <linux/if_tunnel.h>
13 #include <linux/if_vlan.h>
15 #include <linux/in_route.h>
16 #include <linux/jhash.h>
17 #include <linux/kernel.h>
18 #include <linux/version.h>
20 #include <net/dsfield.h>
23 #include <net/inet_ecn.h>
25 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
28 #include <net/protocol.h>
29 #include <net/route.h>
34 #include "openvswitch/gre.h"
37 #include "vport-generic.h"
39 /* The absolute minimum fragment size. Note that there are many other
40 * definitions of the minimum MTU. */
43 /* The GRE header is composed of a series of sections: a base and then a variable
44 * number of options. */
45 #define GRE_HEADER_SECTION 4
/* Runtime-changeable per-port state.  Readers access it via RCU; writers
 * replace the whole structure with assign_config_rcu().
 * NOTE(review): this extract is missing lines (no closing brace; an rcu head
 * and an mtu field are referenced elsewhere in the file) — confirm against
 * the full source. */
47 struct mutable_config {
50 unsigned char eth_addr[ETH_ALEN];
52 struct gre_port_config port_config;
54 int tunnel_hlen; /* Tunnel header length. */
/* NOTE(review): interior of struct gre_vport; the struct's opening line is
 * not visible in this extract.  tbl_node links the port into port_table. */
58 struct tbl_node tbl_node;
62 /* Protected by RCU. */
63 struct mutable_config *mutable;
66 /* Protected by RCU: hash table of all GRE vports, keyed by port_hash(). */
67 static struct tbl *port_table;
69 /* These are just used as an optimization: they don't require any kind of
70 * synchronization because we could have just as easily read the value before
71 * the port change happened. */
/* Counts of existing ports per lookup category; find_port() uses them to
 * skip hash lookups for categories with no ports. */
72 static unsigned int key_local_remote_ports;
73 static unsigned int key_remote_ports;
74 static unsigned int local_remote_ports;
75 static unsigned int remote_ports;
/* Returns the GRE-private area embedded in a generic vport. */
77 static inline struct gre_vport *gre_vport_priv(const struct vport *vport)
79 return vport_priv(vport);
/* Inverse of gre_vport_priv(): recovers the enclosing vport. */
82 static inline struct vport *gre_vport_to_vport(const struct gre_vport *gre_vport)
84 return vport_from_priv(gre_vport);
/* Maps a port_table node back to its containing gre_vport. */
87 static inline struct gre_vport *gre_vport_table_cast(const struct tbl_node *node)
89 return container_of(node, struct gre_vport, tbl_node);
/* RCU callback that frees a retired mutable_config once all readers are done.
 * NOTE(review): the actual kfree() line is not visible in this extract. */
93 static void free_config(struct rcu_head *rcu)
95 struct mutable_config *c = container_of(rcu, struct mutable_config, rcu);
/* Publishes 'new_config' as the port's mutable config and schedules the old
 * one for RCU-deferred freeing.  Caller must hold the writer-side lock that
 * serializes config updates (not visible in this extract). */
99 static void assign_config_rcu(struct vport *vport,
100 struct mutable_config *new_config)
102 struct gre_vport *gre_vport = gre_vport_priv(vport);
103 struct mutable_config *old_config;
105 old_config = rcu_dereference(gre_vport->mutable);
106 rcu_assign_pointer(gre_vport->mutable, new_config);
107 call_rcu(&old_config->rcu, free_config);
/* Returns a pointer to the counter for the lookup category this config falls
 * into: {flow-based key match vs. fixed key} x {saddr bound vs. wildcard}. */
110 static unsigned int *find_port_pool(const struct mutable_config *mutable)
112 if (mutable->port_config.flags & GRE_F_IN_KEY_MATCH) {
113 if (mutable->port_config.saddr)
114 return &local_remote_ports;
116 return &remote_ports;
118 if (mutable->port_config.saddr)
119 return &key_local_remote_ports;
121 return &key_remote_ports;
/* Hash-table lookup key; vals[] is indexed by enum lookup_key (the enum
 * itself is not visible in this extract) and hashed as one unit. */
132 struct port_lookup_key {
133 u32 vals[4]; /* Contains enum lookup_key keys. */
134 const struct mutable_config *mutable;
137 /* Modifies 'target' to store the rcu_dereferenced pointer that was used to do
138 * the comparison. */
/* Equality callback for tbl_lookup(): true when all four key fields match. */
139 static int port_cmp(const struct tbl_node *node, void *target)
141 const struct gre_vport *gre_vport = gre_vport_table_cast(node);
142 struct port_lookup_key *lookup = target;
144 lookup->mutable = rcu_dereference(gre_vport->mutable);
146 return ((lookup->mutable->port_config.flags & GRE_F_IN_KEY_MATCH) ==
147 lookup->vals[LOOKUP_KEY_MATCH]) &&
148 lookup->mutable->port_config.daddr == lookup->vals[LOOKUP_DADDR] &&
149 lookup->mutable->port_config.in_key == lookup->vals[LOOKUP_KEY] &&
150 lookup->mutable->port_config.saddr == lookup->vals[LOOKUP_SADDR];
/* Hashes the four lookup key words with jhash2 (seed 0). */
153 static u32 port_hash(struct port_lookup_key *lookup)
155 return jhash2(lookup->vals, ARRAY_SIZE(lookup->vals), 0);
/* Inserts 'vport' into port_table, creating or expanding the table as needed,
 * and bumps the per-category port counter.
 * NOTE(review): lines are missing from this extract (the branch that detects
 * a NULL port_table, error handling for tbl_create/tbl_insert, and the final
 * return); comments describe visible logic only. */
158 static int add_port(struct vport *vport)
160 struct gre_vport *gre_vport = gre_vport_priv(vport);
161 struct port_lookup_key lookup;
165 struct tbl *new_table;
167 new_table = tbl_create(0);
171 rcu_assign_pointer(port_table, new_table);
/* Expand when the load factor exceeds one entry per bucket. */
173 } else if (tbl_count(port_table) > tbl_n_buckets(port_table)) {
174 struct tbl *old_table = port_table;
175 struct tbl *new_table;
177 new_table = tbl_expand(old_table);
178 if (IS_ERR(new_table))
179 return PTR_ERR(new_table);
181 rcu_assign_pointer(port_table, new_table);
182 tbl_deferred_destroy(old_table, NULL);
185 lookup.vals[LOOKUP_SADDR] = gre_vport->mutable->port_config.saddr;
186 lookup.vals[LOOKUP_DADDR] = gre_vport->mutable->port_config.daddr;
187 lookup.vals[LOOKUP_KEY] = gre_vport->mutable->port_config.in_key;
188 lookup.vals[LOOKUP_KEY_MATCH] = gre_vport->mutable->port_config.flags & GRE_F_IN_KEY_MATCH;
190 err = tbl_insert(port_table, &gre_vport->tbl_node, port_hash(&lookup));
194 (*find_port_pool(gre_vport->mutable))++;
/* Removes 'vport' from port_table and decrements its category counter. */
199 static int del_port(struct vport *vport)
201 struct gre_vport *gre_vport = gre_vport_priv(vport);
204 err = tbl_remove(port_table, &gre_vport->tbl_node);
208 (*find_port_pool(gre_vport->mutable))--;
/* find_port() selectors: search fixed-key ports, flow-keyed ports, or both. */
213 #define FIND_PORT_KEY (1 << 0)
214 #define FIND_PORT_MATCH (1 << 1)
215 #define FIND_PORT_ANY (FIND_PORT_KEY | FIND_PORT_MATCH)
/* Looks up the vport for a tunnel endpoint, trying progressively wider
 * matches: (saddr,daddr,key), then (any saddr,daddr,key), then the same two
 * with flow-based key matching.  On success stores the rcu_dereferenced
 * config in '*mutable'.  Called under rcu_read_lock.
 * NOTE(review): the 'port_type' parameter line and several early-return/goto
 * lines are not visible in this extract. */
217 static struct vport *find_port(__be32 saddr, __be32 daddr, __be32 key,
219 const struct mutable_config **mutable)
221 struct port_lookup_key lookup;
222 struct tbl *table = rcu_dereference(port_table);
223 struct tbl_node *tbl_node;
228 lookup.vals[LOOKUP_SADDR] = saddr;
229 lookup.vals[LOOKUP_DADDR] = daddr;
231 if (port_type & FIND_PORT_KEY) {
232 lookup.vals[LOOKUP_KEY] = key;
233 lookup.vals[LOOKUP_KEY_MATCH] = 0;
235 if (key_local_remote_ports) {
236 tbl_node = tbl_lookup(table, &lookup, port_hash(&lookup), port_cmp);
241 if (key_remote_ports) {
242 lookup.vals[LOOKUP_SADDR] = 0;
244 tbl_node = tbl_lookup(table, &lookup, port_hash(&lookup), port_cmp);
/* Restore saddr for the FIND_PORT_MATCH pass below. */
248 lookup.vals[LOOKUP_SADDR] = saddr;
252 if (port_type & FIND_PORT_MATCH) {
253 lookup.vals[LOOKUP_KEY] = 0;
254 lookup.vals[LOOKUP_KEY_MATCH] = GRE_F_IN_KEY_MATCH;
256 if (local_remote_ports) {
257 tbl_node = tbl_lookup(table, &lookup, port_hash(&lookup), port_cmp);
263 lookup.vals[LOOKUP_SADDR] = 0;
265 tbl_node = tbl_lookup(table, &lookup, port_hash(&lookup), port_cmp);
274 *mutable = lookup.mutable;
275 return gre_vport_to_vport(gre_vport_table_cast(tbl_node));
/* Returns false for addresses we must not reply to (multicast, broadcast,
 * loopback, zeronet); true otherwise.  Return statements are not visible in
 * this extract. */
278 static bool check_ipv4_address(__be32 addr)
280 if (ipv4_is_multicast(addr) || ipv4_is_lbcast(addr)
281 || ipv4_is_loopback(addr) || ipv4_is_zeronet(addr))
/* Decides whether an IPv4 packet may trigger a synthesized ICMP
 * fragmentation-needed reply: not L2/L3 broadcast, only the first fragment,
 * and never in response to ICMP error messages. */
287 static bool ipv4_should_icmp(struct sk_buff *skb)
289 struct iphdr *old_iph = ip_hdr(skb);
291 /* Don't respond to L2 broadcast. */
292 if (is_multicast_ether_addr(eth_hdr(skb)->h_dest))
295 /* Don't respond to L3 broadcast or invalid addresses. */
296 if (!check_ipv4_address(old_iph->daddr) ||
297 !check_ipv4_address(old_iph->saddr))
300 /* Only respond to the first fragment. */
301 if (old_iph->frag_off & htons(IP_OFFSET))
304 /* Don't respond to ICMP error messages. */
305 if (old_iph->protocol == IPPROTO_ICMP) {
306 u8 icmp_type, *icmp_typep;
308 icmp_typep = skb_header_pointer(skb, (u8 *)old_iph +
309 (old_iph->ihl << 2) +
310 offsetof(struct icmphdr, type) -
311 skb->data, sizeof(icmp_type),
/* Reply only to ICMP queries (echo etc.), never to error types. */
317 if (*icmp_typep > NR_ICMP_TYPES
318 || (*icmp_typep <= ICMP_PARAMETERPROB
319 && *icmp_typep != ICMP_ECHOREPLY
320 && *icmp_typep != ICMP_ECHO))
/* Fills 'nskb' with an IPv4 ICMP "fragmentation needed" message (type
 * DEST_UNREACH / code FRAG_NEEDED, MTU in the gateway field) echoing the
 * first 'payload_length' bytes of the original packet, then computes the
 * ICMP checksum over header plus copied payload. */
327 static void ipv4_build_icmp(struct sk_buff *skb, struct sk_buff *nskb,
328 unsigned int mtu, unsigned int payload_length)
330 struct iphdr *iph, *old_iph = ip_hdr(skb);
331 struct icmphdr *icmph;
334 iph = (struct iphdr *)skb_put(nskb, sizeof(struct iphdr));
335 icmph = (struct icmphdr *)skb_put(nskb, sizeof(struct icmphdr));
336 payload = skb_put(nskb, payload_length);
340 iph->ihl = sizeof(struct iphdr) >> 2;
341 iph->tos = (old_iph->tos & IPTOS_TOS_MASK) |
342 IPTOS_PREC_INTERNETCONTROL;
343 iph->tot_len = htons(sizeof(struct iphdr)
344 + sizeof(struct icmphdr)
346 get_random_bytes(&iph->id, sizeof(iph->id));
349 iph->protocol = IPPROTO_ICMP;
/* Reply goes back to the original sender: swap src/dst. */
350 iph->daddr = old_iph->saddr;
351 iph->saddr = old_iph->daddr;
356 icmph->type = ICMP_DEST_UNREACH;
357 icmph->code = ICMP_FRAG_NEEDED;
358 icmph->un.gateway = htonl(mtu);
361 nskb->csum = csum_partial((u8 *)icmph, sizeof(struct icmphdr), 0);
362 nskb->csum = skb_copy_and_csum_bits(skb, (u8 *)old_iph - skb->data,
363 payload, payload_length,
365 icmph->checksum = csum_fold(nskb->csum);
368 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
/* IPv6 analogue of ipv4_should_icmp(): reject multicast/unspecified
 * addresses and never reply to ICMPv6 error messages. */
369 static bool ipv6_should_icmp(struct sk_buff *skb)
371 struct ipv6hdr *old_ipv6h = ipv6_hdr(skb);
373 int payload_off = (u8 *)(old_ipv6h + 1) - skb->data;
374 u8 nexthdr = ipv6_hdr(skb)->nexthdr;
376 /* Check source address is valid. */
377 addr_type = ipv6_addr_type(&old_ipv6h->saddr);
378 if (addr_type & IPV6_ADDR_MULTICAST || addr_type == IPV6_ADDR_ANY)
381 /* Don't reply to unspecified addresses. */
382 if (ipv6_addr_type(&old_ipv6h->daddr) == IPV6_ADDR_ANY)
385 /* Don't respond to ICMP error messages. */
386 payload_off = ipv6_skip_exthdr(skb, payload_off, &nexthdr);
390 if (nexthdr == NEXTHDR_ICMP) {
391 u8 icmp_type, *icmp_typep;
393 icmp_typep = skb_header_pointer(skb, payload_off +
394 offsetof(struct icmp6hdr,
396 sizeof(icmp_type), &icmp_type);
398 if (!icmp_typep || !(*icmp_typep & ICMPV6_INFOMSG_MASK))
/* Builds an ICMPv6 Packet Too Big message in 'nskb' echoing the start of the
 * original packet; the checksum is finished with the IPv6 pseudo-header via
 * csum_ipv6_magic(). */
405 static void ipv6_build_icmp(struct sk_buff *skb, struct sk_buff *nskb,
406 unsigned int mtu, unsigned int payload_length)
408 struct ipv6hdr *ipv6h, *old_ipv6h = ipv6_hdr(skb);
409 struct icmp6hdr *icmp6h;
412 ipv6h = (struct ipv6hdr *)skb_put(nskb, sizeof(struct ipv6hdr));
413 icmp6h = (struct icmp6hdr *)skb_put(nskb, sizeof(struct icmp6hdr));
414 payload = skb_put(nskb, payload_length);
419 memset(&ipv6h->flow_lbl, 0, sizeof(ipv6h->flow_lbl));
420 ipv6h->payload_len = htons(sizeof(struct icmp6hdr)
422 ipv6h->nexthdr = NEXTHDR_ICMP;
423 ipv6h->hop_limit = IPV6_DEFAULT_HOPLIMIT;
/* Reply goes back to the original sender: swap src/dst. */
424 ipv6_addr_copy(&ipv6h->daddr, &old_ipv6h->saddr);
425 ipv6_addr_copy(&ipv6h->saddr, &old_ipv6h->daddr);
428 icmp6h->icmp6_type = ICMPV6_PKT_TOOBIG;
429 icmp6h->icmp6_code = 0;
430 icmp6h->icmp6_cksum = 0;
431 icmp6h->icmp6_mtu = htonl(mtu);
433 nskb->csum = csum_partial((u8 *)icmp6h, sizeof(struct icmp6hdr), 0);
434 nskb->csum = skb_copy_and_csum_bits(skb, (u8 *)old_ipv6h - skb->data,
435 payload, payload_length,
437 icmp6h->icmp6_cksum = csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr,
438 sizeof(struct icmp6hdr)
440 ipv6h->nexthdr, nskb->csum);
/* Synthesizes a PMTUD notification (ICMP frag-needed or ICMPv6 packet-too-big)
 * for a packet that exceeded the tunnel MTU and injects it back through
 * vport_receive() as if it arrived from the network.
 * NOTE(review): several lines (the flow_key parameter, early returns, and
 * allocation-failure handling) are missing from this extract. */
444 static bool send_frag_needed(struct vport *vport,
445 const struct mutable_config *mutable,
446 struct sk_buff *skb, unsigned int mtu,
449 unsigned int eth_hdr_len = ETH_HLEN;
450 unsigned int total_length = 0, header_length = 0, payload_length;
451 struct ethhdr *eh, *old_eh = eth_hdr(skb);
452 struct sk_buff *nskb;
/* Sanity-check the inner packet and per-family PMTUD eligibility. */
455 if (skb->protocol == htons(ETH_P_IP)) {
456 if (mtu < IP_MIN_MTU)
459 if (!ipv4_should_icmp(skb))
462 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
463 else if (skb->protocol == htons(ETH_P_IPV6)) {
464 if (mtu < IPV6_MIN_MTU)
467 /* In theory we should do PMTUD on IPv6 multicast messages but
468 * we don't have an address to send from so just fragment. */
469 if (ipv6_addr_type(&ipv6_hdr(skb)->daddr) & IPV6_ADDR_MULTICAST)
472 if (!ipv6_should_icmp(skb))
480 if (old_eh->h_proto == htons(ETH_P_8021Q))
481 eth_hdr_len = VLAN_ETH_HLEN;
/* Size the reply: ICMP header plus as much of the offending packet as the
 * protocol's conventional limit (576 / IPV6_MIN_MTU) and our MTU allow. */
483 payload_length = skb->len - eth_hdr_len;
484 if (skb->protocol == htons(ETH_P_IP)) {
485 header_length = sizeof(struct iphdr) + sizeof(struct icmphdr);
486 total_length = min_t(unsigned int, header_length +
487 payload_length, 576);
489 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
491 header_length = sizeof(struct ipv6hdr) +
492 sizeof(struct icmp6hdr);
493 total_length = min_t(unsigned int, header_length +
494 payload_length, IPV6_MIN_MTU);
498 total_length = min(total_length, mutable->mtu);
499 payload_length = total_length - header_length;
501 nskb = dev_alloc_skb(NET_IP_ALIGN + eth_hdr_len + header_length +
506 skb_reserve(nskb, NET_IP_ALIGN);
508 /* Ethernet / VLAN */
509 eh = (struct ethhdr *)skb_put(nskb, eth_hdr_len);
510 memcpy(eh->h_dest, old_eh->h_source, ETH_ALEN);
511 memcpy(eh->h_source, mutable->eth_addr, ETH_ALEN);
512 nskb->protocol = eh->h_proto = old_eh->h_proto;
513 if (old_eh->h_proto == htons(ETH_P_8021Q)) {
514 struct vlan_ethhdr *vh = (struct vlan_ethhdr *)eh;
516 vh->h_vlan_TCI = vlan_eth_hdr(skb)->h_vlan_TCI;
517 vh->h_vlan_encapsulated_proto = skb->protocol;
519 skb_reset_mac_header(nskb);
/* Protocol-specific ICMP payload. */
522 if (skb->protocol == htons(ETH_P_IP))
523 ipv4_build_icmp(skb, nskb, mtu, payload_length);
524 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
526 ipv6_build_icmp(skb, nskb, mtu, payload_length);
529 /* Assume that flow based keys are symmetric with respect to input
530 * and output and use the key that we were going to put on the
531 * outgoing packet for the fake received packet. If the keys are
532 * not symmetric then PMTUD needs to be disabled since we won't have
533 * any way of synthesizing packets. */
534 if (mutable->port_config.flags & GRE_F_IN_KEY_MATCH &&
535 mutable->port_config.flags & GRE_F_OUT_KEY_ACTION)
536 OVS_CB(nskb)->tun_id = flow_key;
538 compute_ip_summed(nskb, false);
539 vport_receive(vport, nskb);
/* Ensures 'skb' has at least 'headroom' bytes of writable headroom,
 * reallocating (with 16 bytes of slack) when it is short or the header is
 * shared.  Returns the usable skb or ERR_PTR(-ENOMEM); lines freeing the
 * original skb are not visible in this extract. */
544 static struct sk_buff *check_headroom(struct sk_buff *skb, int headroom)
546 if (skb_headroom(skb) < headroom || skb_header_cloned(skb)) {
547 struct sk_buff *nskb = skb_realloc_headroom(skb, headroom + 16);
550 return ERR_PTR(-ENOMEM);
553 set_skb_csum_bits(skb, nskb);
556 skb_set_owner_w(nskb, skb->sk);
/* Writes the GRE header that sits directly after the outer IP header:
 * protocol ETH_P_TEB, then (walking backwards from the end of the configured
 * tunnel_hlen) the optional key and checksum words. */
565 static void create_gre_header(struct sk_buff *skb,
566 const struct mutable_config *mutable)
568 struct iphdr *iph = ip_hdr(skb);
569 __be16 *flags = (__be16 *)(iph + 1);
570 __be16 *protocol = flags + 1;
571 __be32 *options = (__be32 *)((u8 *)iph + mutable->tunnel_hlen
572 - GRE_HEADER_SECTION);
574 *protocol = htons(ETH_P_TEB);
577 /* Work backwards over the options so the checksum is last. */
578 if (mutable->port_config.out_key ||
579 mutable->port_config.flags & GRE_F_OUT_KEY_ACTION) {
/* Per-flow tunnel ID takes precedence over the fixed out_key. */
582 if (mutable->port_config.flags & GRE_F_OUT_KEY_ACTION)
583 *options = OVS_CB(skb)->tun_id;
585 *options = mutable->port_config.out_key;
590 if (mutable->port_config.flags & GRE_F_OUT_CSUM) {
594 *(__sum16 *)options = csum_fold(skb_checksum(skb,
595 sizeof(struct iphdr),
596 skb->len - sizeof(struct iphdr),
/* Validates the GRE checksum of a received packet when the GRE_CSUM flag is
 * set, reusing hardware checksum state where possible.
 * NOTE(review): several switch cases and the final comparison/return are not
 * visible in this extract. */
601 static int check_checksum(struct sk_buff *skb)
603 struct iphdr *iph = ip_hdr(skb);
604 __be16 flags = *(__be16 *)(iph + 1);
607 if (flags & GRE_CSUM) {
608 switch (skb->ip_summed) {
609 case CHECKSUM_COMPLETE:
610 csum = csum_fold(skb->csum);
618 csum = __skb_checksum_complete(skb);
619 skb->ip_summed = CHECKSUM_COMPLETE;
/* Parses the GRE header following 'iph', rejecting unsupported VERSION and
 * ROUTING bits and non-TEB payloads, and returns the GRE header length in
 * bytes (flags and key are returned through the out parameters; error
 * returns are not visible in this extract). */
627 static int parse_gre_header(struct iphdr *iph, __be16 *flags, __be32 *key)
629 /* IP and ICMP protocol handlers check that the IHL is valid. */
630 __be16 *flagsp = (__be16 *)((u8 *)iph + (iph->ihl << 2));
631 __be16 *protocol = flagsp + 1;
632 __be32 *options = (__be32 *)(protocol + 1);
637 if (*flags & (GRE_VERSION | GRE_ROUTING))
640 if (*protocol != htons(ETH_P_TEB))
643 hdr_len = GRE_HEADER_SECTION;
645 if (*flags & GRE_CSUM) {
646 hdr_len += GRE_HEADER_SECTION;
650 if (*flags & GRE_KEY) {
651 hdr_len += GRE_HEADER_SECTION;
658 if (*flags & GRE_SEQ)
659 hdr_len += GRE_HEADER_SECTION;
/* Combines the outer TOS with the inner packet's ECN bits per
 * INET_ECN_encapsulate() so congestion marks survive tunneling. */
664 static inline u8 ecn_encapsulate(u8 tos, struct sk_buff *skb)
668 if (skb->protocol == htons(ETH_P_IP))
669 inner = ((struct iphdr *)skb_network_header(skb))->tos;
670 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
671 else if (skb->protocol == htons(ETH_P_IPV6))
672 inner = ipv6_get_dsfield((struct ipv6hdr *)skb_network_header(skb));
677 return INET_ECN_encapsulate(tos, inner);
/* On decapsulation, if the outer header carried a Congestion Experienced
 * mark, propagates it onto the inner IPv4/IPv6 header (skipping a VLAN tag
 * if present). */
680 static inline void ecn_decapsulate(u8 tos, struct sk_buff *skb)
682 if (INET_ECN_is_ce(tos)) {
683 __be16 protocol = skb->protocol;
684 unsigned int nw_header = skb_network_header(skb) - skb->data;
686 if (skb->protocol == htons(ETH_P_8021Q)) {
687 if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN)))
690 protocol = vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
691 nw_header += VLAN_HLEN;
694 if (protocol == htons(ETH_P_IP)) {
695 if (unlikely(!pskb_may_pull(skb, nw_header
696 + sizeof(struct iphdr))))
699 IP_ECN_set_ce((struct iphdr *)(nw_header + skb->data));
701 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
702 else if (protocol == htons(ETH_P_IPV6)) {
703 if (unlikely(!pskb_may_pull(skb, nw_header
704 + sizeof(struct ipv6hdr))))
707 IP6_ECN_set_ce((struct ipv6hdr *)(nw_header
/* Segments a GSO skb into a list of linear skbs; lines consuming the
 * original skb and the non-GSO return path are not visible in this extract. */
714 static struct sk_buff *handle_gso(struct sk_buff *skb)
716 if (skb_is_gso(skb)) {
717 struct sk_buff *nskb = skb_gso_segment(skb, 0);
/* Resolves any pending partial checksum in software before we encapsulate,
 * since the GRE checksum must cover final bytes; otherwise marks the skb as
 * needing no checksum work. */
726 static int handle_csum_offload(struct sk_buff *skb)
728 if (skb->ip_summed == CHECKSUM_PARTIAL)
729 return skb_checksum_help(skb);
731 skb->ip_summed = CHECKSUM_NONE;
736 /* Called with rcu_read_lock. */
/* ICMP error handler for our GRE packets: on a frag-needed error for a
 * tunnel we originated, rewrites header offsets to expose the inner packet
 * and relays a synthetic PMTUD notification to the inner sender via
 * send_frag_needed().  NOTE(review): many early-return/goto lines are
 * missing from this extract. */
737 static void gre_err(struct sk_buff *skb, u32 info)
740 const struct mutable_config *mutable;
741 const int type = icmp_hdr(skb)->type;
742 const int code = icmp_hdr(skb)->code;
743 int mtu = ntohs(icmp_hdr(skb)->un.frag.mtu);
748 int tunnel_hdr_len, tot_hdr_len;
749 unsigned int orig_mac_header;
750 unsigned int orig_nw_header;
752 if (type != ICMP_DEST_UNREACH || code != ICMP_FRAG_NEEDED)
755 /* The minimum size packet that we would actually be able to process:
756 * encapsulating IP header, minimum GRE header, Ethernet header,
757 * inner IPv4 header. */
758 if (!pskb_may_pull(skb, sizeof(struct iphdr) + GRE_HEADER_SECTION +
759 ETH_HLEN + sizeof(struct iphdr)))
762 iph = (struct iphdr *)skb->data;
764 tunnel_hdr_len = parse_gre_header(iph, &flags, &key);
765 if (tunnel_hdr_len < 0)
/* The quoted packet was sent BY us, so src/dst are in output order here. */
768 vport = find_port(iph->saddr, iph->daddr, key, FIND_PORT_ANY, &mutable);
772 /* Packets received by this function were previously sent by us, so
773 * any comparisons should be to the output values, not the input.
774 * However, it's not really worth it to have a hash table based on
775 * output keys (especially since ICMP error handling of tunneled packets
776 * isn't that reliable anyways). Therefore, we do a lookup based on the
777 * out key as if it were the in key and then check to see if the input
778 * and output keys are the same. */
779 if (mutable->port_config.in_key != mutable->port_config.out_key)
782 if (!!(mutable->port_config.flags & GRE_F_IN_KEY_MATCH) !=
783 !!(mutable->port_config.flags & GRE_F_OUT_KEY_ACTION))
786 if ((mutable->port_config.flags & GRE_F_OUT_CSUM) && !(flags & GRE_CSUM))
789 tunnel_hdr_len += iph->ihl << 2;
/* Save offsets so we can restore the skb before returning to the stack. */
791 orig_mac_header = skb_mac_header(skb) - skb->data;
792 orig_nw_header = skb_network_header(skb) - skb->data;
793 skb_set_mac_header(skb, tunnel_hdr_len);
795 tot_hdr_len = tunnel_hdr_len + ETH_HLEN;
797 skb->protocol = eth_hdr(skb)->h_proto;
798 if (skb->protocol == htons(ETH_P_8021Q)) {
799 tot_hdr_len += VLAN_HLEN;
800 skb->protocol = vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
803 skb_set_network_header(skb, tot_hdr_len);
806 if (skb->protocol == htons(ETH_P_IP))
807 tot_hdr_len += sizeof(struct iphdr);
808 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
809 else if (skb->protocol == htons(ETH_P_IPV6))
810 tot_hdr_len += sizeof(struct ipv6hdr);
815 if (!pskb_may_pull(skb, tot_hdr_len))
/* Routers may report an MTU below the protocol minimum; sanity-check it
 * against the quoted inner packet before trusting it. */
818 if (skb->protocol == htons(ETH_P_IP)) {
819 if (mtu < IP_MIN_MTU) {
820 if (ntohs(ip_hdr(skb)->tot_len) >= IP_MIN_MTU)
827 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
828 else if (skb->protocol == htons(ETH_P_IPV6)) {
829 if (mtu < IPV6_MIN_MTU) {
830 unsigned int packet_length = sizeof(struct ipv6hdr) +
831 ntohs(ipv6_hdr(skb)->payload_len);
833 if (packet_length >= IPV6_MIN_MTU
834 || ntohs(ipv6_hdr(skb)->payload_len) == 0)
/* Temporarily strip the tunnel header so the inner packet is exposed. */
842 __pskb_pull(skb, tunnel_hdr_len);
843 send_frag_needed(vport, mutable, skb, mtu, key);
844 skb_push(skb, tunnel_hdr_len);
847 skb_set_mac_header(skb, orig_mac_header);
848 skb_set_network_header(skb, orig_nw_header);
849 skb->protocol = htons(ETH_P_IP);
852 /* Called with rcu_read_lock. */
/* GRE receive path: validates checksum and header, looks up the owning
 * vport by (daddr, saddr, key), strips the tunnel header, propagates ECN,
 * records the tunnel ID for flow-keyed ports, and hands the inner Ethernet
 * frame to vport_receive().  NOTE(review): several error-path lines are
 * missing from this extract. */
853 static int gre_rcv(struct sk_buff *skb)
856 const struct mutable_config *mutable;
862 if (!pskb_may_pull(skb, GRE_HEADER_SECTION + ETH_HLEN))
865 if (!check_checksum(skb))
870 hdr_len = parse_gre_header(iph, &flags, &key);
/* Receive direction: our local address is daddr, the peer is saddr. */
874 vport = find_port(iph->daddr, iph->saddr, key, FIND_PORT_ANY, &mutable);
876 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
880 if ((mutable->port_config.flags & GRE_F_IN_CSUM) && !(flags & GRE_CSUM)) {
881 vport_record_error(vport, VPORT_E_RX_CRC);
885 if (!pskb_pull(skb, hdr_len) || !pskb_may_pull(skb, ETH_HLEN)) {
886 vport_record_error(vport, VPORT_E_RX_ERROR);
890 skb->pkt_type = PACKET_HOST;
891 skb->protocol = eth_type_trans(skb, skb->dev);
892 skb_postpull_rcsum(skb, skb_transport_header(skb), hdr_len + ETH_HLEN);
897 skb_reset_network_header(skb);
899 ecn_decapsulate(iph->tos, skb);
901 if (mutable->port_config.flags & GRE_F_IN_KEY_MATCH)
902 OVS_CB(skb)->tun_id = key;
904 OVS_CB(skb)->tun_id = 0;
906 skb_push(skb, ETH_HLEN);
907 compute_ip_summed(skb, false);
909 vport_receive(vport, skb);
/* Encapsulates one (possibly GSO-segmented) skb: ensures headroom, performs
 * per-family PMTUD checks (emitting frag-needed via send_frag_needed() when
 * the inner packet exceeds 'mtu'), prepends the outer IP+GRE headers from
 * the 'iph' template, and transmits with ip_local_out().  Returns bytes sent
 * on success.  NOTE(review): error-path lines and the success return are
 * missing from this extract. */
918 static int build_packet(struct vport *vport, const struct mutable_config *mutable,
919 struct iphdr *iph, struct rtable *rt, int max_headroom,
920 int mtu, struct sk_buff *skb)
923 struct iphdr *new_iph;
924 int orig_len = skb->len;
925 __be16 frag_off = iph->frag_off;
927 skb = check_headroom(skb, max_headroom);
928 if (unlikely(IS_ERR(skb)))
931 err = handle_csum_offload(skb);
935 if (skb->protocol == htons(ETH_P_IP)) {
936 struct iphdr *old_iph = ip_hdr(skb);
/* DF set and packet larger than path MTU: notify the sender. */
938 if ((old_iph->frag_off & htons(IP_DF)) &&
939 mtu < ntohs(old_iph->tot_len)) {
940 if (send_frag_needed(vport, mutable, skb, mtu, OVS_CB(skb)->tun_id))
945 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
946 else if (skb->protocol == htons(ETH_P_IPV6)) {
947 unsigned int packet_length = skb->len - ETH_HLEN
948 - (eth_hdr(skb)->h_proto == htons(ETH_P_8021Q) ? VLAN_HLEN : 0);
950 /* IPv6 requires PMTUD if the packet is above the minimum MTU. */
951 if (packet_length > IPV6_MIN_MTU)
952 frag_off = htons(IP_DF);
954 if (mtu < packet_length) {
955 if (send_frag_needed(vport, mutable, skb, mtu, OVS_CB(skb)->tun_id))
961 skb_reset_transport_header(skb);
962 new_iph = (struct iphdr *)skb_push(skb, mutable->tunnel_hlen);
963 skb_reset_network_header(skb);
965 memcpy(new_iph, iph, sizeof(struct iphdr));
966 new_iph->frag_off = frag_off;
967 ip_select_ident(new_iph, &rt->u.dst, NULL);
969 create_gre_header(skb, mutable);
971 /* Allow our local IP stack to fragment the outer packet even if the
972 * DF bit is set as a last resort. */
975 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
976 IPCB(skb)->flags = 0;
978 err = ip_local_out(skb);
979 if (likely(net_xmit_eval(err) == 0))
982 vport_record_error(vport, VPORT_E_TX_ERROR);
989 vport_record_error(vport, VPORT_E_TX_DROPPED);
/* Transmit entry point for the GRE vport.  Validates the inner headers,
 * builds the outer IP header template (TOS/TTL/DF per config or inherited
 * from the inner packet), routes to the remote endpoint, sizes the tunnel
 * MTU, then segments (GSO) and sends each resulting skb via build_packet().
 * NOTE(review): many lines — error labels, loop framing, and some
 * assignments — are missing from this extract. */
994 static int gre_send(struct vport *vport, struct sk_buff *skb)
996 struct gre_vport *gre_vport = gre_vport_priv(vport);
997 const struct mutable_config *mutable = rcu_dereference(gre_vport->mutable);
999 struct iphdr *old_iph;
1006 /* Validate the protocol headers before we try to use them. */
1007 if (skb->protocol == htons(ETH_P_8021Q)) {
1008 if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN)))
1011 skb->protocol = vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
1012 skb_set_network_header(skb, VLAN_ETH_HLEN);
1015 if (skb->protocol == htons(ETH_P_IP)) {
1016 if (unlikely(!pskb_may_pull(skb, skb_network_header(skb)
1017 + sizeof(struct iphdr) - skb->data)))
1020 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1021 else if (skb->protocol == htons(ETH_P_IPV6)) {
1022 if (unlikely(!pskb_may_pull(skb, skb_network_header(skb)
1023 + sizeof(struct ipv6hdr) - skb->data)))
1027 old_iph = ip_hdr(skb);
/* Outer TOS: fixed by config, or inherited from the inner packet, then
 * blended with the inner ECN bits. */
1029 iph.tos = mutable->port_config.tos;
1030 if (mutable->port_config.flags & GRE_F_TOS_INHERIT) {
1031 if (skb->protocol == htons(ETH_P_IP))
1032 iph.tos = old_iph->tos;
1033 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1034 else if (skb->protocol == htons(ETH_P_IPV6))
1035 iph.tos = ipv6_get_dsfield(ipv6_hdr(skb));
1038 iph.tos = ecn_encapsulate(iph.tos, skb);
/* Route to the configured remote endpoint. */
1041 struct flowi fl = { .nl_u = { .ip4_u =
1042 { .daddr = mutable->port_config.daddr,
1043 .saddr = mutable->port_config.saddr,
1044 .tos = RT_TOS(iph.tos) } },
1045 .proto = IPPROTO_GRE };
1047 if (ip_route_output_key(&init_net, &rt, &fl))
/* Outer TTL: fixed by config, inherited, or the route's hop-limit metric. */
1051 iph.ttl = mutable->port_config.ttl;
1052 if (mutable->port_config.flags & GRE_F_TTL_INHERIT) {
1053 if (skb->protocol == htons(ETH_P_IP))
1054 iph.ttl = old_iph->ttl;
1055 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1056 else if (skb->protocol == htons(ETH_P_IPV6))
1057 iph.ttl = ipv6_hdr(skb)->hop_limit;
1061 iph.ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
1063 iph.frag_off = (mutable->port_config.flags & GRE_F_PMTUD) ? htons(IP_DF) : 0;
/* Effective inner MTU = path MTU minus tunnel and VLAN overhead. */
1065 mtu = dst_mtu(&rt->u.dst)
1067 - mutable->tunnel_hlen
1068 - (eth_hdr(skb)->h_proto == htons(ETH_P_8021Q) ? VLAN_HLEN : 0);
1072 if (skb->protocol == htons(ETH_P_IP)) {
1073 iph.frag_off |= old_iph->frag_off & htons(IP_DF);
1074 mtu = max(mtu, IP_MIN_MTU);
1076 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1077 else if (skb->protocol == htons(ETH_P_IPV6))
1078 mtu = max(mtu, IPV6_MIN_MTU);
1082 iph.ihl = sizeof(struct iphdr) >> 2;
1083 iph.protocol = IPPROTO_GRE;
1084 iph.daddr = rt->rt_dst;
1085 iph.saddr = rt->rt_src;
1090 skb_dst_set(skb, &rt->u.dst);
1092 /* If we are doing GSO on a pskb it is better to make sure that the
1093 * headroom is correct now. We will only have to copy the portion in
1094 * the linear data area and GSO will preserve headroom when it creates
1095 * the segments. This is particularly beneficial on Xen where we get
1096 * lots of GSO pskbs. Conversely, we delay copying if it is just to
1097 * get our own writable clone because GSO may do the copy for us. */
1098 max_headroom = LL_RESERVED_SPACE(rt->u.dst.dev) + rt->u.dst.header_len
1099 + mutable->tunnel_hlen;
1101 if (skb_headroom(skb) < max_headroom) {
1102 skb = check_headroom(skb, max_headroom);
1103 if (unlikely(IS_ERR(skb))) {
1104 vport_record_error(vport, VPORT_E_TX_DROPPED);
1109 forward_ip_summed(skb);
1111 if (unlikely(vswitch_skb_checksum_setup(skb)))
1114 skb = handle_gso(skb);
1115 if (unlikely(IS_ERR(skb))) {
1116 vport_record_error(vport, VPORT_E_TX_DROPPED);
1120 /* Process GSO segments. Try to do any work for the entire packet that
1121 * doesn't involve actually writing to it before this point. */
1124 struct sk_buff *next_skb = skb->next;
1127 orig_len += build_packet(vport, mutable, &iph, rt, max_headroom, mtu, skb);
1136 vport_record_error(vport, VPORT_E_TX_ERROR);
/* IPPROTO_GRE protocol hooks registered in gre_init().  NOTE(review): the
 * .handler (receive) initializer is not visible in this extract. */
1141 static struct net_protocol gre_protocol_handlers = {
1143 .err_handler = gre_err,
/* Module init: registers the GRE protocol handlers with the IP stack. */
1146 static int gre_init(void)
1150 err = inet_add_protocol(&gre_protocol_handlers, IPPROTO_GRE);
1152 printk(KERN_WARNING "openvswitch: cannot register gre protocol handler\n");
/* Module teardown: destroys the port table and unregisters the protocol. */
1157 static void gre_exit(void)
1159 tbl_destroy(port_table, NULL);
1160 inet_del_protocol(&gre_protocol_handlers, IPPROTO_GRE);
/* Copies and validates a user-supplied port config into 'mutable': requires
 * a remote address, rejects duplicate endpoints (unless it is 'cur_vport'
 * itself being modified), normalizes unused key fields to 0, and computes
 * the resulting tunnel header length.  NOTE(review): error returns are not
 * visible in this extract. */
1163 static int set_config(const struct vport *cur_vport,
1164 struct mutable_config *mutable, const void __user *uconfig)
1166 const struct vport *old_vport;
1167 const struct mutable_config *old_mutable;
1170 if (copy_from_user(&mutable->port_config, uconfig, sizeof(struct gre_port_config)))
1173 if (mutable->port_config.daddr == 0)
1176 if (mutable->port_config.flags & GRE_F_IN_KEY_MATCH) {
1177 port_type = FIND_PORT_MATCH;
1178 mutable->port_config.in_key = 0;
1180 port_type = FIND_PORT_KEY;
/* Reject a config that collides with a different existing port. */
1182 old_vport = find_port(mutable->port_config.saddr,
1183 mutable->port_config.daddr,
1184 mutable->port_config.in_key, port_type,
1187 if (old_vport && old_vport != cur_vport)
1190 if (mutable->port_config.flags & GRE_F_OUT_KEY_ACTION)
1191 mutable->port_config.out_key = 0;
/* Header length = outer IP + GRE base, plus one 4-byte section for each
 * enabled option (checksum, key). */
1193 mutable->tunnel_hlen = sizeof(struct iphdr) + GRE_HEADER_SECTION;
1195 if (mutable->port_config.flags & GRE_F_OUT_CSUM)
1196 mutable->tunnel_hlen += GRE_HEADER_SECTION;
1198 if (mutable->port_config.out_key ||
1199 mutable->port_config.flags & GRE_F_OUT_KEY_ACTION)
1200 mutable->tunnel_hlen += GRE_HEADER_SECTION;
/* Creates a GRE vport: allocates the vport and its mutable config, fills in
 * defaults (random MAC, Ethernet MTU), applies the user config, and inserts
 * the port into the lookup table.  Returns the vport or ERR_PTR. */
1205 static struct vport *gre_create(const char *name, const void __user *config)
1207 struct vport *vport;
1208 struct gre_vport *gre_vport;
1211 vport = vport_alloc(sizeof(struct gre_vport), &gre_vport_ops);
1212 if (IS_ERR(vport)) {
1213 err = PTR_ERR(vport);
1217 gre_vport = gre_vport_priv(vport);
1219 strcpy(gre_vport->name, name);
1221 gre_vport->mutable = kmalloc(sizeof(struct mutable_config), GFP_KERNEL);
1222 if (!gre_vport->mutable) {
1224 goto error_free_vport;
1227 vport_gen_rand_ether_addr(gre_vport->mutable->eth_addr);
1228 gre_vport->mutable->mtu = ETH_DATA_LEN;
1230 err = set_config(NULL, gre_vport->mutable, config);
1232 goto error_free_mutable;
1234 err = add_port(vport);
1236 goto error_free_mutable;
/* Error unwind (labels not visible in this extract). */
1241 kfree(gre_vport->mutable);
1245 return ERR_PTR(err);
/* Applies a new user config to an existing port.  The port is re-hashed
 * (del_port/add_port) only when a field that participates in the lookup key
 * changed; the config itself is swapped in under RCU. */
1248 static int gre_modify(struct vport *vport, const void __user *config)
1250 struct gre_vport *gre_vport = gre_vport_priv(vport);
1251 struct mutable_config *mutable;
1253 int update_hash = 0;
1255 mutable = kmemdup(gre_vport->mutable, sizeof(struct mutable_config), GFP_KERNEL);
1261 err = set_config(vport, mutable, config);
1265 /* Only remove the port from the hash table if something that would
1266 * affect the lookup has changed. */
1267 if (gre_vport->mutable->port_config.saddr != mutable->port_config.saddr ||
1268 gre_vport->mutable->port_config.daddr != mutable->port_config.daddr ||
1269 gre_vport->mutable->port_config.in_key != mutable->port_config.in_key ||
1270 (gre_vport->mutable->port_config.flags & GRE_F_IN_KEY_MATCH) !=
1271 (mutable->port_config.flags & GRE_F_IN_KEY_MATCH))
1275 /* This update is not atomic but the lookup uses the config, which
1276 * serves as an inherent double check. */
1278 err = del_port(vport);
1283 assign_config_rcu(vport, mutable);
1286 err = add_port(vport);
/* Destroys the port: removes it from the hash table (only if it is actually
 * present — a prior failed modify may have left it out) and frees the
 * config.  The del_port() call is not visible in this extract. */
1299 static int gre_destroy(struct vport *vport)
1301 struct gre_vport *gre_vport = gre_vport_priv(vport);
1303 const struct mutable_config *old_mutable;
1305 /* Do a hash table lookup to make sure that the port exists. It should
1306 * exist but might not if a modify failed earlier. */
1307 if (gre_vport->mutable->port_config.flags & GRE_F_IN_KEY_MATCH)
1308 port_type = FIND_PORT_MATCH;
1310 port_type = FIND_PORT_KEY;
1312 if (vport == find_port(gre_vport->mutable->port_config.saddr,
1313 gre_vport->mutable->port_config.daddr,
1314 gre_vport->mutable->port_config.in_key, port_type, &old_mutable))
1317 kfree(gre_vport->mutable);
/* Sets the port MTU via RCU copy-update of the mutable config.
 * NOTE(review): the line assigning mutable->mtu = mtu is not visible in
 * this extract. */
1323 static int gre_set_mtu(struct vport *vport, int mtu)
1325 struct gre_vport *gre_vport = gre_vport_priv(vport);
1326 struct mutable_config *mutable;
1328 mutable = kmemdup(gre_vport->mutable, sizeof(struct mutable_config), GFP_KERNEL);
1333 assign_config_rcu(vport, mutable);
/* Sets the port's Ethernet address via RCU copy-update of the config. */
1338 static int gre_set_addr(struct vport *vport, const unsigned char *addr)
1340 struct gre_vport *gre_vport = gre_vport_priv(vport);
1341 struct mutable_config *mutable;
1343 mutable = kmemdup(gre_vport->mutable, sizeof(struct mutable_config), GFP_KERNEL);
1347 memcpy(mutable->eth_addr, addr, ETH_ALEN);
1348 assign_config_rcu(vport, mutable);
/* Returns the port's device name. */
1354 static const char *gre_get_name(const struct vport *vport)
1356 const struct gre_vport *gre_vport = gre_vport_priv(vport);
1357 return gre_vport->name;
/* Returns the port's Ethernet address from the RCU-protected config. */
1360 static const unsigned char *gre_get_addr(const struct vport *vport)
1362 const struct gre_vport *gre_vport = gre_vport_priv(vport);
1363 return rcu_dereference(gre_vport->mutable)->eth_addr;
/* Returns the port's MTU from the RCU-protected config. */
1366 static int gre_get_mtu(const struct vport *vport)
1368 const struct gre_vport *gre_vport = gre_vport_priv(vport);
1369 return rcu_dereference(gre_vport->mutable)->mtu;
1372 struct vport_ops gre_vport_ops = {
1374 .flags = VPORT_F_GEN_STATS | VPORT_F_TUN_ID,
1377 .create = gre_create,
1378 .modify = gre_modify,
1379 .destroy = gre_destroy,
1380 .set_mtu = gre_set_mtu,
1381 .set_addr = gre_set_addr,
1382 .get_name = gre_get_name,
1383 .get_addr = gre_get_addr,
1384 .get_dev_flags = vport_gen_get_dev_flags,
1385 .is_running = vport_gen_is_running,
1386 .get_operstate = vport_gen_get_operstate,
1387 .get_mtu = gre_get_mtu,