gre: Add kernel GRE support.
openvswitch: datapath/linux-2.6/compat-2.6/ip_gre.c
1 /* ip_gre driver port to Linux 2.6.18 and greater */
2
3 #include <linux/version.h>
4 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22)
5 #define HAVE_NETDEV_STATS
6 #endif
7 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24)
8 #define HAVE_NETDEV_HEADER_OPS
9 #endif
10 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,26)
11 #define HAVE_NETDEV_NEEDED_HEADROOM
12 #endif
13
14 /*
15  *      Linux NET3:     GRE over IP protocol decoder.
16  *
17  *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
18  *
19  *      This program is free software; you can redistribute it and/or
20  *      modify it under the terms of the GNU General Public License
21  *      as published by the Free Software Foundation; either version
22  *      2 of the License, or (at your option) any later version.
23  *
24  */
25
26 #include <linux/capability.h>
27 #include <linux/module.h>
28 #include <linux/types.h>
29 #include <linux/kernel.h>
30 #include <asm/uaccess.h>
31 #include <linux/skbuff.h>
32 #include <linux/netdevice.h>
33 #include <linux/in.h>
34 #include <linux/tcp.h>
35 #include <linux/udp.h>
36 #include <linux/if_arp.h>
37 #include <linux/mroute.h>
38 #include <linux/init.h>
39 #include <linux/in6.h>
40 #include <linux/inetdevice.h>
41 #include <linux/igmp.h>
42 #include <linux/netfilter_ipv4.h>
43 #include <linux/etherdevice.h>
44 #include <linux/if_ether.h>
45
46 #include <net/sock.h>
47 #include <net/ip.h>
48 #include <net/icmp.h>
49 #include <net/protocol.h>
50 #include <net/ipip.h>
51 #include <net/arp.h>
52 #include <net/checksum.h>
53 #include <net/dsfield.h>
54 #include <net/inet_ecn.h>
55 #include <net/xfrm.h>
56 #include <net/net_namespace.h>
57 #include <net/netns/generic.h>
58
59 #ifdef CONFIG_IPV6
60 #include <net/ipv6.h>
61 #include <net/ip6_fib.h>
62 #include <net/ip6_route.h>
63 #endif
64
65 #include "compat.h"
66 #include "openvswitch/gre.h"
67
68 #ifndef GRE_IOCTL_ONLY
69 #include <net/rtnetlink.h>
70 #endif
71
72 /*
73    Problems & solutions
74    --------------------
75
76    1. The most important issue is detecting local dead loops.
77    They would cause complete host lockup in transmit, which
78    would be "resolved" by stack overflow or, if queueing is enabled,
79    by infinite looping in net_bh.
80
81    We cannot track such dead loops during route installation;
82    it is an infeasible task. The most general solution would be
83    to keep an skb->encapsulation counter (a sort of local ttl)
84    and silently drop the packet when it expires. It is the best
85    solution, but it presupposes maintaining a new variable in ALL
86    skbs, even if no tunneling is used.
87
88    Current solution: HARD_TX_LOCK lock breaks dead loops.
89
90
91
92    2. Networking dead loops would not kill routers, but would really
93    kill the network. The IP hop limit plays the role of "t->recursion" in
94    this case, if we copy it from the packet being encapsulated to the upper
95    header. It is a very good solution, but it introduces two problems:
96
97    - Routing protocols that use packets with ttl=1 (OSPF, RIP2)
98      do not work over tunnels.
99    - traceroute does not work. I planned to relay ICMP from the tunnel,
100      so that this problem would be solved and the traceroute output
101      would be even more informative. This idea appeared to be wrong:
102      only Linux complies with RFC 1812 now (yes, guys, Linux is the only
103      true router now :-)); all routers (at least, in my neighbourhood)
104      return only 8 bytes of payload. It is the end.
105
106    Hence, if we want OSPF to work or traceroute to say something reasonable,
107    we should search for another solution.
108
109    One of them is to parse the packet, trying to detect inner encapsulation
110    made by our node. It is difficult or even impossible, especially
111    taking fragmentation into account. In short, it is not a solution at all.
112
113    Current solution: The solution was UNEXPECTEDLY SIMPLE.
114    We force the DF flag on tunnels with a preconfigured hop limit,
115    that is ALL. :-) Well, it does not remove the problem completely,
116    but exponential growth of network traffic is changed to linear
117    (branches that exceed the pmtu are pruned) and the tunnel mtu
118    quickly degrades to a value <68, where looping stops.
119    Yes, it is not good if there is a router in the loop
120    which does not force DF, even when the encapsulated packets have DF set.
121    But that is not our problem! Nobody could accuse us; we did
122    all that we could. Even if it was your gated that injected the
123    fatal route into the network, even if it was you who configured the
124    fatal static route: you are innocent. :-)
125
126
127
128    3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
129    practically identical code. It would be good to glue them
130    together, but it is not very evident how to make them modular.
131    sit is an integral part of IPv6; ipip and gre are naturally modular.
132    We could extract the common parts (hash table, ioctl etc.)
133    into a separate module (ip_tunnel.c).
134
135    Alexey Kuznetsov.
136  */
137
138 #ifndef GRE_IOCTL_ONLY
139 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
140 static struct rtnl_link_ops ipgre_tap_ops __read_mostly;
141 #endif
142 static int ipgre_tunnel_init(struct net_device *dev);
143 static void ipgre_tunnel_setup(struct net_device *dev);
144 static void ipgre_tap_setup(struct net_device *dev);
145 static int ipgre_tunnel_bind_dev(struct net_device *dev);
146
147 #define HASH_SIZE  16
148
149 static int ipgre_net_id;
150 struct ipgre_net {
151         struct ip_tunnel *tunnels[4][HASH_SIZE];
152
153         struct net_device *fb_tunnel_dev;
154 };
155
156 /* Tunnel hash table */
157
158 /*
159    4 hash tables:
160
161    3: (remote,local)
162    2: (remote,*)
163    1: (*,local)
164    0: (*,*)
165
166    We require an exact key match, i.e. if a key is present in the packet
167    it will match only a tunnel with the same key; if it is not present,
168    it will match only a keyless tunnel.
169
170    All keyless packets that do not match a configured keyless tunnel
171    will match the fallback tunnel.
172  */
173
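/* HASH() below folds a 32-bit address or key down to a 4-bit bucket
 * index (HASH_SIZE is 16).  Illustrative example (values assumed, not
 * from this file): for key 0x12345678,
 * (0x12345678 ^ 0x01234567) & 0xF == 0xF, i.e. bucket 15. */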
174 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
175
176 #define tunnels_r_l     tunnels[3]
177 #define tunnels_r       tunnels[2]
178 #define tunnels_l       tunnels[1]
179 #define tunnels_wc      tunnels[0]
180
181 static DEFINE_RWLOCK(ipgre_lock);
182
183 /* Given src, dst and key, find the appropriate tunnel for an input packet. */
184
185 static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
186                                               __be32 remote, __be32 local,
187                                               __be32 key, __be16 gre_proto)
188 {
189         struct net *net = dev_net(dev);
190         int link = dev->ifindex;
191         unsigned h0 = HASH(remote);
192         unsigned h1 = HASH(key);
193         struct ip_tunnel *t, *cand = NULL;
194         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
195         int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
196                        ARPHRD_ETHER : ARPHRD_IPGRE;
197         int score, cand_score = 4;
198
199         for (t = ign->tunnels_r_l[h0^h1]; t; t = t->next) {
200                 if (local != t->parms.iph.saddr ||
201                     remote != t->parms.iph.daddr ||
202                     key != t->parms.i_key ||
203                     !(t->dev->flags & IFF_UP))
204                         continue;
205
206                 if (t->dev->type != ARPHRD_IPGRE &&
207                     t->dev->type != dev_type)
208                         continue;
209
210                 score = 0;
211                 if (t->parms.link != link)
212                         score |= 1;
213                 if (t->dev->type != dev_type)
214                         score |= 2;
215                 if (score == 0)
216                         return t;
217
218                 if (score < cand_score) {
219                         cand = t;
220                         cand_score = score;
221                 }
222         }
223
224         for (t = ign->tunnels_r[h0^h1]; t; t = t->next) {
225                 if (remote != t->parms.iph.daddr ||
226                     key != t->parms.i_key ||
227                     !(t->dev->flags & IFF_UP))
228                         continue;
229
230                 if (t->dev->type != ARPHRD_IPGRE &&
231                     t->dev->type != dev_type)
232                         continue;
233
234                 score = 0;
235                 if (t->parms.link != link)
236                         score |= 1;
237                 if (t->dev->type != dev_type)
238                         score |= 2;
239                 if (score == 0)
240                         return t;
241
242                 if (score < cand_score) {
243                         cand = t;
244                         cand_score = score;
245                 }
246         }
247
248         for (t = ign->tunnels_l[h1]; t; t = t->next) {
249                 if ((local != t->parms.iph.saddr &&
250                      (local != t->parms.iph.daddr ||
251                       !ipv4_is_multicast(local))) ||
252                     key != t->parms.i_key ||
253                     !(t->dev->flags & IFF_UP))
254                         continue;
255
256                 if (t->dev->type != ARPHRD_IPGRE &&
257                     t->dev->type != dev_type)
258                         continue;
259
260                 score = 0;
261                 if (t->parms.link != link)
262                         score |= 1;
263                 if (t->dev->type != dev_type)
264                         score |= 2;
265                 if (score == 0)
266                         return t;
267
268                 if (score < cand_score) {
269                         cand = t;
270                         cand_score = score;
271                 }
272         }
273
274         for (t = ign->tunnels_wc[h1]; t; t = t->next) {
275                 if (t->parms.i_key != key ||
276                     !(t->dev->flags & IFF_UP))
277                         continue;
278
279                 if (t->dev->type != ARPHRD_IPGRE &&
280                     t->dev->type != dev_type)
281                         continue;
282
283                 score = 0;
284                 if (t->parms.link != link)
285                         score |= 1;
286                 if (t->dev->type != dev_type)
287                         score |= 2;
288                 if (score == 0)
289                         return t;
290
291                 if (score < cand_score) {
292                         cand = t;
293                         cand_score = score;
294                 }
295         }
296
297         if (cand != NULL)
298                 return cand;
299
300         if (ign->fb_tunnel_dev->flags & IFF_UP)
301                 return netdev_priv(ign->fb_tunnel_dev);
302
303         return NULL;
304 }
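/* Lookup summary: each chain above is scanned with a two-bit score
 * (bit 0 set on an ifindex/link mismatch, bit 1 on a device-type
 * mismatch).  A score of 0 is an exact match and returns immediately;
 * otherwise the lowest-scoring candidate wins.  Only if nothing matches
 * at all is the fallback tunnel returned, and only while it is up. */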
305
306 static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
307                 struct ip_tunnel_parm *parms)
308 {
309         __be32 remote = parms->iph.daddr;
310         __be32 local = parms->iph.saddr;
311         __be32 key = parms->i_key;
312         unsigned h = HASH(key);
313         int prio = 0;
314
315         if (local)
316                 prio |= 1;
317         if (remote && !ipv4_is_multicast(remote)) {
318                 prio |= 2;
319                 h ^= HASH(remote);
320         }
321
322         return &ign->tunnels[prio][h];
323 }
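/* Bucket selection mirrors the table layout documented above: prio
 * bit 0 is set when a local address is configured and bit 1 when a
 * unicast remote address is configured, selecting tunnels_wc,
 * tunnels_l, tunnels_r or tunnels_r_l respectively. */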
324
325 static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
326                 struct ip_tunnel *t)
327 {
328         return __ipgre_bucket(ign, &t->parms);
329 }
330
331 static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
332 {
333         struct ip_tunnel **tp = ipgre_bucket(ign, t);
334
335         t->next = *tp;
336         write_lock_bh(&ipgre_lock);
337         *tp = t;
338         write_unlock_bh(&ipgre_lock);
339 }
340
341 static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
342 {
343         struct ip_tunnel **tp;
344
345         for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
346                 if (t == *tp) {
347                         write_lock_bh(&ipgre_lock);
348                         *tp = t->next;
349                         write_unlock_bh(&ipgre_lock);
350                         break;
351                 }
352         }
353 }
354
355 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
356                                            struct ip_tunnel_parm *parms,
357                                            int type)
358 {
359         __be32 remote = parms->iph.daddr;
360         __be32 local = parms->iph.saddr;
361         __be32 key = parms->i_key;
362         int link = parms->link;
363         struct ip_tunnel *t, **tp;
364         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
365
366         for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next)
367                 if (local == t->parms.iph.saddr &&
368                     remote == t->parms.iph.daddr &&
369                     key == t->parms.i_key &&
370                     link == t->parms.link &&
371                     type == t->dev->type)
372                         break;
373
374         return t;
375 }
376
377 static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
378                 struct ip_tunnel_parm *parms, int gretap, int create)
379 {
380         struct ip_tunnel *t, *nt;
381         struct net_device *dev;
382         char name[IFNAMSIZ];
383         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
384
385         t = ipgre_tunnel_find(net, parms, gretap ? ARPHRD_ETHER : ARPHRD_IPGRE);
386         if (t || !create)
387                 return t;
388
389         if (parms->name[0])
390                 strlcpy(name, parms->name, IFNAMSIZ);
391         else
392                 sprintf(name, "gre%%d");
393
394         dev = alloc_netdev(sizeof(*t), name, gretap ? ipgre_tap_setup
395                                                     : ipgre_tunnel_setup);
396         if (!dev)
397                 return NULL;
398
399         dev_net_set(dev, net);
400
401         if (strchr(name, '%')) {
402                 if (dev_alloc_name(dev, name) < 0)
403                         goto failed_free;
404         }
405
406         if (gretap)
407                 random_ether_addr(dev->dev_addr);
408
409 #ifndef GRE_IOCTL_ONLY
410         dev->rtnl_link_ops = gretap ? &ipgre_tap_ops : &ipgre_link_ops;
411 #endif
412         nt = netdev_priv(dev);
413         nt->parms = *parms;
414
415         dev->mtu = ipgre_tunnel_bind_dev(dev);
416
417         if (register_netdevice(dev) < 0)
418                 goto failed_free;
419
420         dev_hold(dev);
421         ipgre_tunnel_link(ign, nt);
422         return nt;
423
424 failed_free:
425         free_netdev(dev);
426         return NULL;
427 }
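/* ipgre_tunnel_locate() doubles as lookup and creation: callers pass
 * create != 0 (e.g. the SIOCADDTUNNEL/SIOCADDGRETAP ioctls below) to
 * allocate, name ("gre%d" by default), bind and hash-link a new
 * net_device when no matching tunnel exists; with create == 0 it is a
 * pure lookup. */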
428
429 static void ipgre_tunnel_uninit(struct net_device *dev)
430 {
431         struct net *net = dev_net(dev);
432         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
433
434         ipgre_tunnel_unlink(ign, netdev_priv(dev));
435         dev_put(dev);
436 }
437
438
439 static void ipgre_err(struct sk_buff *skb, u32 info)
440 {
441
442 /* All the routers (except for Linux) return only
443    8 bytes of packet payload. It means that precise relaying of
444    ICMP in the real Internet is absolutely infeasible.
445
446    Moreover, Cisco "wise men" put the GRE key in the third word
447    of the GRE header. That makes it impossible to maintain even soft state
448    for keyed GRE tunnels with checksums enabled. Tell them "thank you".
449
450    Well, I wonder: RFC 1812 was written by a Cisco employee,
451    so why the hell do these idiots break standards established
452    by themselves???
453  */
454
455         struct iphdr *iph = (struct iphdr *)skb->data;
456         __be16       *p = (__be16*)(skb->data+(iph->ihl<<2));
457         int grehlen = (iph->ihl<<2) + 4;
458         const int type = icmp_hdr(skb)->type;
459         const int code = icmp_hdr(skb)->code;
460         struct ip_tunnel *t;
461         __be16 flags;
462
463         flags = p[0];
464         if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
465                 if (flags&(GRE_VERSION|GRE_ROUTING))
466                         return;
467                 if (flags&GRE_KEY) {
468                         grehlen += 4;
469                         if (flags&GRE_CSUM)
470                                 grehlen += 4;
471                 }
472         }
473
474         /* If only 8 bytes returned, keyed message will be dropped here */
475         if (skb_headlen(skb) < grehlen)
476                 return;
477
478         switch (type) {
479         default:
480         case ICMP_PARAMETERPROB:
481                 return;
482
483         case ICMP_DEST_UNREACH:
484                 switch (code) {
485                 case ICMP_SR_FAILED:
486                 case ICMP_PORT_UNREACH:
487                         /* Impossible event. */
488                         return;
489                 case ICMP_FRAG_NEEDED:
490                         /* Soft state for pmtu is maintained by IP core. */
491                         return;
492                 default:
493                         /* All others are translated to HOST_UNREACH.
494                            rfc2003 contains "deep thoughts" about NET_UNREACH,
495                            I believe they are just ether pollution. --ANK
496                          */
497                         break;
498                 }
499                 break;
500         case ICMP_TIME_EXCEEDED:
501                 if (code != ICMP_EXC_TTL)
502                         return;
503                 break;
504         }
505
506         read_lock(&ipgre_lock);
507         t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
508                                 flags & GRE_KEY ?
509                                 *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
510                                 p[1]);
511         if (t == NULL || t->parms.iph.daddr == 0 ||
512             ipv4_is_multicast(t->parms.iph.daddr))
513                 goto out;
514
515         if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
516                 goto out;
517
518         if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
519                 t->err_count++;
520         else
521                 t->err_count = 1;
522         t->err_time = jiffies;
523 out:
524         read_unlock(&ipgre_lock);
525         return;
526 }
527
528 static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
529 {
530         if (INET_ECN_is_ce(iph->tos)) {
531                 if (skb->protocol == htons(ETH_P_IP)) {
532                         IP_ECN_set_ce(ip_hdr(skb));
533                 } else if (skb->protocol == htons(ETH_P_IPV6)) {
534                         IP6_ECN_set_ce(ipv6_hdr(skb));
535                 }
536         }
537 }
538
539 static inline u8
540 ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
541 {
542         u8 inner = 0;
543         if (skb->protocol == htons(ETH_P_IP))
544                 inner = old_iph->tos;
545         else if (skb->protocol == htons(ETH_P_IPV6))
546                 inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
547         return INET_ECN_encapsulate(tos, inner);
548 }
549
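/* ipgre_rcv() parses a GRE header laid out as: 2 bytes of flags,
 * 2 bytes of protocol type, then optionally 4 bytes of checksum plus
 * reserved space (GRE_CSUM), 4 bytes of key (GRE_KEY) and 4 bytes of
 * sequence number (GRE_SEQ); hence the initial offset of 4 and the
 * 4-byte increments below. */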
550 static int ipgre_rcv(struct sk_buff *skb)
551 {
552         struct iphdr *iph;
553         u8     *h;
554         __be16    flags;
555         __sum16   csum = 0;
556         __be32 key = 0;
557         u32    seqno = 0;
558         struct ip_tunnel *tunnel;
559         int    offset = 4;
560         __be16 gre_proto;
561         unsigned int len;
562
563         if (!pskb_may_pull(skb, 16))
564                 goto drop_nolock;
565
566         iph = ip_hdr(skb);
567         h = skb->data;
568         flags = *(__be16*)h;
569
570         if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
571                 /* - Version must be 0.
572                    - We do not support routing headers.
573                  */
574                 if (flags&(GRE_VERSION|GRE_ROUTING))
575                         goto drop_nolock;
576
577                 if (flags&GRE_CSUM) {
578                         switch (skb->ip_summed) {
579                         case CHECKSUM_COMPLETE:
580                                 csum = csum_fold(skb->csum);
581                                 if (!csum)
582                                         break;
583                                 /* fall through */
584                         case CHECKSUM_NONE:
585                                 skb->csum = 0;
586                                 csum = __skb_checksum_complete(skb);
587                                 skb->ip_summed = CHECKSUM_COMPLETE;
588                         }
589                         offset += 4;
590                 }
591                 if (flags&GRE_KEY) {
592                         key = *(__be32*)(h + offset);
593                         offset += 4;
594                 }
595                 if (flags&GRE_SEQ) {
596                         seqno = ntohl(*(__be32*)(h + offset));
597                         offset += 4;
598                 }
599         }
600
601         gre_proto = *(__be16 *)(h + 2);
602
603         read_lock(&ipgre_lock);
604         if ((tunnel = ipgre_tunnel_lookup(skb->dev,
605                                           iph->saddr, iph->daddr, key,
606                                           gre_proto))) {
607                 struct net_device_stats *stats;
608 #ifdef HAVE_NETDEV_STATS
609                 stats = &tunnel->dev->stats;
610 #else
611                 stats = &tunnel->stat;
612 #endif
613
614                 secpath_reset(skb);
615
616                 skb->protocol = gre_proto;
617                 /* WCCP version 1 and 2 protocol decoding.
618                  * - Change protocol to IP
619                  * - When dealing with WCCPv2, skip the extra 4 bytes in the GRE header
620                  */
621                 if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
622                         skb->protocol = htons(ETH_P_IP);
623                         if ((*(h + offset) & 0xF0) != 0x40)
624                                 offset += 4;
625                 }
626
627                 skb->mac_header = skb->network_header;
628                 __pskb_pull(skb, offset);
629                 skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
630                 skb->pkt_type = PACKET_HOST;
631 #ifdef CONFIG_NET_IPGRE_BROADCAST
632                 if (ipv4_is_multicast(iph->daddr)) {
633                         /* Looped back packet, drop it! */
634                         if (skb_rtable(skb)->fl.iif == 0)
635                                 goto drop;
636                         stats->multicast++;
637                         skb->pkt_type = PACKET_BROADCAST;
638                 }
639 #endif
640
641                 if (((flags&GRE_CSUM) && csum) ||
642                     (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
643                         stats->rx_crc_errors++;
644                         stats->rx_errors++;
645                         goto drop;
646                 }
647                 if (tunnel->parms.i_flags&GRE_SEQ) {
648                         if (!(flags&GRE_SEQ) ||
649                             (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
650                                 stats->rx_fifo_errors++;
651                                 stats->rx_errors++;
652                                 goto drop;
653                         }
654                         tunnel->i_seqno = seqno + 1;
655                 }
656
657                 len = skb->len;
658
659                 /* Warning: All skb pointers will be invalidated! */
660                 if (tunnel->dev->type == ARPHRD_ETHER) {
661                         if (!pskb_may_pull(skb, ETH_HLEN)) {
662                                 stats->rx_length_errors++;
663                                 stats->rx_errors++;
664                                 goto drop;
665                         }
666
667                         iph = ip_hdr(skb);
668                         skb->protocol = eth_type_trans(skb, tunnel->dev);
669                         skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
670                 }
671
672                 stats->rx_packets++;
673                 stats->rx_bytes += len;
674                 skb->dev = tunnel->dev;
675                 skb_dst_drop(skb);
676                 nf_reset(skb);
677
678                 skb_reset_network_header(skb);
679                 ipgre_ecn_decapsulate(iph, skb);
680
681 #ifdef CHECKSUM_HW
682                 /* XXX: Temporary workaround to avoid a panic when doing
683                  * bridging due to multiple meanings of CHECKSUM_HW. */
684                 if (skb->ip_summed == CHECKSUM_HW)
685                         skb->ip_summed = CHECKSUM_NONE;
686 #endif
687
688                 netif_rx(skb);
689                 read_unlock(&ipgre_lock);
690                 return(0);
691         }
692         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
693
694 drop:
695         read_unlock(&ipgre_lock);
696 drop_nolock:
697         kfree_skb(skb);
698         return(0);
699 }
700
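/* Transmit path: pick the outer destination (from the tunnel
 * configuration, or from the route/neighbour for NBMA tunnels), route
 * the outer IPv4 header, enforce PMTU, ensure sufficient headroom,
 * then build the IP and GRE headers and hand the result to
 * IPTUNNEL_XMIT(). */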
701 static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
702 {
703         struct ip_tunnel *tunnel = netdev_priv(dev);
704         struct net_device_stats *stats;
705         struct iphdr  *old_iph = ip_hdr(skb);
706         struct iphdr  *tiph;
707         u8     tos;
708         __be16 df;
709         struct rtable *rt;                      /* Route to the other host */
710         struct net_device *tdev;                /* Device to other host */
711         struct iphdr  *iph;                     /* Our new IP header */
712         unsigned int max_headroom;              /* The extra header space needed */
713         int    gre_hlen;
714         __be32 dst;
715         int    mtu;
716
717 #ifdef HAVE_NETDEV_STATS
718         stats = &tunnel->dev->stats;
719 #else
720         stats = &tunnel->stat;
721 #endif
722
723         if (dev->type == ARPHRD_ETHER)
724                 IPCB(skb)->flags = 0;
725
726 #ifdef HAVE_NETDEV_HEADER_OPS
727         if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
728 #else
729         if (dev->hard_header && dev->type == ARPHRD_IPGRE) {
730 #endif
731                 gre_hlen = 0;
732                 tiph = (struct iphdr *)skb->data;
733         } else {
734                 gre_hlen = tunnel->hlen;
735                 tiph = &tunnel->parms.iph;
736         }
737
738         if ((dst = tiph->daddr) == 0) {
739                 /* NBMA tunnel */
740
741                 if (skb_dst(skb) == NULL) {
742                         stats->tx_fifo_errors++;
743                         goto tx_error;
744                 }
745
746                 if (skb->protocol == htons(ETH_P_IP)) {
747                         rt = skb_rtable(skb);
748                         if ((dst = rt->rt_gateway) == 0)
749                                 goto tx_error_icmp;
750                 }
751 #ifdef CONFIG_IPV6
752                 else if (skb->protocol == htons(ETH_P_IPV6)) {
753                         struct in6_addr *addr6;
754                         int addr_type;
755                         struct neighbour *neigh = skb_dst(skb)->neighbour;
756
757                         if (neigh == NULL)
758                                 goto tx_error;
759
760                         addr6 = (struct in6_addr *)&neigh->primary_key;
761                         addr_type = ipv6_addr_type(addr6);
762
763                         if (addr_type == IPV6_ADDR_ANY) {
764                                 addr6 = &ipv6_hdr(skb)->daddr;
765                                 addr_type = ipv6_addr_type(addr6);
766                         }
767
768                         if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
769                                 goto tx_error_icmp;
770
771                         dst = addr6->s6_addr32[3];
772                 }
773 #endif
774                 else
775                         goto tx_error;
776         }
777
778         tos = tiph->tos;
779         if (tos == 1) {
780                 tos = 0;
781                 if (skb->protocol == htons(ETH_P_IP))
782                         tos = old_iph->tos;
783         }
784
785         {
786                 struct flowi fl = { .oif = tunnel->parms.link,
787                                     .nl_u = { .ip4_u =
788                                               { .daddr = dst,
789                                                 .saddr = tiph->saddr,
790                                                 .tos = RT_TOS(tos) } },
791                                     .proto = IPPROTO_GRE };
792                 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
793                         stats->tx_carrier_errors++;
794                         goto tx_error;
795                 }
796         }
797         tdev = rt->u.dst.dev;
798
799         if (tdev == dev) {
800                 ip_rt_put(rt);
801                 stats->collisions++;
802                 goto tx_error;
803         }
804
805         df = tiph->frag_off;
806         if (df)
807 #ifdef HAVE_NETDEV_NEEDED_HEADROOM
808                 mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen;
809 #else
810                 mtu = dst_mtu(&rt->u.dst) - tunnel->hlen;
811 #endif
812         else
813                 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
814
815         if (skb_dst(skb))
816                 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
817
818         if (skb->protocol == htons(ETH_P_IP)) {
819                 df |= (old_iph->frag_off&htons(IP_DF));
820
821                 if ((old_iph->frag_off&htons(IP_DF)) &&
822                     mtu < ntohs(old_iph->tot_len)) {
823                         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
824                         ip_rt_put(rt);
825                         goto tx_error;
826                 }
827         }
828 #ifdef CONFIG_IPV6
829         else if (skb->protocol == htons(ETH_P_IPV6)) {
830                 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
831
832                 if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
833                         if ((tunnel->parms.iph.daddr &&
834                              !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
835                             rt6->rt6i_dst.plen == 128) {
836                                 rt6->rt6i_flags |= RTF_MODIFIED;
837                                 skb_dst(skb)->metrics[RTAX_MTU-1] = mtu;
838                         }
839                 }
840
841                 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
842                         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
843                         ip_rt_put(rt);
844                         goto tx_error;
845                 }
846         }
847 #endif
848
849         if (tunnel->err_count > 0) {
850                 if (time_before(jiffies,
851                                 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
852                         tunnel->err_count--;
853
854                         dst_link_failure(skb);
855                 } else
856                         tunnel->err_count = 0;
857         }
858
859         max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;
860
861         if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
862             (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
863                 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
864                 if (!new_skb) {
865                         ip_rt_put(rt);
866                         stats->tx_dropped++;
867                         dev_kfree_skb(skb);
868                         return NETDEV_TX_OK;
869                 }
870                 if (skb->sk)
871                         skb_set_owner_w(new_skb, skb->sk);
872                 dev_kfree_skb(skb);
873                 skb = new_skb;
874                 old_iph = ip_hdr(skb);
875         }
876
877         skb_reset_transport_header(skb);
878         skb_push(skb, gre_hlen);
879         skb_reset_network_header(skb);
880         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
881         IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
882                               IPSKB_REROUTED);
883         skb_dst_drop(skb);
884         skb_dst_set(skb, &rt->u.dst);
885
886         /*
887          *      Push down and install the outer IP header.
888          */
889
890         iph                     =       ip_hdr(skb);
891         iph->version            =       4;
892         iph->ihl                =       sizeof(struct iphdr) >> 2;
893         iph->frag_off           =       df;
894         iph->protocol           =       IPPROTO_GRE;
895         iph->tos                =       ipgre_ecn_encapsulate(tos, old_iph, skb);
896         iph->daddr              =       rt->rt_dst;
897         iph->saddr              =       rt->rt_src;
898
899         if ((iph->ttl = tiph->ttl) == 0) {
900                 if (skb->protocol == htons(ETH_P_IP))
901                         iph->ttl = old_iph->ttl;
902 #ifdef CONFIG_IPV6
903                 else if (skb->protocol == htons(ETH_P_IPV6))
904                         iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
905 #endif
906                 else
907                         iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
908         }
909
910         ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
911         ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
912                                    htons(ETH_P_TEB) : skb->protocol;
913
914         if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
915                 __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
916
917                 if (tunnel->parms.o_flags&GRE_SEQ) {
918                         ++tunnel->o_seqno;
919                         *ptr = htonl(tunnel->o_seqno);
920                         ptr--;
921                 }
922                 if (tunnel->parms.o_flags&GRE_KEY) {
923                         *ptr = tunnel->parms.o_key;
924                         ptr--;
925                 }
926                 if (tunnel->parms.o_flags&GRE_CSUM) {
927                         *ptr = 0;
928                         *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
929                 }
930         }
931
932         nf_reset(skb);
933
934         IPTUNNEL_XMIT();
935         return NETDEV_TX_OK;
936
937 tx_error_icmp:
938         dst_link_failure(skb);
939
940 tx_error:
941         stats->tx_errors++;
942         dev_kfree_skb(skb);
943         return NETDEV_TX_OK;
944 }
945
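/* Header accounting: the GRE overhead ("addend") starts at
 * sizeof(struct iphdr) + 4 and grows by 4 bytes for each of GRE_CSUM,
 * GRE_KEY and GRE_SEQ that is enabled, e.g. 20 + 4 + 4 = 28 bytes for
 * a keyed tunnel without checksums or sequence numbers.  The returned
 * mtu is the underlying device's mtu reduced by this overhead (and, on
 * kernels with needed_headroom, by the tunnel device's own
 * hard_header_len), clamped to a minimum of 68. */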
946 static int ipgre_tunnel_bind_dev(struct net_device *dev)
947 {
948         struct net_device *tdev = NULL;
949         struct ip_tunnel *tunnel;
950         struct iphdr *iph;
951         int hlen = LL_MAX_HEADER;
952         int mtu = ETH_DATA_LEN;
953         int addend = sizeof(struct iphdr) + 4;
954
955         tunnel = netdev_priv(dev);
956         iph = &tunnel->parms.iph;
957
958         /* Guess output device to choose reasonable mtu and needed_headroom */
959
960         if (iph->daddr) {
961                 struct flowi fl = { .oif = tunnel->parms.link,
962                                     .nl_u = { .ip4_u =
963                                               { .daddr = iph->daddr,
964                                                 .saddr = iph->saddr,
965                                                 .tos = RT_TOS(iph->tos) } },
966                                     .proto = IPPROTO_GRE };
967                 struct rtable *rt;
968                 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
969                         tdev = rt->u.dst.dev;
970                         ip_rt_put(rt);
971                 }
972
973                 if (dev->type != ARPHRD_ETHER)
974                         dev->flags |= IFF_POINTOPOINT;
975         }
976
977         if (!tdev && tunnel->parms.link)
978                 tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
979
980         if (tdev) {
981 #ifdef HAVE_NETDEV_NEEDED_HEADROOM
982                 hlen = tdev->hard_header_len + tdev->needed_headroom;
983 #else
984                 hlen = tdev->hard_header_len;
985 #endif
986                 mtu = tdev->mtu;
987         }
988         dev->iflink = tunnel->parms.link;
989
990         /* Precalculate GRE options length */
991         if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
992                 if (tunnel->parms.o_flags&GRE_CSUM)
993                         addend += 4;
994                 if (tunnel->parms.o_flags&GRE_KEY)
995                         addend += 4;
996                 if (tunnel->parms.o_flags&GRE_SEQ)
997                         addend += 4;
998         }
999 #ifdef HAVE_NETDEV_NEEDED_HEADROOM
1000         dev->needed_headroom = hlen + addend;
1001         mtu -= dev->hard_header_len + addend;
1002 #else
1003         dev->hard_header_len = hlen + addend;
1004         mtu -= addend;
1005 #endif
1006         tunnel->hlen = addend;
1007
1008         if (mtu < 68)
1009                 mtu = 68;
1010
1011         return mtu;
1012 }
1013
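/* ioctl interface: SIOCGETTUNNEL reads back tunnel parameters;
 * SIOCADDTUNNEL/SIOCADDGRETAP create an ARPHRD_IPGRE or ARPHRD_ETHER
 * (gretap) tunnel; SIOCCHGTUNNEL/SIOCCHGGRETAP modify an existing one;
 * SIOCDELTUNNEL unregisters it.  Everything except SIOCGETTUNNEL
 * requires CAP_NET_ADMIN. */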
1014 static int
1015 ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
1016 {
1017         int err = 0;
1018         struct ip_tunnel_parm p;
1019         struct ip_tunnel *t;
1020         struct net *net = dev_net(dev);
1021         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1022         int add_tunnel, gretap;
1023
1024         switch (cmd) {
1025         case SIOCGETTUNNEL:
1026                 t = NULL;
1027                 if (dev == ign->fb_tunnel_dev) {
1028                         if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
1029                                 err = -EFAULT;
1030                                 break;
1031                         }
1032                         t = ipgre_tunnel_locate(net, &p, false, 0);
1033                 }
1034                 if (t == NULL)
1035                         t = netdev_priv(dev);
1036                 memcpy(&p, &t->parms, sizeof(p));
1037                 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
1038                         err = -EFAULT;
1039                 break;
1040
1041         case SIOCADDTUNNEL:
1042         case SIOCCHGTUNNEL:
1043         case SIOCADDGRETAP:
1044         case SIOCCHGGRETAP:
1045                 err = -EPERM;
1046                 if (!capable(CAP_NET_ADMIN))
1047                         goto done;
1048
1049                 err = -EFAULT;
1050                 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1051                         goto done;
1052
1053                 err = -EINVAL;
1054                 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
1055                     p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
1056                     ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
1057                         goto done;
1058
1059                 add_tunnel = (cmd == SIOCADDTUNNEL || cmd == SIOCADDGRETAP);
1060                 gretap = (cmd == SIOCADDGRETAP || cmd == SIOCCHGGRETAP);
1061
1062                 if (p.iph.ttl)
1063                         p.iph.frag_off |= htons(IP_DF);
1064
1065                 if (!(p.i_flags&GRE_KEY))
1066                         p.i_key = 0;
1067                 if (!(p.o_flags&GRE_KEY))
1068                         p.o_key = 0;
1069
1070                 t = ipgre_tunnel_locate(net, &p, gretap, add_tunnel);
1071
1072                 if (dev != ign->fb_tunnel_dev && !add_tunnel) {
1073                         if (t != NULL) {
1074                                 if (t->dev != dev) {
1075                                         err = -EEXIST;
1076                                         break;
1077                                 }
1078                         } else {
1079                                 unsigned nflags = 0;
1080
1081                                 t = netdev_priv(dev);
1082
1083                                 if (ipv4_is_multicast(p.iph.daddr))
1084                                         nflags = IFF_BROADCAST;
1085                                 else if (p.iph.daddr)
1086                                         nflags = IFF_POINTOPOINT;
1087
1088                                 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
1089                                         err = -EINVAL;
1090                                         break;
1091                                 }
1092                                 ipgre_tunnel_unlink(ign, t);
1093                                 t->parms.iph.saddr = p.iph.saddr;
1094                                 t->parms.iph.daddr = p.iph.daddr;
1095                                 t->parms.i_key = p.i_key;
1096                                 t->parms.o_key = p.o_key;
1097                                 memcpy(dev->dev_addr, &p.iph.saddr, 4);
1098                                 memcpy(dev->broadcast, &p.iph.daddr, 4);
1099                                 ipgre_tunnel_link(ign, t);
1100                                 netdev_state_change(dev);
1101                         }
1102                 }
1103
1104                 if (t) {
1105                         err = 0;
1106                         if (!add_tunnel) {
1107                                 t->parms.iph.ttl = p.iph.ttl;
1108                                 t->parms.iph.tos = p.iph.tos;
1109                                 t->parms.iph.frag_off = p.iph.frag_off;
1110                                 if (t->parms.link != p.link) {
1111                                         t->parms.link = p.link;
1112                                         dev->mtu = ipgre_tunnel_bind_dev(dev);
1113                                         netdev_state_change(dev);
1114                                 }
1115                         }
1116                         if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
1117                                 err = -EFAULT;
1118                 } else
1119                         err = (add_tunnel ? -ENOBUFS : -ENOENT);
1120                 break;
1121
1122         case SIOCDELTUNNEL:
1123                 err = -EPERM;
1124                 if (!capable(CAP_NET_ADMIN))
1125                         goto done;
1126
1127                 if (dev == ign->fb_tunnel_dev) {
1128                         err = -EFAULT;
1129                         if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1130                                 goto done;
1131                         err = -ENOENT;
1132                         if ((t = ipgre_tunnel_locate(net, &p, false, 0)) == NULL)
1133                                 goto done;
1134                         err = -EPERM;
1135                         if (t == netdev_priv(ign->fb_tunnel_dev))
1136                                 goto done;
1137                         dev = t->dev;
1138                 }
1139                 unregister_netdevice(dev);
1140                 err = 0;
1141                 break;
1142
1143         default:
1144                 err = -EINVAL;
1145         }
1146
1147 done:
1148         return err;
1149 }
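/* Minimal userspace sketch (illustrative only, not part of this file)
 * of driving the ioctl interface above to create a keyed gretap
 * tunnel.  It assumes SIOCADDGRETAP and struct ip_tunnel_parm are
 * visible to userspace via the openvswitch gre.h and linux/if_tunnel.h
 * headers, that "fb_name" is the name of the GRE fallback device, and
 * that "sock" is an AF_INET datagram socket; the addresses and key are
 * made-up example values.
 *
 *     #include <string.h>
 *     #include <sys/ioctl.h>
 *     #include <sys/socket.h>
 *     #include <arpa/inet.h>
 *     #include <netinet/in.h>
 *     #include <net/if.h>
 *     #include <linux/ip.h>
 *     #include <linux/if_tunnel.h>
 *
 *     static int add_gretap(int sock, const char *fb_name)
 *     {
 *             struct ip_tunnel_parm p;
 *             struct ifreq ifr;
 *
 *             memset(&p, 0, sizeof p);
 *             p.iph.version  = 4;            // required by the ioctl handler
 *             p.iph.ihl      = 5;
 *             p.iph.protocol = IPPROTO_GRE;
 *             p.iph.saddr    = inet_addr("192.0.2.1");
 *             p.iph.daddr    = inet_addr("192.0.2.2");
 *             p.i_flags = p.o_flags = GRE_KEY;
 *             p.i_key   = p.o_key   = htonl(42);
 *             strncpy(p.name, "gretap1", IFNAMSIZ - 1);
 *
 *             memset(&ifr, 0, sizeof ifr);
 *             strncpy(ifr.ifr_name, fb_name, IFNAMSIZ - 1);
 *             ifr.ifr_ifru.ifru_data = (void *)&p;
 *             return ioctl(sock, SIOCADDGRETAP, &ifr);
 *     }
 */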
1150
1151 #ifndef HAVE_NETDEV_STATS
1152 static struct net_device_stats *ipgre_tunnel_get_stats(struct net_device *dev)
1153 {
1154         return &(((struct ip_tunnel*)netdev_priv(dev))->stat);
1155 }
1156 #endif
1157
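/* new_mtu is bounded below by the IPv4 minimum of 68 and above by
 * 0xFFF8 minus the tunnel overhead, so that the encapsulated packet
 * still fits within the IP total-length limit. */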
1158 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1159 {
1160         struct ip_tunnel *tunnel = netdev_priv(dev);
1161         if (new_mtu < 68 ||
1162 #ifdef HAVE_NETDEV_NEEDED_HEADROOM
1163         new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1164 #else
1165         new_mtu > 0xFFF8 - tunnel->hlen)
1166 #endif
1167                 return -EINVAL;
1168         dev->mtu = new_mtu;
1169         return 0;
1170 }
1171
1172 /* Nice toy. Unfortunately, useless in real life :-)
1173    It allows one to construct a virtual multiprotocol broadcast "LAN"
1174    over the Internet, provided multicast routing is tuned.
1175
1176
1177    I have no idea whether this bicycle was invented before me,
1178    so I had to set ARPHRD_IPGRE to a random value.
1179    I have the impression that Cisco could make something similar,
1180    but this feature is apparently missing in IOS<=11.2(8).
1181
1182    I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1183    with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1184
1185    ping -t 255 224.66.66.66
1186
1187    If nobody answers, mbone does not work.
1188
1189    ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1190    ip addr add 10.66.66.<somewhat>/24 dev Universe
1191    ifconfig Universe up
1192    ifconfig Universe add fe80::<Your_real_addr>/10
1193    ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1194    ftp 10.66.66.66
1195    ...
1196    ftp fec0:6666:6666::193.233.7.65
1197    ...
1198
1199  */
1200
1201 #ifdef HAVE_NETDEV_HEADER_OPS
1202 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1203                        unsigned short type,
1204                        const void *daddr, const void *saddr, unsigned len)
1205 #else
1206 static int ipgre_header(struct sk_buff *skb, struct net_device *dev, unsigned short type,
1207                         void *daddr, void *saddr, unsigned len)
1208 #endif
1209 {
1210         struct ip_tunnel *t = netdev_priv(dev);
1211         struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1212         __be16 *p = (__be16*)(iph+1);
1213
1214         memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1215         p[0]            = t->parms.o_flags;
1216         p[1]            = htons(type);
1217
1218         /*
1219          *      Set the source hardware address.
1220          */
1221
1222         if (saddr)
1223                 memcpy(&iph->saddr, saddr, 4);
1224
1225         if (daddr) {
1226                 memcpy(&iph->daddr, daddr, 4);
1227                 return t->hlen;
1228         }
1229         if (iph->daddr && !ipv4_is_multicast(iph->daddr))
1230                 return t->hlen;
1231
1232         return -t->hlen;
1233 }
1234
1235 #ifdef HAVE_NETDEV_HEADER_OPS
1236 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1237 #else
1238 static int ipgre_header_parse(struct sk_buff *skb, unsigned char *haddr)
1239 #endif
1240 {
1241         struct iphdr *iph = (struct iphdr *) skb_mac_header(skb);
1242         memcpy(haddr, &iph->saddr, 4);
1243         return 4;
1244 }
1245
1246 #ifdef HAVE_NETDEV_HEADER_OPS
1247 static const struct header_ops ipgre_header_ops = {
1248         .create = ipgre_header,
1249         .parse  = ipgre_header_parse,
1250 };
1251 #endif
1252
1253 #ifdef CONFIG_NET_IPGRE_BROADCAST
1254 static int ipgre_open(struct net_device *dev)
1255 {
1256         struct ip_tunnel *t = netdev_priv(dev);
1257
1258         if (ipv4_is_multicast(t->parms.iph.daddr)) {
1259                 struct flowi fl = { .oif = t->parms.link,
1260                                     .nl_u = { .ip4_u =
1261                                               { .daddr = t->parms.iph.daddr,
1262                                                 .saddr = t->parms.iph.saddr,
1263                                                 .tos = RT_TOS(t->parms.iph.tos) } },
1264                                     .proto = IPPROTO_GRE };
1265                 struct rtable *rt;
1266                 if (ip_route_output_key(dev_net(dev), &rt, &fl))
1267                         return -EADDRNOTAVAIL;
1268                 dev = rt->u.dst.dev;
1269                 ip_rt_put(rt);
1270                 if (__in_dev_get_rtnl(dev) == NULL)
1271                         return -EADDRNOTAVAIL;
1272                 t->mlink = dev->ifindex;
1273                 ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
1274         }
1275         return 0;
1276 }
1277
1278 static int ipgre_close(struct net_device *dev)
1279 {
1280         struct ip_tunnel *t = netdev_priv(dev);
1281
1282         if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1283                 struct in_device *in_dev;
1284                 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1285                 if (in_dev) {
1286                         ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1287                         in_dev_put(in_dev);
1288                 }
1289         }
1290         return 0;
1291 }
1292
1293 #endif
1294
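/* Compatibility glue: kernels with net_device_ops get their handlers
 * through the ops structures defined here; older kernels fall back to
 * the individual dev->init/uninit/hard_start_xmit/... pointers set in
 * ipgre_tunnel_setup() and ipgre_tap_setup(). */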
1295 #ifdef HAVE_NET_DEVICE_OPS
1296 static const struct net_device_ops ipgre_netdev_ops = {
1297         .ndo_init               = ipgre_tunnel_init,
1298         .ndo_uninit             = ipgre_tunnel_uninit,
1299 #ifdef CONFIG_NET_IPGRE_BROADCAST
1300         .ndo_open               = ipgre_open,
1301         .ndo_stop               = ipgre_close,
1302 #endif
1303         .ndo_start_xmit         = ipgre_tunnel_xmit,
1304         .ndo_do_ioctl           = ipgre_tunnel_ioctl,
1305         .ndo_change_mtu         = ipgre_tunnel_change_mtu,
1306 };
1307 #endif
1308
1309 static void ipgre_tunnel_setup(struct net_device *dev)
1310 {
1311 #ifdef HAVE_NET_DEVICE_OPS
1312         dev->netdev_ops         = &ipgre_netdev_ops;
1313 #else
1314         dev->init               = ipgre_tunnel_init;
1315         dev->uninit             = ipgre_tunnel_uninit;
1316         dev->hard_start_xmit    = ipgre_tunnel_xmit;
1317 #ifndef HAVE_NETDEV_STATS
1318         dev->get_stats          = ipgre_tunnel_get_stats;
1319 #endif
1320         dev->do_ioctl           = ipgre_tunnel_ioctl;
1321         dev->change_mtu         = ipgre_tunnel_change_mtu;
1322 #endif /* HAVE_NET_DEVICE_OPS */
1323         dev->destructor         = free_netdev;
1324
1325         dev->type               = ARPHRD_IPGRE;
1326 #ifdef HAVE_NETDEV_NEEDED_HEADROOM
1327         dev->needed_headroom    = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1328 #else
1329         dev->hard_header_len    = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1330 #endif
1331         dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
1332         dev->flags              = IFF_NOARP;
1333         dev->iflink             = 0;
1334         dev->addr_len           = 4;
1335         dev->features           |= NETIF_F_NETNS_LOCAL;
1336         dev->priv_flags         &= ~IFF_XMIT_DST_RELEASE;
1337 }
1338
1339 static int ipgre_tunnel_init(struct net_device *dev)
1340 {
1341         struct ip_tunnel *tunnel;
1342         struct iphdr *iph;
1343
1344         tunnel = netdev_priv(dev);
1345         iph = &tunnel->parms.iph;
1346
1347         tunnel->dev = dev;
1348         strcpy(tunnel->parms.name, dev->name);
1349
1350         memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
1351         memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1352
1353         if (iph->daddr) {
1354 #ifdef CONFIG_NET_IPGRE_BROADCAST
1355                 if (ipv4_is_multicast(iph->daddr)) {
1356                         if (!iph->saddr)
1357                                 return -EINVAL;
1358                         dev->flags = IFF_BROADCAST;
1359 #ifdef HAVE_NETDEV_HEADER_OPS
1360                         dev->header_ops = &ipgre_header_ops;
1361 #else
1362                         dev->hard_header = ipgre_header;
1363                         dev->hard_header_parse = ipgre_header_parse;
1364 #endif
1365 #ifndef HAVE_NET_DEVICE_OPS
1366                         dev->open = ipgre_open;
1367                         dev->stop = ipgre_close;
1368 #endif
1369                 }
1370 #endif
1371         } else {
1372 #ifdef HAVE_NETDEV_HEADER_OPS
1373                 dev->header_ops = &ipgre_header_ops;
1374 #else
1375                 dev->hard_header = ipgre_header;
1376                 dev->hard_header_parse = ipgre_header_parse;
1377 #endif
1378         }
1379
1380         return 0;
1381 }
1382
1383 #ifdef HAVE_NET_DEVICE_OPS
1384 static void ipgre_fb_tunnel_init(struct net_device *dev)
1385 #else
1386 static int ipgre_fb_tunnel_init(struct net_device *dev)
1387 #endif
1388 {
1389         struct ip_tunnel *tunnel = netdev_priv(dev);
1390         struct iphdr *iph = &tunnel->parms.iph;
1391         struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);
1392
1393         tunnel->dev = dev;
1394         strcpy(tunnel->parms.name, dev->name);
1395
1396         iph->version            = 4;
1397         iph->protocol           = IPPROTO_GRE;
1398         iph->ihl                = 5;
1399         tunnel->hlen            = sizeof(struct iphdr) + 4;
1400
1401         dev_hold(dev);
1402         ign->tunnels_wc[0]      = tunnel;
1403
1404 #ifndef HAVE_NET_DEVICE_OPS
1405         return 0;
1406 #endif
1407 }
1408
1409 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,32)
1410 static struct net_protocol ipgre_protocol = {
1411 #else
1412 static const struct net_protocol ipgre_protocol = {
1413 #endif
1414         .handler        =       ipgre_rcv,
1415         .err_handler    =       ipgre_err,
1416 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,26)
1417         .netns_ok       =       1,
1418 #endif
1419 };
1420
1421 static void ipgre_destroy_tunnels(struct ipgre_net *ign)
1422 {
1423         int prio;
1424
1425         for (prio = 0; prio < 4; prio++) {
1426                 int h;
1427                 for (h = 0; h < HASH_SIZE; h++) {
1428                         struct ip_tunnel *t;
1429                         while ((t = ign->tunnels[prio][h]) != NULL)
1430                                 unregister_netdevice(t->dev);
1431                 }
1432         }
1433 }
1434
1435 static int ipgre_init_net(struct net *net)
1436 {
1437         int err;
1438         struct ipgre_net *ign;
1439
1440         err = -ENOMEM;
1441         ign = kzalloc(sizeof(struct ipgre_net), GFP_KERNEL);
1442         if (ign == NULL)
1443                 goto err_alloc;
1444
1445         err = net_assign_generic(net, ipgre_net_id, ign);
1446         if (err < 0)
1447                 goto err_assign;
1448
1449         ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), GRE_IOCTL_DEVICE,
1450                                            ipgre_tunnel_setup);
1451         if (!ign->fb_tunnel_dev) {
1452                 err = -ENOMEM;
1453                 goto err_alloc_dev;
1454         }
1455         dev_net_set(ign->fb_tunnel_dev, net);
1456
1457 #ifdef HAVE_NET_DEVICE_OPS
1458         ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
1459 #else
1460         ign->fb_tunnel_dev->init = ipgre_fb_tunnel_init;
1461 #endif
1462 #ifndef GRE_IOCTL_ONLY
1463         ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
1464 #endif
1465
1466         if ((err = register_netdev(ign->fb_tunnel_dev)))
1467                 goto err_reg_dev;
1468
1469         return 0;
1470
1471 err_reg_dev:
1472         free_netdev(ign->fb_tunnel_dev);
1473 err_alloc_dev:
1474         /* nothing */
1475 err_assign:
1476         kfree(ign);
1477 err_alloc:
1478         return err;
1479 }
1480
1481 static void ipgre_exit_net(struct net *net)
1482 {
1483         struct ipgre_net *ign;
1484
1485         ign = net_generic(net, ipgre_net_id);
1486         rtnl_lock();
1487         ipgre_destroy_tunnels(ign);
1488         rtnl_unlock();
1489         kfree(ign);
1490 }
1491
1492 static struct pernet_operations ipgre_net_ops = {
1493         .init = ipgre_init_net,
1494         .exit = ipgre_exit_net,
1495 };
1496
1497 static int ipgre_tap_init(struct net_device *dev)
1498 {
1499         struct ip_tunnel *tunnel;
1500
1501         tunnel = netdev_priv(dev);
1502
1503         tunnel->dev = dev;
1504         strcpy(tunnel->parms.name, dev->name);
1505
1506         ipgre_tunnel_bind_dev(dev);
1507
1508         return 0;
1509 }
1510
1511 #ifdef HAVE_NET_DEVICE_OPS
1512 static const struct net_device_ops ipgre_tap_netdev_ops = {
1513         .ndo_init               = ipgre_tap_init,
1514         .ndo_uninit             = ipgre_tunnel_uninit,
1515         .ndo_start_xmit         = ipgre_tunnel_xmit,
1516         .ndo_set_mac_address    = eth_mac_addr,
1517         .ndo_validate_addr      = eth_validate_addr,
1518         .ndo_do_ioctl           = ipgre_tunnel_ioctl,
1519         .ndo_change_mtu         = ipgre_tunnel_change_mtu,
1520 };
1521 #endif
1522
1523 static void ipgre_tap_setup(struct net_device *dev)
1524 {
1525         ether_setup(dev);
1526
1527 #ifdef HAVE_NET_DEVICE_OPS
1528         dev->netdev_ops         = &ipgre_tap_netdev_ops;
1529 #else
1530         dev->init               = ipgre_tap_init;
1531         dev->uninit             = ipgre_tunnel_uninit;
1532         dev->hard_start_xmit    = ipgre_tunnel_xmit;
1533 #ifndef HAVE_NETDEV_STATS
1534         dev->get_stats          = ipgre_tunnel_get_stats;
1535 #endif
1536         dev->do_ioctl           = ipgre_tunnel_ioctl;
1537         dev->change_mtu         = ipgre_tunnel_change_mtu;
1538 #endif /* HAVE_NET_DEVICE_OPS */
1539         dev->destructor         = free_netdev;
1540
1541         dev->iflink             = 0;
1542         dev->features           |= NETIF_F_NETNS_LOCAL;
1543 }
1544
1545 #ifndef GRE_IOCTL_ONLY
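/* Netlink validation for plain GRE links: reject the GRE version and
 * routing bits, which this driver does not support. */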
1546 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1547 {
1548         __be16 flags;
1549
1550         if (!data)
1551                 return 0;
1552
1553         flags = 0;
1554         if (data[IFLA_GRE_IFLAGS])
1555                 flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1556         if (data[IFLA_GRE_OFLAGS])
1557                 flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1558         if (flags & (GRE_VERSION|GRE_ROUTING))
1559                 return -EINVAL;
1560
1561         return 0;
1562 }
1563
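/* gretap validation: any supplied link-layer address must be a valid
 * unicast MAC, and a remote address of 0.0.0.0 is rejected; everything
 * else falls through to the plain GRE checks. */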
1564 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1565 {
1566         __be32 daddr;
1567
1568         if (tb[IFLA_ADDRESS]) {
1569                 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1570                         return -EINVAL;
1571                 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1572                         return -EADDRNOTAVAIL;
1573         }
1574
1575         if (!data)
1576                 goto out;
1577
1578         if (data[IFLA_GRE_REMOTE]) {
1579                 memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1580                 if (!daddr)
1581                         return -EINVAL;
1582         }
1583
1584 out:
1585         return ipgre_tunnel_validate(tb, data);
1586 }
1587
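/* Translate IFLA_GRE_* attributes into ip_tunnel_parm.  Path MTU
 * discovery (the DF bit) defaults to on unless IFLA_GRE_PMTUDISC
 * explicitly disables it. */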
1588 static void ipgre_netlink_parms(struct nlattr *data[],
1589                                 struct ip_tunnel_parm *parms)
1590 {
1591         memset(parms, 0, sizeof(*parms));
1592
1593         parms->iph.protocol = IPPROTO_GRE;
1594
1595         if (!data)
1596                 return;
1597
1598         if (data[IFLA_GRE_LINK])
1599                 parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
1600
1601         if (data[IFLA_GRE_IFLAGS])
1602                 parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
1603
1604         if (data[IFLA_GRE_OFLAGS])
1605                 parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
1606
1607         if (data[IFLA_GRE_IKEY])
1608                 parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
1609
1610         if (data[IFLA_GRE_OKEY])
1611                 parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
1612
1613         if (data[IFLA_GRE_LOCAL])
1614                 parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);
1615
1616         if (data[IFLA_GRE_REMOTE])
1617                 parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);
1618
1619         if (data[IFLA_GRE_TTL])
1620                 parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
1621
1622         if (data[IFLA_GRE_TOS])
1623                 parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
1624
1625         if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
1626                 parms->iph.frag_off = htons(IP_DF);
1627 }
1628
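/* Create a tunnel from netlink: parse the parameters, refuse an exact
 * duplicate, generate a random MAC for Ethernet-type (gretap) devices
 * when none was supplied, derive the MTU from the lower device, then
 * register the netdev and link it into the hash tables. */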
1629 static int ipgre_newlink(struct net_device *dev, struct nlattr *tb[],
1630                          struct nlattr *data[])
1631 {
1632         struct ip_tunnel *nt;
1633         struct net *net = dev_net(dev);
1634         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1635         int mtu;
1636         int err;
1637
1638         nt = netdev_priv(dev);
1639         ipgre_netlink_parms(data, &nt->parms);
1640
1641         if (ipgre_tunnel_find(net, &nt->parms, dev->type))
1642                 return -EEXIST;
1643
1644         if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1645                 random_ether_addr(dev->dev_addr);
1646
1647         mtu = ipgre_tunnel_bind_dev(dev);
1648         if (!tb[IFLA_MTU])
1649                 dev->mtu = mtu;
1650
1651         err = register_netdevice(dev);
1652         if (err)
1653                 goto out;
1654
1655         dev_hold(dev);
1656         ipgre_tunnel_link(ign, nt);
1657
1658 out:
1659         return err;
1660 }
1661
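/* Reconfigure an existing tunnel; the fallback device cannot be
 * changed.  If the new parameters already belong to another device,
 * fail with -EEXIST.  Otherwise update the addresses and input key
 * (re-hashing the tunnel), then the output key, TTL, TOS and DF in
 * place, rebinding the device when the underlying link changes. */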
1662 static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1663                             struct nlattr *data[])
1664 {
1665         struct ip_tunnel *t, *nt;
1666         struct net *net = dev_net(dev);
1667         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1668         struct ip_tunnel_parm p;
1669         int mtu;
1670
1671         if (dev == ign->fb_tunnel_dev)
1672                 return -EINVAL;
1673
1674         nt = netdev_priv(dev);
1675         ipgre_netlink_parms(data, &p);
1676
1677         t = ipgre_tunnel_locate(net, &p, false, 0);
1678
1679         if (t) {
1680                 if (t->dev != dev)
1681                         return -EEXIST;
1682         } else {
1683                 t = nt;
1684
1685                 if (dev->type != ARPHRD_ETHER) {
1686                         unsigned nflags = 0;
1687
1688                         if (ipv4_is_multicast(p.iph.daddr))
1689                                 nflags = IFF_BROADCAST;
1690                         else if (p.iph.daddr)
1691                                 nflags = IFF_POINTOPOINT;
1692
1693                         if ((dev->flags ^ nflags) &
1694                             (IFF_POINTOPOINT | IFF_BROADCAST))
1695                                 return -EINVAL;
1696                 }
1697
1698                 ipgre_tunnel_unlink(ign, t);
1699                 t->parms.iph.saddr = p.iph.saddr;
1700                 t->parms.iph.daddr = p.iph.daddr;
1701                 t->parms.i_key = p.i_key;
1702                 if (dev->type != ARPHRD_ETHER) {
1703                         memcpy(dev->dev_addr, &p.iph.saddr, 4);
1704                         memcpy(dev->broadcast, &p.iph.daddr, 4);
1705                 }
1706                 ipgre_tunnel_link(ign, t);
1707                 netdev_state_change(dev);
1708         }
1709
1710         t->parms.o_key = p.o_key;
1711         t->parms.iph.ttl = p.iph.ttl;
1712         t->parms.iph.tos = p.iph.tos;
1713         t->parms.iph.frag_off = p.iph.frag_off;
1714
1715         if (t->parms.link != p.link) {
1716                 t->parms.link = p.link;
1717                 mtu = ipgre_tunnel_bind_dev(dev);
1718                 if (!tb[IFLA_MTU])
1719                         dev->mtu = mtu;
1720                 netdev_state_change(dev);
1721         }
1722
1723         return 0;
1724 }
1725
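/* Worst-case netlink attribute space needed to dump one tunnel. */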
1726 static size_t ipgre_get_size(const struct net_device *dev)
1727 {
1728         return
1729                 /* IFLA_GRE_LINK */
1730                 nla_total_size(4) +
1731                 /* IFLA_GRE_IFLAGS */
1732                 nla_total_size(2) +
1733                 /* IFLA_GRE_OFLAGS */
1734                 nla_total_size(2) +
1735                 /* IFLA_GRE_IKEY */
1736                 nla_total_size(4) +
1737                 /* IFLA_GRE_OKEY */
1738                 nla_total_size(4) +
1739                 /* IFLA_GRE_LOCAL */
1740                 nla_total_size(4) +
1741                 /* IFLA_GRE_REMOTE */
1742                 nla_total_size(4) +
1743                 /* IFLA_GRE_TTL */
1744                 nla_total_size(1) +
1745                 /* IFLA_GRE_TOS */
1746                 nla_total_size(1) +
1747                 /* IFLA_GRE_PMTUDISC */
1748                 nla_total_size(1) +
1749                 0;
1750 }
1751
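/* Dump the tunnel configuration back to userspace; IFLA_GRE_PMTUDISC
 * reports whether the DF bit is set on encapsulated packets. */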
1752 static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1753 {
1754         struct ip_tunnel *t = netdev_priv(dev);
1755         struct ip_tunnel_parm *p = &t->parms;
1756
1757         NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
1758         NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
1759         NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
1760         NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
1761         NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
1762         NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
1763         NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
1764         NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
1765         NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
1766         NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));
1767
1768         return 0;
1769
1770 nla_put_failure:
1771         return -EMSGSIZE;
1772 }
1773
1774 static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
1775         [IFLA_GRE_LINK]         = { .type = NLA_U32 },
1776         [IFLA_GRE_IFLAGS]       = { .type = NLA_U16 },
1777         [IFLA_GRE_OFLAGS]       = { .type = NLA_U16 },
1778         [IFLA_GRE_IKEY]         = { .type = NLA_U32 },
1779         [IFLA_GRE_OKEY]         = { .type = NLA_U32 },
1780         [IFLA_GRE_LOCAL]        = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
1781         [IFLA_GRE_REMOTE]       = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
1782         [IFLA_GRE_TTL]          = { .type = NLA_U8 },
1783         [IFLA_GRE_TOS]          = { .type = NLA_U8 },
1784         [IFLA_GRE_PMTUDISC]     = { .type = NLA_U8 },
1785 };
1786
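/* rtnl_link_ops for the layer-3 "gre" type and the Ethernet-like
 * "gretap" type; they differ only in their setup and validate hooks. */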
1787 static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
1788         .kind           = "gre",
1789         .maxtype        = IFLA_GRE_MAX,
1790         .policy         = ipgre_policy,
1791         .priv_size      = sizeof(struct ip_tunnel),
1792         .setup          = ipgre_tunnel_setup,
1793         .validate       = ipgre_tunnel_validate,
1794         .newlink        = ipgre_newlink,
1795         .changelink     = ipgre_changelink,
1796         .get_size       = ipgre_get_size,
1797         .fill_info      = ipgre_fill_info,
1798 };
1799
1800 static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
1801         .kind           = "gretap",
1802         .maxtype        = IFLA_GRE_MAX,
1803         .policy         = ipgre_policy,
1804         .priv_size      = sizeof(struct ip_tunnel),
1805         .setup          = ipgre_tap_setup,
1806         .validate       = ipgre_tap_validate,
1807         .newlink        = ipgre_newlink,
1808         .changelink     = ipgre_changelink,
1809         .get_size       = ipgre_get_size,
1810         .fill_info      = ipgre_fill_info,
1811 };
1812 #endif
1813
1814 /*
1815  *      And now the module code and kernel interface.
1816  */
1817
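/* Module init: hook the IPPROTO_GRE handler, register the
 * per-namespace ops and, unless built ioctl-only, the two
 * rtnl_link_ops.  Error paths unwind in reverse order. */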
1818 static int __init ipgre_init(void)
1819 {
1820         int err;
1821
1822         printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1823
1824         if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
1825                 printk(KERN_INFO "ipgre init: can't add protocol\n");
1826                 return -EAGAIN;
1827         }
1828
1829         err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops);
1830         if (err < 0)
1831                 goto gen_device_failed;
1832
1833 #ifndef GRE_IOCTL_ONLY
1834         err = rtnl_link_register(&ipgre_link_ops);
1835         if (err < 0)
1836                 goto rtnl_link_failed;
1837
1838         err = rtnl_link_register(&ipgre_tap_ops);
1839         if (err < 0)
1840                 goto tap_ops_failed;
1841 #endif
1842
1843 out:
1844         return err;
1845
1846 #ifndef GRE_IOCTL_ONLY
1847 tap_ops_failed:
1848         rtnl_link_unregister(&ipgre_link_ops);
1849 rtnl_link_failed:
1850         unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
1851 #endif
1852 gen_device_failed:
1853         inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
1854         goto out;
1855
1856 }
1857
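/* Module exit: undo ipgre_init() in reverse order. */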
1858 static void __exit ipgre_fini(void)
1859 {
1860 #ifndef GRE_IOCTL_ONLY
1861         rtnl_link_unregister(&ipgre_tap_ops);
1862         rtnl_link_unregister(&ipgre_link_ops);
1863 #endif
1864         unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
1865         if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
1866                 printk(KERN_INFO "ipgre close: can't remove protocol\n");
1867 }
1868
1869 module_init(ipgre_init);
1870 module_exit(ipgre_fini);
1871 MODULE_DESCRIPTION("GRE over IPv4 tunneling driver");
1872 MODULE_LICENSE("GPL");
1873 #ifndef GRE_IOCTL_ONLY
1874 MODULE_ALIAS_RTNL_LINK("gre");
1875 MODULE_ALIAS_RTNL_LINK("gretap");
1876 #endif
1877