/*
 * Copyright (c) 2010, 2011 Nicira Networks.
 * Distributed under the terms of the GNU GPL version 2.
 *
 * Significant portions of this file may be copied from parts of the Linux
 * kernel, by Linus Torvalds and others.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/version.h>
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,26)

#include <linux/ip.h>
#include <linux/jhash.h>
#include <linux/list.h>
#include <linux/net.h>

#include <net/icmp.h>
#include <net/inet_frag.h>
#include <net/protocol.h>
#include <net/udp.h>

#include "tunnel.h"
#include "vport.h"
#include "vport-generic.h"

#define CAPWAP_SRC_PORT 58881
#define CAPWAP_DST_PORT 58882

#define CAPWAP_FRAG_TIMEOUT (30 * HZ)
#define CAPWAP_FRAG_MAX_MEM (256 * 1024)
#define CAPWAP_FRAG_PRUNE_MEM (192 * 1024)
#define CAPWAP_FRAG_SECRET_INTERVAL (10 * 60 * HZ)
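
/*
 * Added commentary: these thresholds feed the netns_frags state below.
 * Once the memory charged to pending reassemblies exceeds
 * CAPWAP_FRAG_MAX_MEM, defrag() runs the inet_frag evictor, which drops
 * the oldest incomplete queues until usage falls to CAPWAP_FRAG_PRUNE_MEM.
 */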
/*
 * The CAPWAP header is a mess, with all kinds of odd size bit fields that
 * cross byte boundaries, which are difficult to represent correctly in
 * various byte orderings.  Luckily we only care about a few permutations, so
 * statically create them and we can do very fast parsing by checking all 32
 * bits in one compare.
 */
#define CAPWAP_BEGIN_HLEN __cpu_to_be32(0x00100000)
#define CAPWAP_BEGIN_WBID __cpu_to_be32(0x00000200)
#define CAPWAP_BEGIN_FRAG __cpu_to_be32(0x00000080)
#define CAPWAP_BEGIN_LAST __cpu_to_be32(0x00000040)

#define NO_FRAG_HDR (CAPWAP_BEGIN_HLEN | CAPWAP_BEGIN_WBID)
#define FRAG_HDR (NO_FRAG_HDR | CAPWAP_BEGIN_FRAG)
#define FRAG_LAST_HDR (FRAG_HDR | CAPWAP_BEGIN_LAST)
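
/*
 * Added illustration (following RFC 5415's bit layout; not required by the
 * code): in the common non-fragmented case the first 32-bit word is
 *
 *	NO_FRAG_HDR = CAPWAP_BEGIN_HLEN | CAPWAP_BEGIN_WBID
 *		    = __cpu_to_be32(0x00100200)
 *
 * i.e. HLEN = 2 (two 4-byte words, an 8 byte CAPWAP header) and WBID = 1,
 * with the F (fragment) and L (last fragment) bits clear.
 * process_capwap_proto() exploits this by comparing the whole word against
 * the three precomputed constants instead of extracting individual bit
 * fields.
 */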

struct capwaphdr {
	__be32 begin;
	__be16 frag_id;
	__be16 frag_off;
};

static inline struct capwaphdr *capwap_hdr(const struct sk_buff *skb)
{
	return (struct capwaphdr *)(udp_hdr(skb) + 1);
}

/*
 * The fragment offset is actually the high 13 bits of the last 16 bit field,
 * so we would normally need to right shift 3 places.  However, it stores the
 * offset in 8 byte chunks, which would involve a 3 place left shift.  So we
 * just mask off the last 3 bits and be done with it.
 */
#define FRAG_OFF_MASK (~0x7U)
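
/*
 * Worked example (added): a frag_off field of 0x0050 encodes
 * 0x0050 >> 3 = 10 eight-byte chunks, i.e. a byte offset of 10 << 3 = 80.
 * The two shifts cancel, so 0x0050 & FRAG_OFF_MASK = 80 directly.
 */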

#define CAPWAP_HLEN (sizeof(struct udphdr) + sizeof(struct capwaphdr))

struct frag_match {
	__be32 saddr;
	__be32 daddr;
	__be16 id;
};

struct frag_queue {
	struct inet_frag_queue ifq;
	struct frag_match match;
};

struct frag_skb_cb {
	u16 offset;
};
#define FRAG_CB(skb) ((struct frag_skb_cb *)(skb)->cb)

static struct sk_buff *fragment(struct sk_buff *, const struct vport *,
				struct dst_entry *);
static void defrag_init(void);
static void defrag_exit(void);
static struct sk_buff *defrag(struct sk_buff *, bool frag_last);

static void capwap_frag_init(struct inet_frag_queue *, void *match);
static unsigned int capwap_frag_hash(struct inet_frag_queue *);
static int capwap_frag_match(struct inet_frag_queue *, void *match);
static void capwap_frag_expire(unsigned long ifq);

static struct inet_frags frag_state = {
	.constructor = capwap_frag_init,
	.qsize = sizeof(struct frag_queue),
	.hashfn = capwap_frag_hash,
	.match = capwap_frag_match,
	.frag_expire = capwap_frag_expire,
	.secret_interval = CAPWAP_FRAG_SECRET_INTERVAL,
};

static struct netns_frags frag_netns_state = {
	.timeout = CAPWAP_FRAG_TIMEOUT,
	.high_thresh = CAPWAP_FRAG_MAX_MEM,
	.low_thresh = CAPWAP_FRAG_PRUNE_MEM,
};

static struct socket *capwap_rcv_socket;

static int capwap_hdr_len(const struct tnl_mutable_config *mutable)
{
	/* CAPWAP has no checksums. */
	if (mutable->flags & TNL_F_CSUM)
		return -EINVAL;

	/* CAPWAP has no keys, so check that the configuration for keys is the
	 * default if no key-specific attributes are used.
	 */
	if ((mutable->flags & (TNL_F_IN_KEY_MATCH | TNL_F_OUT_KEY_ACTION)) !=
	    (TNL_F_IN_KEY_MATCH | TNL_F_OUT_KEY_ACTION))
		return -EINVAL;

	return CAPWAP_HLEN;
}

static void capwap_build_header(const struct vport *vport,
				const struct tnl_mutable_config *mutable,
				void *header)
{
	struct udphdr *udph = header;
	struct capwaphdr *cwh = (struct capwaphdr *)(udph + 1);

	udph->source = htons(CAPWAP_SRC_PORT);
	udph->dest = htons(CAPWAP_DST_PORT);
	udph->check = 0;

	cwh->begin = NO_FRAG_HDR;
	cwh->frag_id = 0;
	cwh->frag_off = 0;
}

static struct sk_buff *capwap_update_header(const struct vport *vport,
					    const struct tnl_mutable_config *mutable,
					    struct dst_entry *dst,
					    struct sk_buff *skb)
{
	struct udphdr *udph = udp_hdr(skb);

	udph->len = htons(skb->len - skb_transport_offset(skb));

	if (unlikely(skb->len - skb_network_offset(skb) > dst_mtu(dst)))
		skb = fragment(skb, vport, dst);

	return skb;
}

static inline struct sk_buff *process_capwap_proto(struct sk_buff *skb)
{
	struct capwaphdr *cwh = capwap_hdr(skb);

	if (likely(cwh->begin == NO_FRAG_HDR))
		return skb;
	else if (cwh->begin == FRAG_HDR)
		return defrag(skb, false);
	else if (cwh->begin == FRAG_LAST_HDR)
		return defrag(skb, true);
	else {
		pr_warn("unparsable packet received on capwap socket\n");
		kfree_skb(skb);
		return NULL;
	}
}

/* Called with rcu_read_lock and BH disabled. */
static int capwap_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct vport *vport;
	const struct tnl_mutable_config *mutable;
	struct iphdr *iph;

	if (unlikely(!pskb_may_pull(skb, CAPWAP_HLEN + ETH_HLEN)))
		goto error;

	__skb_pull(skb, CAPWAP_HLEN);
	skb_postpull_rcsum(skb, skb_transport_header(skb), CAPWAP_HLEN + ETH_HLEN);

	skb = process_capwap_proto(skb);
	if (!skb)
		goto out;

	iph = ip_hdr(skb);
	vport = tnl_find_port(iph->daddr, iph->saddr, 0,
			      TNL_T_PROTO_CAPWAP | TNL_T_KEY_EXACT, &mutable);
	if (unlikely(!vport)) {
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
		goto error;
	}

	tnl_rcv(vport, skb);
	goto out;
error:
	kfree_skb(skb);
out:
	return 0;
}

static const struct tnl_ops capwap_tnl_ops = {
	.tunnel_type = TNL_T_PROTO_CAPWAP,
	.ipproto = IPPROTO_UDP,
	.hdr_len = capwap_hdr_len,
	.build_header = capwap_build_header,
	.update_header = capwap_update_header,
};

static struct vport *capwap_create(const struct vport_parms *parms)
{
	return tnl_create(parms, &capwap_vport_ops, &capwap_tnl_ops);
}

/* Random value.  Irrelevant as long as it's not 0 since we set the handler. */
#define UDP_ENCAP_CAPWAP 10
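
/*
 * Added note: the kernel's UDP receive path only hands packets to
 * udp_sk(sk)->encap_rcv when encap_type is nonzero, so any nonzero value
 * here routes traffic arriving on capwap_rcv_socket into capwap_rcv().
 */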

static int capwap_init(void)
{
	int err;
	struct sockaddr_in sin;

	err = sock_create(AF_INET, SOCK_DGRAM, 0, &capwap_rcv_socket);
	if (err)
		goto error;

	sin.sin_family = AF_INET;
	sin.sin_addr.s_addr = htonl(INADDR_ANY);
	sin.sin_port = htons(CAPWAP_DST_PORT);

	err = kernel_bind(capwap_rcv_socket, (struct sockaddr *)&sin,
			  sizeof(struct sockaddr_in));
	if (err)
		goto error_sock;

	udp_sk(capwap_rcv_socket->sk)->encap_type = UDP_ENCAP_CAPWAP;
	udp_sk(capwap_rcv_socket->sk)->encap_rcv = capwap_rcv;

	defrag_init();
	return 0;

error_sock:
	sock_release(capwap_rcv_socket);
error:
	pr_warn("cannot register capwap protocol handler\n");
	return err;
}

static void capwap_exit(void)
{
	defrag_exit();
	sock_release(capwap_rcv_socket);
}

static void copy_skb_metadata(struct sk_buff *from, struct sk_buff *to)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->mark = from->mark;

	if (from->sk)
		skb_set_owner_w(to, from->sk);

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
	to->ipvs_property = from->ipvs_property;
#endif
	skb_copy_secmark(to, from);
}

static struct sk_buff *fragment(struct sk_buff *skb, const struct vport *vport,
				struct dst_entry *dst)
{
	struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
	unsigned int hlen = skb_transport_offset(skb) + CAPWAP_HLEN;
	unsigned int headroom;
	unsigned int max_frame_len = dst_mtu(dst) + skb_network_offset(skb);
	struct sk_buff *result = NULL, *list_cur = NULL;
	unsigned int remaining;
	unsigned int offset;
	__be16 frag_id;

	if (hlen + ~FRAG_OFF_MASK + 1 > max_frame_len) {
		if (net_ratelimit())
			pr_warn("capwap link mtu (%d) is less than minimum packet (%d)\n",
				dst_mtu(dst),
				hlen - skb_network_offset(skb) + ~FRAG_OFF_MASK + 1);
		goto error;
	}

	remaining = skb->len - hlen;
	offset = 0;
	frag_id = htons(atomic_inc_return(&tnl_vport->frag_id));

	headroom = dst->header_len + 16;
	if (!skb_network_offset(skb))
		headroom += LL_RESERVED_SPACE(dst->dev);

	while (remaining) {
		struct sk_buff *skb2;
		unsigned int frag_size;
		struct udphdr *udph;
		struct capwaphdr *cwh;

		frag_size = min(remaining, max_frame_len - hlen);
		if (remaining > frag_size)
			frag_size &= FRAG_OFF_MASK;

		skb2 = alloc_skb(headroom + hlen + frag_size, GFP_ATOMIC);
		if (!skb2)
			goto error;

		skb_reserve(skb2, headroom);
		__skb_put(skb2, hlen + frag_size);

		if (skb_network_offset(skb))
			skb_reset_mac_header(skb2);
		skb_set_network_header(skb2, skb_network_offset(skb));
		skb_set_transport_header(skb2, skb_transport_offset(skb));

		/* Copy (Ethernet)/IP/UDP/CAPWAP header. */
		copy_skb_metadata(skb, skb2);
		skb_copy_from_linear_data(skb, skb2->data, hlen);

		/* Copy this data chunk. */
		if (skb_copy_bits(skb, hlen + offset, skb2->data + hlen, frag_size))
			BUG();

		udph = udp_hdr(skb2);
		udph->len = htons(skb2->len - skb_transport_offset(skb2));

		cwh = capwap_hdr(skb2);
		if (remaining > frag_size)
			cwh->begin = FRAG_HDR;
		else
			cwh->begin = FRAG_LAST_HDR;
		cwh->frag_id = frag_id;
		cwh->frag_off = htons(offset);

		if (result) {
			list_cur->next = skb2;
			list_cur = skb2;
		} else
			result = list_cur = skb2;

		offset += frag_size;
		remaining -= frag_size;
	}

	kfree_skb(skb);
	return result;

error:
	tnl_free_linked_skbs(result);
	kfree_skb(skb);
	return NULL;
}

/* All of the following functions relate to fragmentation reassembly. */

static inline struct frag_queue *ifq_cast(struct inet_frag_queue *ifq)
{
	return container_of(ifq, struct frag_queue, ifq);
}

static u32 frag_hash(struct frag_match *match)
{
	return jhash_3words((__force u16)match->id, (__force u32)match->saddr,
			    (__force u32)match->daddr,
			    frag_state.rnd) & (INETFRAGS_HASHSZ - 1);
}
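
/*
 * Added note: INETFRAGS_HASHSZ is a power of two (64), so masking the jhash
 * value with INETFRAGS_HASHSZ - 1 reduces it modulo the table size without
 * a division.
 */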

static struct frag_queue *queue_find(struct frag_match *match)
{
	struct inet_frag_queue *ifq;

	read_lock(&frag_state.lock);

	ifq = inet_frag_find(&frag_netns_state, &frag_state, match, frag_hash(match));
	if (!ifq)
		return NULL;

	/* Unlock happens inside inet_frag_find(). */

	return ifq_cast(ifq);
}

static struct sk_buff *frag_reasm(struct frag_queue *fq, struct net_device *dev)
{
	struct sk_buff *head = fq->ifq.fragments;
	struct sk_buff *frag;

	/* Succeed or fail, we're done with this queue. */
	inet_frag_kill(&fq->ifq, &frag_state);

	if (fq->ifq.len > 65535)
		return NULL;

	/* Can't have the head be a clone. */
	if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC))
		return NULL;

	/*
	 * We're about to build frag list for this SKB.  If it already has a
	 * frag list, alloc a new SKB and put the existing frag list there.
	 */
	if (skb_shinfo(head)->frag_list) {
		int i;
		int paged_len = 0;

		frag = alloc_skb(0, GFP_ATOMIC);
		if (!frag)
			return NULL;

		frag->next = head->next;
		head->next = frag;
		skb_shinfo(frag)->frag_list = skb_shinfo(head)->frag_list;
		skb_shinfo(head)->frag_list = NULL;

		for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
			paged_len += skb_shinfo(head)->frags[i].size;
		frag->len = frag->data_len = head->data_len - paged_len;
		head->data_len -= frag->len;
		head->len -= frag->len;

		frag->ip_summed = head->ip_summed;
		atomic_add(frag->truesize, &fq->ifq.net->mem);
	}

	skb_shinfo(head)->frag_list = head->next;
	atomic_sub(head->truesize, &fq->ifq.net->mem);

	/* Properly account for data in various packets. */
	for (frag = head->next; frag; frag = frag->next) {
		head->data_len += frag->len;
		head->len += frag->len;

		if (head->ip_summed != frag->ip_summed)
			head->ip_summed = CHECKSUM_NONE;
		else if (head->ip_summed == CHECKSUM_COMPLETE)
			head->csum = csum_add(head->csum, frag->csum);

		head->truesize += frag->truesize;
		atomic_sub(frag->truesize, &fq->ifq.net->mem);
	}

	head->next = NULL;
	head->dev = dev;
	head->tstamp = fq->ifq.stamp;
	fq->ifq.fragments = NULL;

	return head;
}

static struct sk_buff *frag_queue(struct frag_queue *fq, struct sk_buff *skb,
				  u16 offset, bool frag_last)
{
	struct sk_buff *prev, *next;
	struct net_device *dev;
	int end;

	if (fq->ifq.last_in & INET_FRAG_COMPLETE)
		goto error;

	end = offset + skb->len;

	if (frag_last) {
		/*
		 * Last fragment, shouldn't already have data past our end or
		 * have another last fragment.
		 */
		if (end < fq->ifq.len || fq->ifq.last_in & INET_FRAG_LAST_IN)
			goto error;

		fq->ifq.last_in |= INET_FRAG_LAST_IN;
		fq->ifq.len = end;
	} else {
		/* Fragments should align to 8 byte chunks. */
		if (end & ~FRAG_OFF_MASK)
			goto error;

		if (end > fq->ifq.len) {
			/*
			 * Shouldn't have data past the end, if we already
			 * have a last fragment.
			 */
			if (fq->ifq.last_in & INET_FRAG_LAST_IN)
				goto error;

			fq->ifq.len = end;
		}
	}

	/* Find where we fit in. */
	prev = NULL;
	for (next = fq->ifq.fragments; next != NULL; next = next->next) {
		if (FRAG_CB(next)->offset >= offset)
			break;
		prev = next;
	}

	/*
	 * Overlapping fragments aren't allowed.  We shouldn't start before
	 * the end of the previous fragment.
	 */
	if (prev && FRAG_CB(prev)->offset + prev->len > offset)
		goto error;

	/* We also shouldn't end after the beginning of the next fragment. */
	if (next && end > FRAG_CB(next)->offset)
		goto error;

	FRAG_CB(skb)->offset = offset;

	/* Link into list. */
	skb->next = next;
	if (prev)
		prev->next = skb;
	else
		fq->ifq.fragments = skb;

	dev = skb->dev;
	skb->dev = NULL;

	fq->ifq.stamp = skb->tstamp;
	fq->ifq.meat += skb->len;
	atomic_add(skb->truesize, &fq->ifq.net->mem);
	if (offset == 0)
		fq->ifq.last_in |= INET_FRAG_FIRST_IN;

	/* If we have all fragments do reassembly. */
	if (fq->ifq.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
	    fq->ifq.meat == fq->ifq.len)
		return frag_reasm(fq, dev);

	write_lock(&frag_state.lock);
	list_move_tail(&fq->ifq.lru_list, &fq->ifq.net->lru_list);
	write_unlock(&frag_state.lock);

	return NULL;

error:
	kfree_skb(skb);
	return NULL;
}

static struct sk_buff *defrag(struct sk_buff *skb, bool frag_last)
{
	struct iphdr *iph = ip_hdr(skb);
	struct capwaphdr *cwh = capwap_hdr(skb);
	struct frag_match match;
	u16 frag_off;
	struct frag_queue *fq;

	if (atomic_read(&frag_netns_state.mem) > frag_netns_state.high_thresh)
		inet_frag_evictor(&frag_netns_state, &frag_state);

	match.daddr = iph->daddr;
	match.saddr = iph->saddr;
	match.id = cwh->frag_id;
	frag_off = ntohs(cwh->frag_off) & FRAG_OFF_MASK;

	fq = queue_find(&match);
	if (fq) {
		spin_lock(&fq->ifq.lock);
		skb = frag_queue(fq, skb, frag_off, frag_last);
		spin_unlock(&fq->ifq.lock);

		inet_frag_put(&fq->ifq, &frag_state);

		return skb;
	}

	kfree_skb(skb);
	return NULL;
}

static void defrag_init(void)
{
	inet_frags_init(&frag_state);
	inet_frags_init_net(&frag_netns_state);
}

static void defrag_exit(void)
{
	inet_frags_exit_net(&frag_netns_state, &frag_state);
	inet_frags_fini(&frag_state);
}

static void capwap_frag_init(struct inet_frag_queue *ifq, void *match_)
{
	struct frag_match *match = match_;

	ifq_cast(ifq)->match = *match;
}

static unsigned int capwap_frag_hash(struct inet_frag_queue *ifq)
{
	return frag_hash(&ifq_cast(ifq)->match);
}

static int capwap_frag_match(struct inet_frag_queue *ifq, void *a_)
{
	struct frag_match *a = a_;
	struct frag_match *b = &ifq_cast(ifq)->match;

	return a->id == b->id && a->saddr == b->saddr && a->daddr == b->daddr;
}

/* Run when the timeout for a given queue expires. */
static void capwap_frag_expire(unsigned long ifq)
{
	struct frag_queue *fq;

	fq = ifq_cast((struct inet_frag_queue *)ifq);

	spin_lock(&fq->ifq.lock);

	if (!(fq->ifq.last_in & INET_FRAG_COMPLETE))
		inet_frag_kill(&fq->ifq, &frag_state);

	spin_unlock(&fq->ifq.lock);
	inet_frag_put(&fq->ifq, &frag_state);
}

const struct vport_ops capwap_vport_ops = {
	.type = ODP_VPORT_TYPE_CAPWAP,
	.flags = VPORT_F_GEN_STATS,
	.init = capwap_init,
	.exit = capwap_exit,
	.create = capwap_create,
	.destroy = tnl_destroy,
	.set_mtu = tnl_set_mtu,
	.set_addr = tnl_set_addr,
	.get_name = tnl_get_name,
	.get_addr = tnl_get_addr,
	.get_options = tnl_get_options,
	.set_options = tnl_set_options,
	.get_dev_flags = vport_gen_get_dev_flags,
	.is_running = vport_gen_is_running,
	.get_operstate = vport_gen_get_operstate,
	.get_mtu = tnl_get_mtu,
	.send = tnl_send,
};

#endif /* Linux kernel >= 2.6.26 */