2 * Distributed under the terms of the GNU GPL version 2.
3 * Copyright (c) 2007, 2008, 2009, 2010 Nicira Networks.
5 * Significant portions of this file may be copied from parts of the Linux
6 * kernel, by Linus Torvalds and others.
9 /* Functions for executing flow actions. */
11 #include <linux/skbuff.h>
14 #include <linux/tcp.h>
15 #include <linux/udp.h>
16 #include <linux/in6.h>
17 #include <linux/if_vlan.h>
19 #include <net/checksum.h>
23 #include "openvswitch/datapath-protocol.h"
/* Ensures that 'skb' may be modified in place: if it is shared or cloned,
 * replaces it with a private copy that has at least 'min_headroom' bytes of
 * headroom.  'gfp' is the allocation context for the copy.
 * NOTE(review): this listing is gapped — the declaration of 'nskb', the
 * error/return paths, and the tail of the function are not visible here. */
25 static struct sk_buff *
26 make_writable(struct sk_buff *skb, unsigned min_headroom, gfp_t gfp)
28 if (skb_shared(skb) || skb_cloned(skb)) {
/* Keep whatever headroom the original skb already has if it exceeds
 * the requested minimum. */
30 unsigned headroom = max(min_headroom, skb_headroom(skb));
32 nskb = skb_copy_expand(skb, headroom, skb_tailroom(skb), gfp);
34 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24)
35 /* Before 2.6.24 these fields were not copied when
36 * doing an skb_copy_expand. */
37 nskb->ip_summed = skb->ip_summed;
38 nskb->csum = skb->csum;
40 #if defined(CONFIG_XEN) && defined(HAVE_PROTO_DATA_VALID)
41 /* These fields are copied in skb_clone but not in
42 * skb_copy or related functions. We need to manually
43 * copy them over here. */
44 nskb->proto_data_valid = skb->proto_data_valid;
45 nskb->proto_csum_blank = skb->proto_csum_blank;
/* Non-shared path (lines 46-50 missing from this listing): make sure the
 * headers up through the transport header plus a TCP header are in the
 * linear data area so they can be written directly. */
51 unsigned int hdr_len = (skb_transport_offset(skb)
52 + sizeof(struct tcphdr));
53 if (pskb_may_pull(skb, min(hdr_len, skb->len)))
/* Removes the 802.1Q header from 'skb' in place: shifts the Ethernet
 * source/destination addresses forward over the 4-byte VLAN header, adjusts
 * skb->data and the MAC header offset, and restores skb->protocol from the
 * inner EtherType.  Assumes 'skb' is writable (caller responsibility —
 * NOTE(review): not verifiable from this gapped listing). */
61 static struct sk_buff *
62 vlan_pull_tag(struct sk_buff *skb)
64 struct vlan_ethhdr *vh = vlan_eth_hdr(skb);
68 /* Verify we were given a vlan packet */
69 if (vh->h_vlan_proto != htons(ETH_P_8021Q))
/* 2 * VLAN_ETH_ALEN = the 12 bytes of dst + src MAC addresses, moved up
 * over the VLAN tag (regions overlap, hence memmove). */
72 memmove(skb->data + VLAN_HLEN, skb->data, 2 * VLAN_ETH_ALEN);
74 eh = (struct ethhdr *)skb_pull(skb, VLAN_HLEN);
76 skb->protocol = eh->h_proto;
77 skb->mac_header += VLAN_HLEN;
/* Applies an ODPAT_SET_VLAN_VID or ODPAT_SET_VLAN_PCP action to 'skb',
 * updating 'key' to match.  If the packet already carries an 802.1Q header
 * the TCI field is rewritten in place; otherwise a new VLAN header is added
 * (segmenting GSO packets first, see comment below).  Returns the possibly
 * replaced skb, or an ERR_PTR on failure.
 * NOTE(review): this listing is gapped — the declaration/initialization of
 * 'mask' and 'tci', the PCP key update, and most of the per-segment loop are
 * missing here. */
83 static struct sk_buff *
84 modify_vlan_tci(struct datapath *dp, struct sk_buff *skb,
85 struct odp_flow_key *key, const union odp_action *a,
86 int n_actions, gfp_t gfp)
90 if (a->type == ODPAT_SET_VLAN_VID) {
91 tci = ntohs(a->vlan_vid.vlan_vid);
/* Record the new VLAN id in the flow key so later actions see it. */
93 key->dl_vlan = htons(tci & mask);
/* ODPAT_SET_VLAN_PCP: priority occupies the top 3 bits of the TCI. */
95 tci = a->vlan_pcp.vlan_pcp << 13;
/* Reserve VLAN_HLEN headroom in case we need to insert a tag below. */
99 skb = make_writable(skb, VLAN_HLEN, gfp);
101 return ERR_PTR(-ENOMEM);
103 if (skb->protocol == htons(ETH_P_8021Q)) {
104 /* Modify vlan id, but maintain other TCI values */
105 struct vlan_ethhdr *vh = vlan_eth_hdr(skb);
106 vh->h_vlan_TCI = htons((ntohs(vh->h_vlan_TCI) & ~mask) | tci);
108 /* Add vlan header */
110 /* Set up checksumming pointers for checksum-deferred packets
111 * on Xen. Otherwise, dev_queue_xmit() will try to do this
112 * when we send the packet out on the wire, and it will fail at
113 * that point because skb_checksum_setup() will not look inside
114 * an 802.1Q header. */
115 vswitch_skb_checksum_setup(skb);
117 /* GSO is not implemented for packets with an 802.1Q header, so
118 * we have to do segmentation before we add that header.
120 * GSO does work with hardware-accelerated VLAN tagging, but we
121 * can't use hardware-accelerated VLAN tagging since it
122 * requires the device to have a VLAN group configured (with
123 * e.g. vconfig(8)) and we don't do that.
125 * Having to do this here may be a performance loss, since we
126 * can't take advantage of TSO hardware support, although it
127 * does not make a measurable network performance difference
128 * for 1G Ethernet. Fixing that would require patching the
129 * kernel (either to add GSO support to the VLAN protocol or to
130 * support hardware-accelerated VLAN tagging without VLAN
131 * groups configured). */
132 if (skb_is_gso(skb)) {
133 struct sk_buff *segs;
135 segs = skb_gso_segment(skb, 0);
137 if (unlikely(IS_ERR(segs)))
138 return ERR_CAST(segs);
/* Walk the segment list: tag each segment and recursively
 * execute the remaining actions on it.  NOTE(review): the
 * body of this loop, including error handling and freeing of
 * untagged remainders, is largely missing from this listing. */
141 struct sk_buff *nskb = segs->next;
146 segs = __vlan_put_tag(segs, tci);
149 struct odp_flow_key segkey = *key;
150 err = execute_actions(dp, segs,
157 while ((segs = nskb)) {
166 } while (segs->next);
171 /* The hardware-accelerated version of vlan_put_tag() works
172 * only for a device that has a VLAN group configured (with
173 * e.g. vconfig(8)), so call the software-only version
174 * __vlan_put_tag() directly instead.
176 skb = __vlan_put_tag(skb, tci);
178 return ERR_PTR(-ENOMEM);
/* ODPAT_STRIP_VLAN: removes the 802.1Q header (presumably via
 * vlan_pull_tag() — the call is on a line missing from this listing) and
 * marks the flow key as having no VLAN. */
184 static struct sk_buff *strip_vlan(struct sk_buff *skb,
185 struct odp_flow_key *key, gfp_t gfp)
187 skb = make_writable(skb, 0, gfp);
190 key->dl_vlan = htons(ODP_VLAN_NONE);
/* ODPAT_SET_DL_SRC / ODPAT_SET_DL_DST: overwrites the Ethernet source or
 * destination MAC address with 'a->dl_addr'.  Copies the skb first if it is
 * shared, so the write is safe. */
195 static struct sk_buff *set_dl_addr(struct sk_buff *skb,
196 const struct odp_action_dl_addr *a,
199 skb = make_writable(skb, 0, gfp);
201 struct ethhdr *eh = eth_hdr(skb);
202 memcpy(a->type == ODPAT_SET_DL_SRC ? eh->h_source : eh->h_dest,
203 a->dl_addr, ETH_ALEN);
208 /* Updates 'sum', which is a field in 'skb''s data, given that a 4-byte field
209 * covered by the sum has been changed from 'from' to 'to'. If set,
210 * 'pseudohdr' indicates that the field is in the TCP or UDP pseudo-header.
211 * Based on nf_proto_csum_replace4. */
212 static void update_csum(__sum16 *sum, struct sk_buff *skb,
213 __be32 from, __be32 to, int pseudohdr)
/* Incremental checksum update: summing ~from + to is equivalent to
 * subtracting 'from' and adding 'to' in ones-complement arithmetic. */
215 __be32 diff[] = { ~from, to };
217 /* On older kernels, CHECKSUM_PARTIAL and CHECKSUM_COMPLETE are both defined
218 * as CHECKSUM_HW. However, we can make some inferences so that we can update
219 * the checksums appropriately. */
/* NOTE(review): the enum keyword/name and several #if/#else lines of this
 * kernel-version dispatch are missing from this gapped listing. */
221 CSUM_PARTIAL, /* Partial checksum, skb->csum undefined. */
222 CSUM_PACKET, /* In-packet checksum, skb->csum undefined. */
223 CSUM_COMPLETE, /* In-packet checksum, skb->csum valid. */
/* Default: a plain in-packet checksum with no valid skb->csum. */
226 csum_type = CSUM_PACKET;
228 /* Newer kernel, just map between kernel types and ours. */
229 if (skb->ip_summed == CHECKSUM_PARTIAL)
230 csum_type = CSUM_PARTIAL;
231 else if (skb->ip_summed == CHECKSUM_COMPLETE)
232 csum_type = CSUM_COMPLETE;
234 /* In theory this could be either CHECKSUM_PARTIAL or CHECKSUM_COMPLETE.
235 * However, we should only get CHECKSUM_PARTIAL packets from Xen, which
236 * uses some special fields to represent this (see below). Since we
237 * can only make one type work, pick the one that actually happens in
239 if (skb->ip_summed == CHECKSUM_HW)
240 csum_type = CSUM_COMPLETE;
242 #if defined(CONFIG_XEN) && defined(HAVE_PROTO_DATA_VALID)
243 /* Xen has a special way of representing CHECKSUM_PARTIAL on older
245 if (skb->proto_csum_blank)
246 csum_type = CSUM_PARTIAL;
/* For a partial checksum the in-packet value is not yet folded, so only
 * the pseudo-header case needs updating; otherwise fold the delta into
 * the packet checksum (and into skb->csum for COMPLETE + pseudo-header,
 * since the pseudo-header is not part of the packet data sum). */
249 if (csum_type != CSUM_PARTIAL) {
250 *sum = csum_fold(csum_partial((char *)diff, sizeof(diff),
251 ~csum_unfold(*sum)));
252 if (csum_type == CSUM_COMPLETE && pseudohdr)
253 skb->csum = ~csum_partial((char *)diff, sizeof(diff),
255 } else if (pseudohdr)
256 *sum = ~csum_fold(csum_partial((char *)diff, sizeof(diff),
/* ODPAT_SET_NW_SRC / ODPAT_SET_NW_DST: rewrites the IPv4 source or
 * destination address and incrementally fixes up the IP header checksum and,
 * for TCP/UDP, the transport checksum (which covers the address via the
 * pseudo-header).  Non-IPv4 packets are left untouched. */
260 static struct sk_buff *set_nw_addr(struct sk_buff *skb,
261 struct odp_flow_key *key,
262 const struct odp_action_nw_addr *a,
265 if (key->dl_type != htons(ETH_P_IP))
268 skb = make_writable(skb, 0, gfp);
270 struct iphdr *nh = ip_hdr(skb);
271 u32 *f = a->type == ODPAT_SET_NW_SRC ? &nh->saddr : &nh->daddr;
/* NOTE(review): the line saving the old address ('old = *f;', line 272)
 * is missing from this listing. */
273 u32 new = a->nw_addr;
275 if (key->nw_proto == IPPROTO_TCP) {
276 struct tcphdr *th = tcp_hdr(skb);
277 update_csum(&th->check, skb, old, new, 1);
278 } else if (key->nw_proto == IPPROTO_UDP) {
279 struct udphdr *th = udp_hdr(skb);
280 update_csum(&th->check, skb, old, new, 1);
/* The IP header checksum does not use the pseudo-header. */
282 update_csum(&nh->check, skb, old, new, 0);
/* ODPAT_SET_TP_SRC / ODPAT_SET_TP_DST: rewrites the TCP or UDP source or
 * destination port and fixes up the transport checksum.  Only IPv4 TCP/UDP
 * packets are modified; the checksum offset is chosen per protocol so a
 * single udphdr overlay can address the 16-bit port fields (source/dest
 * are at the same offsets in tcphdr and udphdr). */
288 static struct sk_buff *
289 set_tp_port(struct sk_buff *skb, struct odp_flow_key *key,
290 const struct odp_action_tp_port *a,
295 if (key->dl_type != htons(ETH_P_IP))
298 if (key->nw_proto == IPPROTO_TCP)
299 check_ofs = offsetof(struct tcphdr, check);
300 else if (key->nw_proto == IPPROTO_UDP)
301 check_ofs = offsetof(struct udphdr, check);
305 skb = make_writable(skb, 0, gfp);
307 struct udphdr *th = udp_hdr(skb);
308 u16 *f = a->type == ODPAT_SET_TP_SRC ? &th->source : &th->dest;
/* NOTE(review): 'old = *f;' (line 309) is missing from this listing. */
310 u16 new = a->tp_port;
311 update_csum((u16*)(skb_transport_header(skb) + check_ofs),
/* Returns the L3 payload length used for MTU comparison: total length minus
 * the Ethernet header, with a further VLAN_HLEN adjustment for 802.1Q frames
 * (the subtraction line is missing from this gapped listing). */
318 static inline unsigned packet_length(const struct sk_buff *skb)
320 unsigned length = skb->len - ETH_HLEN;
321 if (skb->protocol == htons(ETH_P_8021Q))
/* Transmits 'skb' on its assigned output device.  Drops (with a rate of one
 * warning per occurrence) packets that exceed the device MTU unless GSO will
 * segment them.  NOTE(review): the drop/free path and the dev_queue_xmit()
 * call are missing from this gapped listing. */
326 int dp_xmit_skb(struct sk_buff *skb)
328 struct datapath *dp = skb->dev->br_port->dp;
331 if (packet_length(skb) > skb->dev->mtu && !skb_is_gso(skb)) {
332 printk(KERN_WARNING "%s: dropped over-mtu packet: %d > %d\n",
333 dp_name(dp), packet_length(skb), skb->dev->mtu);
/* Reconcile checksum state before handing off to the stack. */
338 forward_ip_summed(skb);
/* Sends 'skb' out datapath port 'out_port'.  Looks up the bridge port,
 * retargets skb->dev to that port's device, and delivers locally via
 * dp_dev_recv() for internal devices.  NOTE(review): the ODPP_LOCAL check,
 * the external-device transmit path, and the error path are missing from
 * this gapped listing. */
345 do_output(struct datapath *dp, struct sk_buff *skb, int out_port)
347 struct net_bridge_port *p;
348 struct net_device *dev;
353 p = dp->ports[out_port];
357 dev = skb->dev = p->dev;
359 dp_dev_recv(dev, skb);
368 /* Never consumes 'skb'. Returns a port that 'skb' should be sent to, -1 if
/* (continuation of the above contract comment is missing from this gapped
 * listing).  Iterates the ports of group 'group', skipping the ingress port,
 * cloning 'skb' for every port but the last so each gets its own copy. */
370 static int output_group(struct datapath *dp, __u16 group,
371 struct sk_buff *skb, gfp_t gfp)
/* RCU-protected group table; caller must hold rcu_read_lock (convention in
 * this datapath — NOTE(review): not verifiable from this listing). */
373 struct dp_port_group *g = rcu_dereference(dp->groups[group]);
379 for (i = 0; i < g->n_ports; i++) {
380 struct net_bridge_port *p = dp->ports[g->ports[i]];
/* Never send a packet back out its ingress port. */
381 if (!p || skb->dev == p->dev)
383 if (prev_port != -1) {
384 struct sk_buff *clone = skb_clone(skb, gfp);
387 do_output(dp, clone, prev_port);
/* Defer sending to this port: it may turn out to be the last one,
 * in which case the caller can use 'skb' itself without a clone. */
389 prev_port = p->port_no;
/* ODPAT_CONTROLLER: clones 'skb' and queues the clone to userspace as an
 * _ODPL_ACTION_NR message carrying 'arg'.  The original skb is left intact
 * for subsequent actions. */
395 output_control(struct datapath *dp, struct sk_buff *skb, u32 arg, gfp_t gfp)
397 skb = skb_clone(skb, gfp);
400 return dp_output_control(dp, skb, _ODPL_ACTION_NR, arg);
403 /* Execute a list of actions against 'skb'. */
/* Dispatches each of the 'n_actions' entries in 'a' against 'skb', mutating
 * the packet (and 'key') in place for set-field actions and emitting copies
 * for output actions.  Output is deferred via 'prev_port' so the final
 * output can consume 'skb' itself instead of a clone.
 * NOTE(review): several switch labels, error checks, and the function tail
 * are missing from this gapped listing. */
404 int execute_actions(struct datapath *dp, struct sk_buff *skb,
405 struct odp_flow_key *key,
406 const union odp_action *a, int n_actions,
409 /* Every output action needs a separate clone of 'skb', but the common
410 * case is just a single output action, so that doing a clone and
411 * then freeing the original skbuff is wasteful. So the following code
412 * is slightly obscure just to avoid that. */
415 for (; n_actions > 0; a++, n_actions--) {
416 WARN_ON_ONCE(skb_shared(skb));
/* Flush the previously recorded output with a clone, keeping 'skb'
 * for the remaining actions. */
417 if (prev_port != -1) {
418 do_output(dp, skb_clone(skb, gfp), prev_port);
424 prev_port = a->output.port;
427 case ODPAT_OUTPUT_GROUP:
428 prev_port = output_group(dp, a->output_group.group,
432 case ODPAT_CONTROLLER:
433 err = output_control(dp, skb, a->controller.arg, gfp);
440 case ODPAT_SET_VLAN_VID:
441 case ODPAT_SET_VLAN_PCP:
442 skb = modify_vlan_tci(dp, skb, key, a, n_actions, gfp);
447 case ODPAT_STRIP_VLAN:
448 skb = strip_vlan(skb, key, gfp);
451 case ODPAT_SET_DL_SRC:
452 case ODPAT_SET_DL_DST:
453 skb = set_dl_addr(skb, &a->dl_addr, gfp);
456 case ODPAT_SET_NW_SRC:
457 case ODPAT_SET_NW_DST:
458 skb = set_nw_addr(skb, key, &a->nw_addr, gfp);
461 case ODPAT_SET_TP_SRC:
462 case ODPAT_SET_TP_DST:
463 skb = set_tp_port(skb, key, &a->tp_port, gfp);
/* Final deferred output consumes 'skb' itself — no clone needed. */
470 do_output(dp, skb, prev_port);