2 * Copyright (c) 2009, 2010, 2011 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
24 #include <arpa/inet.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_sched.h>
33 #include <linux/rtnetlink.h>
34 #include <linux/sockios.h>
35 #include <linux/version.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <netpacket/packet.h>
40 #include <net/ethernet.h>
42 #include <linux/if_tunnel.h>
43 #include <net/if_arp.h>
44 #include <net/if_packet.h>
45 #include <net/route.h>
46 #include <netinet/in.h>
53 #include "dpif-linux.h"
54 #include "dynamic-string.h"
55 #include "fatal-signal.h"
58 #include "netdev-provider.h"
59 #include "netdev-vport.h"
61 #include "netlink-socket.h"
63 #include "openflow/openflow.h"
65 #include "poll-loop.h"
66 #include "rtnetlink.h"
67 #include "rtnetlink-link.h"
68 #include "socket-util.h"
74 VLOG_DEFINE_THIS_MODULE(netdev_linux);
76 COVERAGE_DEFINE(netdev_get_vlan_vid);
77 COVERAGE_DEFINE(netdev_set_policing);
78 COVERAGE_DEFINE(netdev_arp_lookup);
79 COVERAGE_DEFINE(netdev_get_ifindex);
80 COVERAGE_DEFINE(netdev_get_hwaddr);
81 COVERAGE_DEFINE(netdev_set_hwaddr);
82 COVERAGE_DEFINE(netdev_ethtool);
84 /* These were introduced in Linux 2.6.14, so they might be missing if we have
86 #ifndef ADVERTISED_Pause
87 #define ADVERTISED_Pause (1 << 13)
89 #ifndef ADVERTISED_Asym_Pause
90 #define ADVERTISED_Asym_Pause (1 << 14)
93 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
96 #define TC_RTAB_SIZE 1024
99 static struct rtnetlink_notifier netdev_linux_cache_notifier;
100 static int cache_notifier_refcount;
103 VALID_IFINDEX = 1 << 0,
104 VALID_ETHERADDR = 1 << 1,
108 VALID_CARRIER = 1 << 5,
109 VALID_IS_PSEUDO = 1 << 6, /* Represents is_internal and is_tap. */
110 VALID_POLICING = 1 << 7,
111 VALID_HAVE_VPORT_STATS = 1 << 8
119 /* Traffic control. */
121 /* An instance of a traffic control class. Always associated with a particular
124 * Each TC implementation subclasses this with whatever additional data it
127 const struct tc_ops *ops;
128 struct hmap queues; /* Contains "struct tc_queue"s.
129 * Read by generic TC layer.
130 * Written only by TC implementation. */
133 /* One traffic control queue.
135 * Each TC implementation subclasses this with whatever additional data it
138 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
139 unsigned int queue_id; /* OpenFlow queue ID. */
142 /* A particular kind of traffic control. Each implementation generally maps to
143 * one particular Linux qdisc class.
145 * The functions below return 0 if successful or a positive errno value on
146 * failure, except where otherwise noted. All of them must be provided, except
147 * where otherwise noted. */
149 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
150 * This is null for tc_ops_default and tc_ops_other, for which there are no
151 * appropriate values. */
152 const char *linux_name;
154 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
155 const char *ovs_name;
157 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
158 * queues. The queues are numbered 0 through n_queues - 1. */
159 unsigned int n_queues;
161 /* Called to install this TC class on 'netdev'. The implementation should
162 * make the Netlink calls required to set up 'netdev' with the right qdisc
163 * and configure it according to 'details'. The implementation may assume
164 * that the current qdisc is the default; that is, there is no need for it
165 * to delete the current qdisc before installing itself.
167 * The contents of 'details' should be documented as valid for 'ovs_name'
168 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
169 * (which is built as ovs-vswitchd.conf.db(8)).
171 * This function must return 0 if and only if it sets 'netdev->tc' to an
172 * initialized 'struct tc'.
174 * (This function is null for tc_ops_other, which cannot be installed. For
175 * other TC classes it should always be nonnull.) */
176 int (*tc_install)(struct netdev *netdev, const struct shash *details);
178 /* Called when the netdev code determines (through a Netlink query) that
179 * this TC class's qdisc is installed on 'netdev', but we didn't install
180 * it ourselves and so don't know any of the details.
182 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
183 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
184 * implementation should parse the other attributes of 'nlmsg' as
185 * necessary to determine its configuration. If necessary it should also
186 * use Netlink queries to determine the configuration of queues on
189 * This function must return 0 if and only if it sets 'netdev->tc' to an
190 * initialized 'struct tc'. */
191 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
193 /* Destroys the data structures allocated by the implementation as part of
194 * 'tc'. (This includes destroying 'tc->queues' by calling
197 * The implementation should not need to perform any Netlink calls. If
198 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
199 * (But it may not be desirable.)
201 * This function may be null if 'tc' is trivial. */
202 void (*tc_destroy)(struct tc *tc);
204 /* Retrieves details of 'netdev->tc' configuration into 'details'.
206 * The implementation should not need to perform any Netlink calls, because
207 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
208 * cached the configuration.
210 * The contents of 'details' should be documented as valid for 'ovs_name'
211 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
212 * (which is built as ovs-vswitchd.conf.db(8)).
214 * This function may be null if 'tc' is not configurable.
216 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
218 /* Reconfigures 'netdev->tc' according to 'details', performing any
219 * required Netlink calls to complete the reconfiguration.
221 * The contents of 'details' should be documented as valid for 'ovs_name'
222 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
223 * (which is built as ovs-vswitchd.conf.db(8)).
225 * This function may be null if 'tc' is not configurable.
227 int (*qdisc_set)(struct netdev *, const struct shash *details);
229 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
230 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
232 * The contents of 'details' should be documented as valid for 'ovs_name'
233 * in the "other_config" column in the "Queue" table in
234 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
236 * The implementation should not need to perform any Netlink calls, because
237 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
238 * cached the queue configuration.
240 * This function may be null if 'tc' does not have queues ('n_queues' is
242 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
243 struct shash *details);
245 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
246 * 'details', performing any required Netlink calls to complete the
247 * reconfiguration. The caller ensures that 'queue_id' is less than
250 * The contents of 'details' should be documented as valid for 'ovs_name'
251 * in the "other_config" column in the "Queue" table in
252 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
254 * This function may be null if 'tc' does not have queues or its queues are
255 * not configurable. */
256 int (*class_set)(struct netdev *, unsigned int queue_id,
257 const struct shash *details);
259 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
260 * tc_queue's within 'netdev->tc->queues'.
262 * This function may be null if 'tc' does not have queues or its queues
263 * cannot be deleted. */
264 int (*class_delete)(struct netdev *, struct tc_queue *queue);
266 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
267 * 'struct tc_queue's within 'netdev->tc->queues'.
269 * On success, initializes '*stats'.
271 * This function may be null if 'tc' does not have queues or if it cannot
272 * report queue statistics. */
273 int (*class_get_stats)(const struct netdev *netdev,
274 const struct tc_queue *queue,
275 struct netdev_queue_stats *stats);
277 /* Extracts queue stats from 'nlmsg', which is a response to a
278 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
280 * This function may be null if 'tc' does not have queues or if it cannot
281 * report queue statistics. */
282 int (*class_dump_stats)(const struct netdev *netdev,
283 const struct ofpbuf *nlmsg,
284 netdev_dump_queue_stats_cb *cb, void *aux);
288 tc_init(struct tc *tc, const struct tc_ops *ops)
291 hmap_init(&tc->queues);
295 tc_destroy(struct tc *tc)
297 hmap_destroy(&tc->queues);
300 static const struct tc_ops tc_ops_htb;
301 static const struct tc_ops tc_ops_hfsc;
302 static const struct tc_ops tc_ops_default;
303 static const struct tc_ops tc_ops_other;
305 static const struct tc_ops *tcs[] = {
306 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
307 &tc_ops_hfsc, /* Hierarchical fair service curve. */
308 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
309 &tc_ops_other, /* Some other qdisc. */
313 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
314 static unsigned int tc_get_major(unsigned int handle);
315 static unsigned int tc_get_minor(unsigned int handle);
317 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
318 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
319 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
321 static struct tcmsg *tc_make_request(const struct netdev *, int type,
322 unsigned int flags, struct ofpbuf *);
323 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
325 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
326 struct nlattr **options);
327 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
328 struct nlattr **options,
329 struct netdev_queue_stats *);
330 static int tc_query_class(const struct netdev *,
331 unsigned int handle, unsigned int parent,
332 struct ofpbuf **replyp);
333 static int tc_delete_class(const struct netdev *, unsigned int handle);
335 static int tc_del_qdisc(struct netdev *netdev);
336 static int tc_query_qdisc(const struct netdev *netdev);
338 static int tc_calc_cell_log(unsigned int mtu);
339 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
340 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
341 const struct tc_ratespec *rate);
342 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
344 struct netdev_dev_linux {
345 struct netdev_dev netdev_dev;
347 struct shash_node *shash_node;
348 unsigned int cache_valid;
349 unsigned int change_seq;
351 bool miimon; /* Link status of last poll. */
352 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
353 struct timer miimon_timer;
355 /* The following are figured out "on demand" only. They are only valid
356 * when the corresponding VALID_* bit in 'cache_valid' is set. */
358 uint8_t etheraddr[ETH_ADDR_LEN];
359 struct in_addr address, netmask;
363 bool is_internal; /* Is this an openvswitch internal device? */
364 bool is_tap; /* Is this a tuntap device? */
365 uint32_t kbits_rate; /* Policing data. */
366 uint32_t kbits_burst;
367 bool have_vport_stats;
371 struct tap_state tap;
375 struct netdev_linux {
376 struct netdev netdev;
380 /* Sockets used for ioctl operations. */
381 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
383 /* A Netlink routing socket that is not subscribed to any multicast groups. */
384 static struct nl_sock *rtnl_sock;
386 struct netdev_linux_notifier {
387 struct netdev_notifier notifier;
391 static struct shash netdev_linux_notifiers =
392 SHASH_INITIALIZER(&netdev_linux_notifiers);
393 static struct rtnetlink_notifier netdev_linux_poll_notifier;
395 /* This is set pretty low because we probably won't learn anything from the
396 * additional log messages. */
397 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
399 static int netdev_linux_init(void);
401 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
402 int cmd, const char *cmd_name);
403 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
404 const char *cmd_name);
405 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
406 int cmd, const char *cmd_name);
407 static int get_flags(const struct netdev *, int *flagsp);
408 static int set_flags(struct netdev *, int flags);
409 static int do_get_ifindex(const char *netdev_name);
410 static int get_ifindex(const struct netdev *, int *ifindexp);
411 static int do_set_addr(struct netdev *netdev,
412 int ioctl_nr, const char *ioctl_name,
413 struct in_addr addr);
414 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
415 static int set_etheraddr(const char *netdev_name, int hwaddr_family,
416 const uint8_t[ETH_ADDR_LEN]);
417 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
418 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
419 static int af_packet_sock(void);
420 static void poll_notify(struct list *);
421 static void netdev_linux_miimon_run(void);
422 static void netdev_linux_miimon_wait(void);
425 is_netdev_linux_class(const struct netdev_class *netdev_class)
427 return netdev_class->init == netdev_linux_init;
430 static struct netdev_dev_linux *
431 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
433 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
434 assert(is_netdev_linux_class(netdev_class));
436 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
439 static struct netdev_linux *
440 netdev_linux_cast(const struct netdev *netdev)
442 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
443 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
444 assert(is_netdev_linux_class(netdev_class));
446 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
450 netdev_linux_init(void)
452 static int status = -1;
454 /* Create AF_INET socket. */
455 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
456 status = af_inet_sock >= 0 ? 0 : errno;
458 VLOG_ERR("failed to create inet socket: %s", strerror(status));
461 /* Create rtnetlink socket. */
463 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
465 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
474 netdev_linux_run(void)
476 rtnetlink_link_notifier_run();
477 netdev_linux_miimon_run();
481 netdev_linux_wait(void)
483 rtnetlink_link_notifier_wait();
484 netdev_linux_miimon_wait();
488 netdev_dev_linux_changed(struct netdev_dev_linux *dev)
491 if (!dev->change_seq) {
494 dev->cache_valid = 0;
498 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
499 void *aux OVS_UNUSED)
501 struct netdev_dev_linux *dev;
503 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
505 const struct netdev_class *netdev_class =
506 netdev_dev_get_class(base_dev);
508 if (is_netdev_linux_class(netdev_class)) {
509 dev = netdev_dev_linux_cast(base_dev);
510 netdev_dev_linux_changed(dev);
514 struct shash device_shash;
515 struct shash_node *node;
517 shash_init(&device_shash);
518 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
519 SHASH_FOR_EACH (node, &device_shash) {
521 netdev_dev_linux_changed(dev);
523 shash_destroy(&device_shash);
527 /* Creates system and internal devices. */
529 netdev_linux_create(const struct netdev_class *class,
530 const char *name, const struct shash *args,
531 struct netdev_dev **netdev_devp)
533 struct netdev_dev_linux *netdev_dev;
536 if (!shash_is_empty(args)) {
537 VLOG_WARN("%s: arguments for %s devices should be empty",
541 if (!cache_notifier_refcount) {
542 error = rtnetlink_link_notifier_register(&netdev_linux_cache_notifier,
543 netdev_linux_cache_cb, NULL);
548 cache_notifier_refcount++;
550 netdev_dev = xzalloc(sizeof *netdev_dev);
551 netdev_dev->change_seq = 1;
552 netdev_dev_init(&netdev_dev->netdev_dev, name, args, class);
554 *netdev_devp = &netdev_dev->netdev_dev;
558 /* For most types of netdevs we open the device for each call of
559 * netdev_open(). However, this is not the case with tap devices,
560 * since it is only possible to open the device once. In this
561 * situation we share a single file descriptor, and consequently
562 * buffers, across all readers. Therefore once data is read it will
563 * be unavailable to other reads for tap devices. */
565 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
566 const char *name, const struct shash *args,
567 struct netdev_dev **netdev_devp)
569 struct netdev_dev_linux *netdev_dev;
570 struct tap_state *state;
571 static const char tap_dev[] = "/dev/net/tun";
575 if (!shash_is_empty(args)) {
576 VLOG_WARN("%s: arguments for TAP devices should be empty", name);
579 netdev_dev = xzalloc(sizeof *netdev_dev);
580 state = &netdev_dev->state.tap;
582 /* Open tap device. */
583 state->fd = open(tap_dev, O_RDWR);
586 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
590 /* Create tap device. */
591 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
592 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
593 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
594 VLOG_WARN("%s: creating tap device failed: %s", name,
600 /* Make non-blocking. */
601 error = set_nonblocking(state->fd);
606 netdev_dev_init(&netdev_dev->netdev_dev, name, args, &netdev_tap_class);
607 *netdev_devp = &netdev_dev->netdev_dev;
616 destroy_tap(struct netdev_dev_linux *netdev_dev)
618 struct tap_state *state = &netdev_dev->state.tap;
620 if (state->fd >= 0) {
625 /* Destroys the netdev device 'netdev_dev_'. */
627 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
629 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
630 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
632 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
633 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
636 if (class == &netdev_linux_class || class == &netdev_internal_class) {
637 cache_notifier_refcount--;
639 if (!cache_notifier_refcount) {
640 rtnetlink_link_notifier_unregister(&netdev_linux_cache_notifier);
642 } else if (class == &netdev_tap_class) {
643 destroy_tap(netdev_dev);
652 netdev_linux_open(struct netdev_dev *netdev_dev_, int ethertype,
653 struct netdev **netdevp)
655 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
656 struct netdev_linux *netdev;
657 enum netdev_flags flags;
660 /* Allocate network device. */
661 netdev = xzalloc(sizeof *netdev);
663 netdev_init(&netdev->netdev, netdev_dev_);
665 /* Verify that the device really exists, by attempting to read its flags.
666 * (The flags might be cached, in which case this won't actually do an
669 * Don't do this for "internal" netdevs, though, because those have to be
670 * created as netdev objects before they exist in the kernel, because
671 * creating them in the kernel happens by passing a netdev object to
672 * dpif_port_add(). */
673 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
674 error = netdev_get_flags(&netdev->netdev, &flags);
675 if (error == ENODEV) {
680 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
681 !netdev_dev->state.tap.opened) {
683 /* We assume that the first user of the tap device is the primary user
684 * and give them the tap FD. Subsequent users probably just expect
685 * this to be a system device so open it normally to avoid send/receive
686 * directions appearing to be reversed. */
687 netdev->fd = netdev_dev->state.tap.fd;
688 netdev_dev->state.tap.opened = true;
689 } else if (ethertype != NETDEV_ETH_TYPE_NONE) {
690 struct sockaddr_ll sll;
694 /* Create file descriptor. */
695 protocol = (ethertype == NETDEV_ETH_TYPE_ANY ? ETH_P_ALL
696 : ethertype == NETDEV_ETH_TYPE_802_2 ? ETH_P_802_2
698 netdev->fd = socket(PF_PACKET, SOCK_RAW,
699 (OVS_FORCE int) htons(protocol));
700 if (netdev->fd < 0) {
705 /* Set non-blocking mode. */
706 error = set_nonblocking(netdev->fd);
711 /* Get ethernet device index. */
712 error = get_ifindex(&netdev->netdev, &ifindex);
717 /* Bind to specific ethernet device. */
718 memset(&sll, 0, sizeof sll);
719 sll.sll_family = AF_PACKET;
720 sll.sll_ifindex = ifindex;
722 (struct sockaddr *) &sll, sizeof sll) < 0) {
724 VLOG_ERR("bind to %s failed: %s", netdev_dev_get_name(netdev_dev_),
729 /* Between the socket() and bind() calls above, the socket receives all
730 * packets of the requested type on all system interfaces. We do not
731 * want to receive that data, but there is no way to avoid it. So we
732 * must now drain out the receive queue. */
733 error = drain_rcvbuf(netdev->fd);
739 *netdevp = &netdev->netdev;
743 netdev_uninit(&netdev->netdev, true);
747 /* Closes and destroys 'netdev'. */
749 netdev_linux_close(struct netdev *netdev_)
751 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
753 if (netdev->fd >= 0 && strcmp(netdev_get_type(netdev_), "tap")) {
759 /* Initializes 'sset' with a list of the names of all known network devices. */
761 netdev_linux_enumerate(struct sset *sset)
763 struct if_nameindex *names;
765 names = if_nameindex();
769 for (i = 0; names[i].if_name != NULL; i++) {
770 sset_add(sset, names[i].if_name);
772 if_freenameindex(names);
775 VLOG_WARN("could not obtain list of network device names: %s",
782 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
784 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
786 if (netdev->fd < 0) {
787 /* Device was opened with NETDEV_ETH_TYPE_NONE. */
792 ssize_t retval = read(netdev->fd, data, size);
795 } else if (errno != EINTR) {
796 if (errno != EAGAIN) {
797 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
798 netdev_get_name(netdev_), strerror(errno));
805 /* Registers with the poll loop to wake up from the next call to poll_block()
806 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
808 netdev_linux_recv_wait(struct netdev *netdev_)
810 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
811 if (netdev->fd >= 0) {
812 poll_fd_wait(netdev->fd, POLLIN);
816 /* Discards all packets waiting to be received from 'netdev'. */
818 netdev_linux_drain(struct netdev *netdev_)
820 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
821 if (netdev->fd < 0) {
823 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
825 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
826 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
830 drain_fd(netdev->fd, ifr.ifr_qlen);
833 return drain_rcvbuf(netdev->fd);
837 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
838 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
839 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
840 * the packet is too big or too small to transmit on the device.
842 * The caller retains ownership of 'buffer' in all cases.
844 * The kernel maintains a packet transmission queue, so the caller is not
845 * expected to do additional queuing of packets. */
847 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
849 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
853 if (netdev->fd < 0) {
854 /* Use our AF_PACKET socket to send to this device. */
855 struct sockaddr_ll sll;
862 sock = af_packet_sock();
867 error = get_ifindex(netdev_, &ifindex);
872 /* We don't bother setting most fields in sockaddr_ll because the
873 * kernel ignores them for SOCK_RAW. */
874 memset(&sll, 0, sizeof sll);
875 sll.sll_family = AF_PACKET;
876 sll.sll_ifindex = ifindex;
878 iov.iov_base = (void *) data;
882 msg.msg_namelen = sizeof sll;
885 msg.msg_control = NULL;
886 msg.msg_controllen = 0;
889 retval = sendmsg(sock, &msg, 0);
891 /* Use the netdev's own fd to send to this device. This is
892 * essential for tap devices, because packets sent to a tap device
893 * with an AF_PACKET socket will loop back to be *received* again
894 * on the tap device. */
895 retval = write(netdev->fd, data, size);
899 /* The Linux AF_PACKET implementation never blocks waiting for room
900 * for packets, instead returning ENOBUFS. Translate this into
901 * EAGAIN for the caller. */
902 if (errno == ENOBUFS) {
904 } else if (errno == EINTR) {
906 } else if (errno != EAGAIN) {
907 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
908 netdev_get_name(netdev_), strerror(errno));
911 } else if (retval != size) {
912 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
913 "%zu) on %s", retval, size, netdev_get_name(netdev_));
921 /* Registers with the poll loop to wake up from the next call to poll_block()
922 * when the packet transmission queue has sufficient room to transmit a packet
923 * with netdev_send().
925 * The kernel maintains a packet transmission queue, so the client is not
926 * expected to do additional queuing of packets. Thus, this function is
927 * unlikely to ever be used. It is included for completeness. */
929 netdev_linux_send_wait(struct netdev *netdev_)
931 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
932 if (netdev->fd < 0) {
934 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
935 poll_fd_wait(netdev->fd, POLLOUT);
937 /* TAP device always accepts packets.*/
938 poll_immediate_wake();
942 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
943 * otherwise a positive errno value. */
945 netdev_linux_set_etheraddr(struct netdev *netdev_,
946 const uint8_t mac[ETH_ADDR_LEN])
948 struct netdev_dev_linux *netdev_dev =
949 netdev_dev_linux_cast(netdev_get_dev(netdev_));
952 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
953 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
954 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
956 netdev_dev->cache_valid |= VALID_ETHERADDR;
957 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
965 /* Copies 'netdev''s MAC address into the caller-provided 'mac' buffer,
966 * refreshing the cached address first if it is not currently valid. */
968 netdev_linux_get_etheraddr(const struct netdev *netdev_,
969 uint8_t mac[ETH_ADDR_LEN])
971 struct netdev_dev_linux *netdev_dev =
972 netdev_dev_linux_cast(netdev_get_dev(netdev_));
973 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
974 int error = get_etheraddr(netdev_get_name(netdev_),
975 netdev_dev->etheraddr);
979 netdev_dev->cache_valid |= VALID_ETHERADDR;
981 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
985 /* Stores in '*mtup' the maximum size of transmitted (and received) packets on
986 * 'netdev', in bytes, not including the hardware header; thus, this is
987 * typically 1500 bytes for Ethernet devices. */
989 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
991 struct netdev_dev_linux *netdev_dev =
992 netdev_dev_linux_cast(netdev_get_dev(netdev_));
993 if (!(netdev_dev->cache_valid & VALID_MTU)) {
997 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
998 SIOCGIFMTU, "SIOCGIFMTU");
1002 netdev_dev->mtu = ifr.ifr_mtu;
1003 netdev_dev->cache_valid |= VALID_MTU;
1005 *mtup = netdev_dev->mtu;
1009 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1010 * On failure, returns a negative errno value. */
1012 netdev_linux_get_ifindex(const struct netdev *netdev)
1016 error = get_ifindex(netdev, &ifindex);
1017 return error ? -error : ifindex;
1021 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1023 struct netdev_dev_linux *netdev_dev =
1024 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1029 if (netdev_dev->miimon_interval > 0) {
1030 *carrier = netdev_dev->miimon;
1034 if (!(netdev_dev->cache_valid & VALID_CARRIER)) {
1038 fn = xasprintf("/sys/class/net/%s/carrier",
1039 netdev_get_name(netdev_));
1040 fd = open(fn, O_RDONLY);
1043 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
1047 retval = read(fd, line, sizeof line);
1050 if (error == EINVAL) {
1051 /* This is the normal return value when we try to check carrier
1052 * if the network device is not up. */
1054 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
1057 } else if (retval == 0) {
1059 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
1063 if (line[0] != '0' && line[0] != '1') {
1065 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)",
1069 netdev_dev->carrier = line[0] != '0';
1070 netdev_dev->cache_valid |= VALID_CARRIER;
1072 *carrier = netdev_dev->carrier;
1084 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1085 struct mii_ioctl_data *data)
1090 memset(&ifr, 0, sizeof ifr);
1091 memcpy(&ifr.ifr_data, data, sizeof *data);
1092 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1093 memcpy(data, &ifr.ifr_data, sizeof *data);
1099 netdev_linux_get_miimon(const char *name, bool *miimon)
1101 struct mii_ioctl_data data;
1106 memset(&data, 0, sizeof data);
1107 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1109 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1110 data.reg_num = MII_BMSR;
1111 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1115 *miimon = !!(data.val_out & BMSR_LSTATUS);
1117 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1120 struct ethtool_cmd ecmd;
1122 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1125 memset(&ecmd, 0, sizeof ecmd);
1126 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1129 struct ethtool_value eval;
1131 memcpy(&eval, &ecmd, sizeof eval);
1132 *miimon = !!eval.data;
1134 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1142 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1143 long long int interval)
1145 struct netdev_dev_linux *netdev_dev;
1147 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
1149 interval = interval > 0 ? MAX(interval, 100) : 0;
1150 if (netdev_dev->miimon_interval != interval) {
1151 netdev_dev->miimon_interval = interval;
1152 timer_set_expired(&netdev_dev->miimon_timer);
1159 netdev_linux_miimon_run(void)
1161 struct shash device_shash;
1162 struct shash_node *node;
1164 shash_init(&device_shash);
1165 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1166 SHASH_FOR_EACH (node, &device_shash) {
1167 struct netdev_dev_linux *dev = node->data;
1170 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1174 netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
1175 if (miimon != dev->miimon) {
1178 dev->miimon = miimon;
1179 list = shash_find_data(&netdev_linux_notifiers,
1180 dev->netdev_dev.name);
1184 netdev_dev_linux_changed(dev);
1187 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1190 shash_destroy(&device_shash);
/* Calls timer_wait() on the miimon timer of every 'netdev_linux_class' device
 * whose miimon interval is positive.  The temporary device shash is destroyed
 * before returning. */
1194 netdev_linux_miimon_wait(void)
1196 struct shash device_shash;
1197 struct shash_node *node;
1199 shash_init(&device_shash);
1200 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1201 SHASH_FOR_EACH (node, &device_shash) {
1202 struct netdev_dev_linux *dev = node->data;
1204 if (dev->miimon_interval > 0) {
1205 timer_wait(&dev->miimon_timer);
1208 shash_destroy(&device_shash);
1211 /* Check whether we can use RTM_GETLINK to get network device statistics.
1212 * In pre-2.6.19 kernels, this was only available if wireless extensions were
 * enabled.
 *
 * Probes the loopback device "lo": looks up its ifindex with do_get_ifindex()
 * and then requests its statistics with get_stats_via_netlink(), logging
 * whether rtnetlink or proc will be used.  netdev_linux_get_stats() stores
 * the result in 'use_netlink_stats'.
 *
 * NOTE(review): the return statements are not visible in this listing. */
1215 check_for_working_netlink_stats(void)
1217 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1218 * preferable, so if that works, we'll use it. */
1219 int ifindex = do_get_ifindex("lo");
1221 VLOG_WARN("failed to get ifindex for lo, "
1222 "obtaining netdev stats from proc");
1225 struct netdev_stats stats;
1226 int error = get_stats_via_netlink(ifindex, &stats);
1228 VLOG_DBG("obtaining netdev stats via rtnetlink");
1231 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1232 "via proc (you are probably running a pre-2.6.19 "
1233 "kernel)", strerror(error));
1239 /* Brings the 'is_internal' and 'is_tap' members of 'netdev_dev' up-to-date. */
/* The work is done only while VALID_IS_PSEUDO is clear in 'cache_valid':
 * 'is_tap' is true when the device type is "tap", and 'is_internal' is true
 * for a non-tap device that dpif_linux_is_internal_device() recognizes.
 * VALID_IS_PSEUDO is then set so later calls skip the lookup. */
1241 netdev_linux_update_is_pseudo(struct netdev_dev_linux *netdev_dev)
1243 if (!(netdev_dev->cache_valid & VALID_IS_PSEUDO)) {
1244 const char *name = netdev_dev_get_name(&netdev_dev->netdev_dev);
1245 const char *type = netdev_dev_get_type(&netdev_dev->netdev_dev);
1247 netdev_dev->is_tap = !strcmp(type, "tap");
1248 netdev_dev->is_internal = (!netdev_dev->is_tap
1249 && dpif_linux_is_internal_device(name));
1250 netdev_dev->cache_valid |= VALID_IS_PSEUDO;
/* Helper that netdev_linux_get_stats() applies to pairs of rx/tx counters.
 * NOTE(review): only the signature is visible in this listing; presumably it
 * exchanges '*a' and '*b' -- confirm against the full body. */
1255 swap_uint64(uint64_t *a, uint64_t *b)
1262 /* Retrieves current device stats for 'netdev'. */
/* Source selection, as far as the visible code shows:
 *   - netdev_vport_get_stats() is tried while vport stats are known to work
 *     or have not been probed yet; the outcome is cached in
 *     'have_vport_stats' / VALID_HAVE_VPORT_STATS.
 *   - Otherwise rtnetlink or /proc is used, chosen once per process through
 *     check_for_working_netlink_stats() and remembered in the static
 *     'use_netlink_stats' (-1 means "not probed yet").
 * For internal and tap devices whose stats did not come from the vport layer,
 * rx/tx counters are swapped and the detailed error counters are zeroed. */
1264 netdev_linux_get_stats(const struct netdev *netdev_,
1265 struct netdev_stats *stats)
1267 struct netdev_dev_linux *netdev_dev =
1268 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1269 static int use_netlink_stats = -1;
1272 if (netdev_dev->have_vport_stats ||
1273 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1275 error = netdev_vport_get_stats(netdev_, stats);
1276 netdev_dev->have_vport_stats = !error;
1277 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
1280 if (!netdev_dev->have_vport_stats) {
1281 if (use_netlink_stats < 0) {
1282 use_netlink_stats = check_for_working_netlink_stats();
1284 if (use_netlink_stats) {
1287 error = get_ifindex(netdev_, &ifindex);
1289 error = get_stats_via_netlink(ifindex, stats);
1292 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1296 /* If this port is an internal port then the transmit and receive stats
1297 * will appear to be swapped relative to the other ports since we are the
1298 * one sending the data, not a remote computer. For consistency, we swap
1299 * them back here. This does not apply if we are getting stats from the
1300 * vport layer because it always tracks stats from the perspective of the
 * switch. */
1302 netdev_linux_update_is_pseudo(netdev_dev);
1303 if (!error && !netdev_dev->have_vport_stats &&
1304 (netdev_dev->is_internal || netdev_dev->is_tap)) {
1305 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1306 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1307 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1308 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
/* The per-cause error counters are cleared rather than swapped. */
1309 stats->rx_length_errors = 0;
1310 stats->rx_over_errors = 0;
1311 stats->rx_crc_errors = 0;
1312 stats->rx_frame_errors = 0;
1313 stats->rx_fifo_errors = 0;
1314 stats->rx_missed_errors = 0;
1315 stats->tx_aborted_errors = 0;
1316 stats->tx_carrier_errors = 0;
1317 stats->tx_fifo_errors = 0;
1318 stats->tx_heartbeat_errors = 0;
1319 stats->tx_window_errors = 0;
1325 /* Stores the features supported by 'netdev' into each of '*current',
1326 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1327 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
1328 * successful, otherwise a positive errno value. */
/* Implementation: issues ETHTOOL_GSET for the device and translates the
 * ethtool 'supported' and 'advertising' bitmaps, plus the current speed,
 * duplex and port type, into OFPPF_* bits.  '*peer' is set to 0. */
1330 netdev_linux_get_features(const struct netdev *netdev,
1331 uint32_t *current, uint32_t *advertised,
1332 uint32_t *supported, uint32_t *peer)
1334 struct ethtool_cmd ecmd;
1337 memset(&ecmd, 0, sizeof ecmd);
1338 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1339 ETHTOOL_GSET, "ETHTOOL_GSET");
1344 /* Supported features. */
1346 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1347 *supported |= OFPPF_10MB_HD;
1349 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1350 *supported |= OFPPF_10MB_FD;
1352 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1353 *supported |= OFPPF_100MB_HD;
1355 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1356 *supported |= OFPPF_100MB_FD;
1358 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1359 *supported |= OFPPF_1GB_HD;
1361 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1362 *supported |= OFPPF_1GB_FD;
1364 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1365 *supported |= OFPPF_10GB_FD;
1367 if (ecmd.supported & SUPPORTED_TP) {
1368 *supported |= OFPPF_COPPER;
1370 if (ecmd.supported & SUPPORTED_FIBRE) {
1371 *supported |= OFPPF_FIBER;
1373 if (ecmd.supported & SUPPORTED_Autoneg) {
1374 *supported |= OFPPF_AUTONEG;
1376 if (ecmd.supported & SUPPORTED_Pause) {
1377 *supported |= OFPPF_PAUSE;
1379 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1380 *supported |= OFPPF_PAUSE_ASYM;
1383 /* Advertised features. */
1385 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1386 *advertised |= OFPPF_10MB_HD;
1388 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1389 *advertised |= OFPPF_10MB_FD;
1391 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1392 *advertised |= OFPPF_100MB_HD;
1394 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1395 *advertised |= OFPPF_100MB_FD;
1397 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1398 *advertised |= OFPPF_1GB_HD;
1400 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1401 *advertised |= OFPPF_1GB_FD;
1403 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1404 *advertised |= OFPPF_10GB_FD;
1406 if (ecmd.advertising & ADVERTISED_TP) {
1407 *advertised |= OFPPF_COPPER;
1409 if (ecmd.advertising & ADVERTISED_FIBRE) {
1410 *advertised |= OFPPF_FIBER;
1412 if (ecmd.advertising & ADVERTISED_Autoneg) {
1413 *advertised |= OFPPF_AUTONEG;
1415 if (ecmd.advertising & ADVERTISED_Pause) {
1416 *advertised |= OFPPF_PAUSE;
1418 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1419 *advertised |= OFPPF_PAUSE_ASYM;
1422 /* Current settings. */
/* A nonzero 'ecmd.duplex' selects the full-duplex bit for 10/100/1000;
 * 10G is reported as full duplex only. */
1423 if (ecmd.speed == SPEED_10) {
1424 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1425 } else if (ecmd.speed == SPEED_100) {
1426 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1427 } else if (ecmd.speed == SPEED_1000) {
1428 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1429 } else if (ecmd.speed == SPEED_10000) {
1430 *current = OFPPF_10GB_FD;
1435 if (ecmd.port == PORT_TP) {
1436 *current |= OFPPF_COPPER;
1437 } else if (ecmd.port == PORT_FIBRE) {
1438 *current |= OFPPF_FIBER;
1442 *current |= OFPPF_AUTONEG;
1445 /* Peer advertisements. */
1446 *peer = 0; /* XXX */
1451 /* Set the features advertised by 'netdev' to 'advertise'. */
/* Reads the current settings with ETHTOOL_GSET, rebuilds 'ecmd.advertising'
 * from scratch out of the OFPPF_* bits in 'advertise', and writes the result
 * back with ETHTOOL_SSET.  The return value on that path is the result of the
 * ETHTOOL_SSET request. */
1453 netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1455 struct ethtool_cmd ecmd;
1458 memset(&ecmd, 0, sizeof ecmd);
1459 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1460 ETHTOOL_GSET, "ETHTOOL_GSET");
/* Start from an empty mask so that bits absent from 'advertise' are cleared. */
1465 ecmd.advertising = 0;
1466 if (advertise & OFPPF_10MB_HD) {
1467 ecmd.advertising |= ADVERTISED_10baseT_Half;
1469 if (advertise & OFPPF_10MB_FD) {
1470 ecmd.advertising |= ADVERTISED_10baseT_Full;
1472 if (advertise & OFPPF_100MB_HD) {
1473 ecmd.advertising |= ADVERTISED_100baseT_Half;
1475 if (advertise & OFPPF_100MB_FD) {
1476 ecmd.advertising |= ADVERTISED_100baseT_Full;
1478 if (advertise & OFPPF_1GB_HD) {
1479 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1481 if (advertise & OFPPF_1GB_FD) {
1482 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1484 if (advertise & OFPPF_10GB_FD) {
1485 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1487 if (advertise & OFPPF_COPPER) {
1488 ecmd.advertising |= ADVERTISED_TP;
1490 if (advertise & OFPPF_FIBER) {
1491 ecmd.advertising |= ADVERTISED_FIBRE;
1493 if (advertise & OFPPF_AUTONEG) {
1494 ecmd.advertising |= ADVERTISED_Autoneg;
1496 if (advertise & OFPPF_PAUSE) {
1497 ecmd.advertising |= ADVERTISED_Pause;
1499 if (advertise & OFPPF_PAUSE_ASYM) {
1500 ecmd.advertising |= ADVERTISED_Asym_Pause;
1502 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1503 ETHTOOL_SSET, "ETHTOOL_SSET");
1506 /* If 'netdev_name' is the name of a VLAN network device (e.g. one created with
1507 * vconfig(8)), sets '*vlan_vid' to the VLAN VID associated with that device
1508 * and returns 0. Otherwise returns an errno value (specifically ENOENT if
1509 * 'netdev_name' is the name of a network device that is not a VLAN device) and
1510 * sets '*vlan_vid' to -1. */
/* The VID is taken from the first line of /proc/net/vlan/<name>, which must
 * match "<word> VID: <number>". */
1512 netdev_linux_get_vlan_vid(const struct netdev *netdev, int *vlan_vid)
1514 const char *netdev_name = netdev_get_name(netdev);
1515 struct ds line = DS_EMPTY_INITIALIZER;
1516 FILE *stream = NULL;
1520 COVERAGE_INC(netdev_get_vlan_vid);
1521 fn = xasprintf("/proc/net/vlan/%s", netdev_name);
1522 stream = fopen(fn, "r");
/* A nonzero result from ds_get_line() means no line was read: either a read
 * error or end of file. */
1528 if (ds_get_line(&line, stream)) {
1529 if (ferror(stream)) {
1531 VLOG_ERR_RL(&rl, "error reading \"%s\": %s", fn, strerror(errno));
1534 VLOG_ERR_RL(&rl, "unexpected end of file reading \"%s\"", fn);
/* Require exactly one converted field.  sscanf() returns EOF, which is
 * nonzero, when the input ends before the first conversion, so testing the
 * result with '!' would accept an empty line and leave '*vlan_vid' unset. */
1539 if (sscanf(ds_cstr(&line), "%*s VID: %d", vlan_vid) != 1) {
1541 VLOG_ERR_RL(&rl, "parse error reading \"%s\" line 1: \"%s\"",
1542 fn, ds_cstr(&line));
/* Shell command templates used by netdev_linux_set_policing().
 * POLICE_ADD_CMD takes the device name.  POLICE_CONFIG_CMD takes the device
 * name, the rate and the burst; the latter two are passed as uint32_t, so
 * they are formatted as unsigned ("%u") to keep values above INT_MAX from
 * being printed as negative numbers. */
1560 #define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress"
1561 #define POLICE_CONFIG_CMD "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 u32 match ip src 0.0.0.0/0 police rate %ukbit burst %uk mtu 65535 drop flowid :1"
1563 /* Remove ingress policing from 'netdev'. Returns 0 if successful, otherwise a
1564 * positive errno value.
1566 * This function is equivalent to running
1567 * /sbin/tc qdisc del dev %s handle ffff: ingress
1568 * but it is much, much faster.
1571 netdev_linux_remove_policing(struct netdev *netdev)
1573 struct netdev_dev_linux *netdev_dev =
1574 netdev_dev_linux_cast(netdev_get_dev(netdev));
1575 const char *netdev_name = netdev_get_name(netdev);
1577 struct ofpbuf request;
1578 struct tcmsg *tcmsg;
1581 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
1585 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
1586 tcmsg->tcm_parent = TC_H_INGRESS;
1587 nl_msg_put_string(&request, TCA_KIND, "ingress");
1588 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
1590 error = tc_transact(&request, NULL);
1591 if (error && error != ENOENT && error != EINVAL) {
1592 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1593 netdev_name, strerror(error));
1597 netdev_dev->kbits_rate = 0;
1598 netdev_dev->kbits_burst = 0;
1599 netdev_dev->cache_valid |= VALID_POLICING;
1603 /* Attempts to set input rate limiting (policing) policy. */
/* 'kbits_burst' is normalized first: forced to 0 when 'kbits_rate' is 0 and
 * defaulted to 1000 when a rate is given without a burst.  If the cached
 * settings (VALID_POLICING) already match, the visible code treats the call
 * as a no-op.  Otherwise any existing policing is removed and the
 * POLICE_ADD_CMD / POLICE_CONFIG_CMD shell commands are run, after which the
 * new values are cached.
 *
 * NOTE(review): both commands are built by interpolating the device name and
 * executed through system(); confirm device names are trusted here. */
1605 netdev_linux_set_policing(struct netdev *netdev,
1606 uint32_t kbits_rate, uint32_t kbits_burst)
1608 struct netdev_dev_linux *netdev_dev =
1609 netdev_dev_linux_cast(netdev_get_dev(netdev));
1610 const char *netdev_name = netdev_get_name(netdev);
1613 COVERAGE_INC(netdev_set_policing);
1615 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1616 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1617 : kbits_burst); /* Stick with user-specified value. */
1619 if (netdev_dev->cache_valid & VALID_POLICING
1620 && netdev_dev->kbits_rate == kbits_rate
1621 && netdev_dev->kbits_burst == kbits_burst) {
1622 /* Assume that settings haven't changed since we last set them. */
1626 netdev_linux_remove_policing(netdev);
1628 snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
1629 if (system(command) != 0) {
1630 VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
1634 snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
1635 kbits_rate, kbits_burst);
1636 if (system(command) != 0) {
1637 VLOG_WARN_RL(&rl, "%s: problem configuring policing",
1642 netdev_dev->kbits_rate = kbits_rate;
1643 netdev_dev->kbits_burst = kbits_burst;
1644 netdev_dev->cache_valid |= VALID_POLICING;
/* Adds to 'types' the OVS name of every entry in the NULL-terminated 'tcs'
 * table that has a 'tc_install' callback and a non-empty 'ovs_name'.
 * 'netdev' is unused. */
1651 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1654 const struct tc_ops **opsp;
1656 for (opsp = tcs; *opsp != NULL; opsp++) {
1657 const struct tc_ops *ops = *opsp;
1658 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1659 sset_add(types, ops->ovs_name);
/* Scans the NULL-terminated 'tcs' table for the entry whose 'ovs_name' equals
 * 'name'.  NOTE(review): the return statements are not visible in this
 * listing; presumably the match is returned, or NULL when there is none. */
1665 static const struct tc_ops *
1666 tc_lookup_ovs_name(const char *name)
1668 const struct tc_ops **opsp;
1670 for (opsp = tcs; *opsp != NULL; opsp++) {
1671 const struct tc_ops *ops = *opsp;
1672 if (!strcmp(name, ops->ovs_name)) {
/* Scans the NULL-terminated 'tcs' table for the entry whose 'linux_name'
 * equals 'name'.  Entries with a null 'linux_name' are never compared.
 * NOTE(review): the return statements are not visible in this listing. */
1679 static const struct tc_ops *
1680 tc_lookup_linux_name(const char *name)
1682 const struct tc_ops **opsp;
1684 for (opsp = tcs; *opsp != NULL; opsp++) {
1685 const struct tc_ops *ops = *opsp;
1686 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Searches the bucket selected by 'hash' in the queue map of 'netdev''s
 * traffic-control state for a queue whose 'queue_id' matches.
 * NOTE(review): the remaining parameter line and the return statements are
 * not visible in this listing. */
1693 static struct tc_queue *
1694 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1697 struct netdev_dev_linux *netdev_dev =
1698 netdev_dev_linux_cast(netdev_get_dev(netdev));
1699 struct tc_queue *queue;
1701 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1702 if (queue->queue_id == queue_id) {
/* Convenience wrapper around tc_find_queue__() that computes the hash of
 * 'queue_id' with hash_int(queue_id, 0). */
1709 static struct tc_queue *
1710 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1712 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Looks up the traffic-control implementation named 'type' and copies its
 * 'n_queues' into 'caps->n_queues'.  'netdev' is unused.
 * NOTE(review): the handling of an unknown 'type' is not visible here. */
1716 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1718 struct netdev_qos_capabilities *caps)
1720 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1724 caps->n_queues = ops->n_queues;
/* Refreshes the qdisc state of 'netdev' with tc_query_qdisc(), stores the OVS
 * name of the active implementation in '*typep', and, when that
 * implementation provides 'qdisc_get', returns its result for 'details'. */
1729 netdev_linux_get_qos(const struct netdev *netdev,
1730 const char **typep, struct shash *details)
1732 struct netdev_dev_linux *netdev_dev =
1733 netdev_dev_linux_cast(netdev_get_dev(netdev));
1736 error = tc_query_qdisc(netdev);
1741 *typep = netdev_dev->tc->ops->ovs_name;
1742 return (netdev_dev->tc->ops->qdisc_get
1743 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
/* Configures QoS of kind 'type' with 'details' on 'netdev'.
 *
 * 'type' must name an implementation that has a 'tc_install' callback.  If it
 * is the implementation already in use, only its optional 'qdisc_set' is
 * invoked (0 is returned when there is none).  Otherwise the existing qdisc is
 * deleted and the new one installed; the assertions check that
 * 'netdev_dev->tc' is null after deletion and is non-null exactly when
 * installation succeeded. */
1748 netdev_linux_set_qos(struct netdev *netdev,
1749 const char *type, const struct shash *details)
1751 struct netdev_dev_linux *netdev_dev =
1752 netdev_dev_linux_cast(netdev_get_dev(netdev));
1753 const struct tc_ops *new_ops;
1756 new_ops = tc_lookup_ovs_name(type);
1757 if (!new_ops || !new_ops->tc_install) {
1761 error = tc_query_qdisc(netdev);
1766 if (new_ops == netdev_dev->tc->ops) {
1767 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1769 /* Delete existing qdisc. */
1770 error = tc_del_qdisc(netdev);
1774 assert(netdev_dev->tc == NULL);
1776 /* Install new qdisc. */
1777 error = new_ops->tc_install(netdev, details);
1778 assert((error == 0) == (netdev_dev->tc != NULL));
/* Refreshes the qdisc state of 'netdev', looks up queue 'queue_id' with
 * tc_find_queue(), and reports its configuration through the active
 * implementation's 'class_get' callback into 'details'.
 * NOTE(review): the error paths (query failure, missing queue or callback) are
 * not visible in this listing. */
1785 netdev_linux_get_queue(const struct netdev *netdev,
1786 unsigned int queue_id, struct shash *details)
1788 struct netdev_dev_linux *netdev_dev =
1789 netdev_dev_linux_cast(netdev_get_dev(netdev));
1792 error = tc_query_qdisc(netdev);
1796 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1798 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
/* Refreshes the qdisc state of 'netdev' and configures queue 'queue_id' from
 * 'details' through the active implementation's 'class_set' callback.
 * A 'queue_id' at or beyond the implementation's 'n_queues', or a missing
 * 'class_set', takes a separate branch whose result is not visible here. */
1804 netdev_linux_set_queue(struct netdev *netdev,
1805 unsigned int queue_id, const struct shash *details)
1807 struct netdev_dev_linux *netdev_dev =
1808 netdev_dev_linux_cast(netdev_get_dev(netdev));
1811 error = tc_query_qdisc(netdev);
1814 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1815 || !netdev_dev->tc->ops->class_set) {
1819 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
/* Refreshes the qdisc state of 'netdev' and deletes queue 'queue_id' through
 * the active implementation's 'class_delete' callback, after locating the
 * queue with tc_find_queue().  A missing 'class_delete' takes a separate
 * branch whose result is not visible in this listing. */
1823 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1825 struct netdev_dev_linux *netdev_dev =
1826 netdev_dev_linux_cast(netdev_get_dev(netdev));
1829 error = tc_query_qdisc(netdev);
1832 } else if (!netdev_dev->tc->ops->class_delete) {
1835 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1837 ? netdev_dev->tc->ops->class_delete(netdev, queue)
/* Refreshes the qdisc state of 'netdev' and fills 'stats' for queue
 * 'queue_id' through the active implementation's 'class_get_stats' callback,
 * after locating the queue with tc_find_queue().  A missing
 * 'class_get_stats' takes a separate branch whose result is not visible in
 * this listing. */
1843 netdev_linux_get_queue_stats(const struct netdev *netdev,
1844 unsigned int queue_id,
1845 struct netdev_queue_stats *stats)
1847 struct netdev_dev_linux *netdev_dev =
1848 netdev_dev_linux_cast(netdev_get_dev(netdev));
1851 error = tc_query_qdisc(netdev);
1854 } else if (!netdev_dev->tc->ops->class_get_stats) {
1857 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1859 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
/* Starts a Netlink dump of the traffic-control classes on 'netdev'.
 *
 * Builds an RTM_GETTCLASS request with 'tcm_parent' set to 0, hands it to
 * nl_dump_start() on 'rtnl_sock' with 'dump' as the dump state, and then
 * releases the request buffer.  netdev_linux_dump_queue_stats() treats a zero
 * (false) result as failure. */
1865 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1867 struct ofpbuf request;
1868 struct tcmsg *tcmsg;
1870 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1874 tcmsg->tcm_parent = 0;
1875 nl_dump_start(dump, rtnl_sock, &request);
1876 ofpbuf_uninit(&request);
/* Calls 'cb' with (queue id, details, 'aux') for the queues recorded in
 * 'netdev''s traffic-control state.
 *
 * For each queue the 'details' shash is cleared, refilled by the active
 * implementation's 'class_get' callback, and passed to 'cb'.  A missing
 * 'class_get' takes a separate branch whose result is not visible in this
 * listing. */
1881 netdev_linux_dump_queues(const struct netdev *netdev,
1882 netdev_dump_queues_cb *cb, void *aux)
1884 struct netdev_dev_linux *netdev_dev =
1885 netdev_dev_linux_cast(netdev_get_dev(netdev));
1886 struct tc_queue *queue;
1887 struct shash details;
1891 error = tc_query_qdisc(netdev);
1894 } else if (!netdev_dev->tc->ops->class_get) {
1899 shash_init(&details);
1900 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
/* Reuse one shash across iterations; drop the previous queue's entries. */
1901 shash_clear(&details);
1903 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
1905 (*cb)(queue->queue_id, &details, aux);
1910 shash_destroy(&details);
/* Dumps per-queue statistics for 'netdev' by walking a Netlink class dump.
 *
 * Each message obtained from nl_dump_next() is handed to the active
 * implementation's 'class_dump_stats' callback together with 'cb' and 'aux'.
 * The final return value is the error from nl_dump_done() if there is one,
 * otherwise 'last_error'. */
1916 netdev_linux_dump_queue_stats(const struct netdev *netdev,
1917 netdev_dump_queue_stats_cb *cb, void *aux)
1919 struct netdev_dev_linux *netdev_dev =
1920 netdev_dev_linux_cast(netdev_get_dev(netdev));
1921 struct nl_dump dump;
1926 error = tc_query_qdisc(netdev);
1929 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1934 if (!start_queue_dump(netdev, &dump)) {
1937 while (nl_dump_next(&dump, &msg)) {
1938 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1944 error = nl_dump_done(&dump);
1945 return error ? error : last_error;
/* Stores the IPv4 address and netmask of 'netdev_' in '*address' and
 * '*netmask'.
 *
 * The values are fetched with SIOCGIFADDR and SIOCGIFNETMASK only while
 * VALID_IN4 is clear in 'cache_valid'; afterwards the cached copies are used.
 * On the visible final path, returns EADDRNOTAVAIL when the address is
 * INADDR_ANY and 0 otherwise. */
1949 netdev_linux_get_in4(const struct netdev *netdev_,
1950 struct in_addr *address, struct in_addr *netmask)
1952 struct netdev_dev_linux *netdev_dev =
1953 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1955 if (!(netdev_dev->cache_valid & VALID_IN4)) {
1958 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
1959 SIOCGIFADDR, "SIOCGIFADDR");
1964 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
1965 SIOCGIFNETMASK, "SIOCGIFNETMASK");
1970 netdev_dev->cache_valid |= VALID_IN4;
1972 *address = netdev_dev->address;
1973 *netmask = netdev_dev->netmask;
1974 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
/* Assigns IPv4 'address' and 'netmask' to 'netdev_'.
 *
 * The address is set with SIOCSIFADDR, the cache (VALID_IN4, 'address',
 * 'netmask') is updated, and the netmask is set with SIOCSIFNETMASK only when
 * 'address' is not INADDR_ANY.
 * NOTE(review): the guard between the first ioctl and the cache update is not
 * visible in this listing. */
1978 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
1979 struct in_addr netmask)
1981 struct netdev_dev_linux *netdev_dev =
1982 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1985 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
1987 netdev_dev->cache_valid |= VALID_IN4;
1988 netdev_dev->address = address;
1989 netdev_dev->netmask = netmask;
1990 if (address.s_addr != INADDR_ANY) {
1991 error = do_set_addr(netdev_, SIOCSIFNETMASK,
1992 "SIOCSIFNETMASK", netmask);
/* Parses one line in the format that netdev_linux_get_in6() reads from
 * /proc/net/if_inet6.
 *
 * The format consumes sixteen 2-digit hexadecimal bytes into 'in6->s6_addr',
 * skips four hexadecimal fields, and reads an interface name of at most 16
 * characters into 'ifname'.
 * NOTE(review): the scanning call and the return expression are not visible
 * in this listing. */
1999 parse_if_inet6_line(const char *line,
2000 struct in6_addr *in6, char ifname[16 + 1])
2002 uint8_t *s6 = in6->s6_addr;
2003 #define X8 "%2"SCNx8
2005 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2006 "%*x %*x %*x %*x %16s\n",
2007 &s6[0], &s6[1], &s6[2], &s6[3],
2008 &s6[4], &s6[5], &s6[6], &s6[7],
2009 &s6[8], &s6[9], &s6[10], &s6[11],
2010 &s6[12], &s6[13], &s6[14], &s6[15],
2014 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2015 * 'in6' is non-null) and returns true. Otherwise, returns false. */
/* The lookup is cached: while VALID_IN6 is clear, the cached address is reset
 * to 'in6addr_any' and /proc/net/if_inet6 is scanned line by line for an
 * entry whose interface name equals this device's name. */
2017 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2019 struct netdev_dev_linux *netdev_dev =
2020 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2021 if (!(netdev_dev->cache_valid & VALID_IN6)) {
2025 netdev_dev->in6 = in6addr_any;
2027 file = fopen("/proc/net/if_inet6", "r");
2029 const char *name = netdev_get_name(netdev_);
2030 while (fgets(line, sizeof line, file)) {
2031 struct in6_addr in6_tmp;
2032 char ifname[16 + 1];
2033 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2034 && !strcmp(name, ifname))
2036 netdev_dev->in6 = in6_tmp;
2042 netdev_dev->cache_valid |= VALID_IN6;
2044 *in6 = netdev_dev->in6;
/* Fills '*sa' with an AF_INET socket address holding 'addr'.
 *
 * A zeroed 'struct sockaddr_in' is built locally and copied over a zeroed
 * '*sa', so any bytes of '*sa' beyond the copied structure stay 0. */
2049 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2051 struct sockaddr_in sin;
2052 memset(&sin, 0, sizeof sin);
2053 sin.sin_family = AF_INET;
2054 sin.sin_addr = addr;
2057 memset(sa, 0, sizeof *sa);
2058 memcpy(sa, &sin, sizeof sin);
/* Issues address-setting ioctl 'ioctl_nr' for 'netdev' with IPv4 'addr'.
 *
 * The request carries the device name in 'ifr_name' and 'addr' as an AF_INET
 * sockaddr in 'ifr_addr'; the result of netdev_linux_do_ioctl() is returned.
 * 'ioctl_name' is the textual name of the ioctl (callers pass e.g.
 * "SIOCSIFADDR"). */
2062 do_set_addr(struct netdev *netdev,
2063 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2066 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2067 make_in4_sockaddr(&ifr.ifr_addr, addr);
2069 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2073 /* Adds 'router' as a default IP gateway. */
/* Builds a route entry with destination and genmask INADDR_ANY, gateway
 * 'router' and flags RTF_UP | RTF_GATEWAY, and submits it with SIOCADDRT on
 * 'af_inet_sock'.  A failure is logged with the errno text.  'netdev' is
 * unused. */
2075 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2077 struct in_addr any = { INADDR_ANY };
2081 memset(&rt, 0, sizeof rt);
2082 make_in4_sockaddr(&rt.rt_dst, any);
2083 make_in4_sockaddr(&rt.rt_gateway, router);
2084 make_in4_sockaddr(&rt.rt_genmask, any);
2085 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2086 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2088 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
/* Looks up the route to 'host' in /proc/net/route.
 *
 * '*netdev_name' starts out NULL.  Each line is parsed into its eleven
 * fields; unparsable lines are logged and routes without RTF_UP are ignored.
 * For a route whose masked destination matches 'host', '*next_hop' is set to
 * 0 (directly reachable) or to the gateway address, and '*netdev_name'
 * receives an xstrdup()'d copy of the interface name.
 * NOTE(review): the test that chooses between the two next-hop cases and the
 * return statements are not visible in this listing. */
2094 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2097 static const char fn[] = "/proc/net/route";
2102 *netdev_name = NULL;
2103 stream = fopen(fn, "r");
2104 if (stream == NULL) {
2105 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2110 while (fgets(line, sizeof line, stream)) {
2113 ovs_be32 dest, gateway, mask;
2114 int refcnt, metric, mtu;
2115 unsigned int flags, use, window, irtt;
2118 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2120 iface, &dest, &gateway, &flags, &refcnt,
2121 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2123 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2127 if (!(flags & RTF_UP)) {
2128 /* Skip routes that aren't up. */
2132 /* The output of 'dest', 'mask', and 'gateway' were given in
2133 * network byte order, so we don't need any endian
2134 * conversions here. */
2135 if ((dest & mask) == (host->s_addr & mask)) {
2137 /* The host is directly reachable. */
2138 next_hop->s_addr = 0;
2140 /* To reach the host, we must go through a gateway. */
2141 next_hop->s_addr = gateway;
2143 *netdev_name = xstrdup(iface);
/* Adds driver information for 'netdev' to 'sh'.
 *
 * Queries the device with ETHTOOL_GDRVINFO and stores heap-allocated copies
 * of the driver name, driver version and firmware version under the keys
 * "driver_name", "driver_version" and "firmware_version". */
2155 netdev_linux_get_status(const struct netdev *netdev, struct shash *sh)
2157 struct ethtool_drvinfo drvinfo;
2160 memset(&drvinfo, 0, sizeof drvinfo);
2161 error = netdev_linux_do_ethtool(netdev_get_name(netdev),
2162 (struct ethtool_cmd *)&drvinfo,
2164 "ETHTOOL_GDRVINFO");
2166 shash_add(sh, "driver_name", xstrdup(drvinfo.driver));
2167 shash_add(sh, "driver_version", xstrdup(drvinfo.version));
2168 shash_add(sh, "firmware_version", xstrdup(drvinfo.fw_version));
2174 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2175 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2176 * returns 0. Otherwise, it returns a positive errno value; in particular,
2177 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
/* The lookup is a SIOCGARP ioctl on 'af_inet_sock' whose request names the
 * device in 'arp_dev' and carries 'ip' as an AF_INET protocol address.
 * Failures other than ENXIO are logged. */
2179 netdev_linux_arp_lookup(const struct netdev *netdev,
2180 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2183 struct sockaddr_in sin;
2186 memset(&r, 0, sizeof r);
2187 memset(&sin, 0, sizeof sin);
2188 sin.sin_family = AF_INET;
2189 sin.sin_addr.s_addr = ip;
2191 memcpy(&r.arp_pa, &sin, sizeof sin);
2192 r.arp_ha.sa_family = ARPHRD_ETHER;
2194 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2195 COVERAGE_INC(netdev_arp_lookup);
2196 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2198 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2199 } else if (retval != ENXIO) {
2200 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2201 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
/* Translates the NETDEV_UP and NETDEV_PROMISC bits of 'nd' into interface
 * flags; netdev_linux_update_flags() uses the result as an IFF_* mask.
 * NOTE(review): the assignments made for each bit are not visible in this
 * listing. */
2207 nd_to_iff_flags(enum netdev_flags nd)
2210 if (nd & NETDEV_UP) {
2213 if (nd & NETDEV_PROMISC) {
/* Translates interface flags 'iff' into 'enum netdev_flags' bits, starting
 * from 0; IFF_PROMISC maps to NETDEV_PROMISC.
 * NOTE(review): any other mappings and the return statement are not visible
 * in this listing. */
2220 iff_to_nd_flags(int iff)
2222 enum netdev_flags nd = 0;
2226 if (iff & IFF_PROMISC) {
2227 nd |= NETDEV_PROMISC;
/* Turns off the flags in 'off' and turns on the flags in 'on' for 'netdev',
 * reporting the previous flags through '*old_flagsp'.
 *
 * The current interface flags are read with get_flags(); set_flags() is
 * called only when the computed flags differ from the current ones. */
2233 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2234 enum netdev_flags on, enum netdev_flags *old_flagsp)
2236 int old_flags, new_flags;
2239 error = get_flags(netdev, &old_flags);
2241 *old_flagsp = iff_to_nd_flags(old_flags);
/* Clear the 'off' bits first, then apply the 'on' bits. */
2242 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2243 if (new_flags != old_flags) {
2244 error = set_flags(netdev, new_flags);
/* Visits every 'struct netdev_linux_notifier' on 'list' and takes the address
 * of its embedded 'struct netdev_notifier'.
 * NOTE(review): the statement that uses 'n' is not visible in this listing;
 * presumably it invokes the notifier's callback. */
2251 poll_notify(struct list *list)
2253 struct netdev_linux_notifier *notifier;
2254 LIST_FOR_EACH (notifier, node, list) {
2255 struct netdev_notifier *n = &notifier->notifier;
/* Link-change callback registered by netdev_linux_poll_add() through
 * rtnetlink_link_notifier_register().
 *
 * The visible code has two paths: one looks up a single notifier list in
 * 'netdev_linux_notifiers', the other runs poll_notify() on every list in
 * that table.  'aux' is unused.
 * NOTE(review): the condition selecting between the paths and the lookup key
 * are not visible in this listing. */
2261 netdev_linux_poll_cb(const struct rtnetlink_link_change *change,
2262 void *aux OVS_UNUSED)
2265 struct list *list = shash_find_data(&netdev_linux_notifiers,
2271 struct shash_node *node;
2272 SHASH_FOR_EACH (node, &netdev_linux_notifiers) {
2273 poll_notify(node->data);
/* Registers 'cb' (with 'aux') to be notified about changes to 'netdev' and
 * stores the new notifier in '*notifierp'.
 *
 * When 'netdev_linux_notifiers' is empty, the shared rtnetlink link notifier
 * is registered first.  Notifiers are kept in one list per device name; the
 * list is allocated and added to the table when the lookup finds none. */
2279 netdev_linux_poll_add(struct netdev *netdev,
2280 void (*cb)(struct netdev_notifier *), void *aux,
2281 struct netdev_notifier **notifierp)
2283 const char *netdev_name = netdev_get_name(netdev);
2284 struct netdev_linux_notifier *notifier;
2287 if (shash_is_empty(&netdev_linux_notifiers)) {
2289 error = rtnetlink_link_notifier_register(&netdev_linux_poll_notifier,
2290 netdev_linux_poll_cb, NULL);
2296 list = shash_find_data(&netdev_linux_notifiers, netdev_name);
2298 list = xmalloc(sizeof *list);
2300 shash_add(&netdev_linux_notifiers, netdev_name, list);
2303 notifier = xmalloc(sizeof *notifier);
2304 netdev_notifier_init(&notifier->notifier, netdev, cb, aux);
2305 list_push_back(list, &notifier->node);
2306 *notifierp = &notifier->notifier;
/* Unregisters 'notifier_', which must be embedded in a
 * 'struct netdev_linux_notifier'.
 *
 * The notifier is unlinked from its per-device list.  When that leaves the
 * list empty, the list's entry is deleted from 'netdev_linux_notifiers'; when
 * that in turn leaves the table empty, the shared rtnetlink link notifier is
 * unregistered. */
2311 netdev_linux_poll_remove(struct netdev_notifier *notifier_)
2313 struct netdev_linux_notifier *notifier =
2314 CONTAINER_OF(notifier_, struct netdev_linux_notifier, notifier);
2317 /* Remove 'notifier' from its list. */
2318 list = list_remove(&notifier->node);
2319 if (list_is_empty(list)) {
2320 /* The list is now empty. Remove it from the hash and free it. */
2321 const char *netdev_name = netdev_get_name(notifier->notifier.netdev);
2322 shash_delete(&netdev_linux_notifiers,
2323 shash_find(&netdev_linux_notifiers, netdev_name));
2328 /* If that was the last notifier, unregister. */
2329 if (shash_is_empty(&netdev_linux_notifiers)) {
2330 rtnetlink_link_notifier_unregister(&netdev_linux_poll_notifier);
/* Returns the 'change_seq' member of the device underlying 'netdev'. */
2335 netdev_linux_change_seq(const struct netdev *netdev)
2337 return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
/* Template for the 'struct netdev_class' initializers defined after it.
 *
 * Every class shares the netdev_linux_* callbacks listed here; the macro
 * parameters NAME, CREATE, ENUMERATE and SET_STATS supply the parts that vary.
 * NOTE(review): some entries of the initializer, including the places where
 * the parameters are substituted, are not visible in this listing. */
2340 #define NETDEV_LINUX_CLASS(NAME, CREATE, ENUMERATE, SET_STATS) \
2344 netdev_linux_init, \
2346 netdev_linux_wait, \
2349 netdev_linux_destroy, \
2350 NULL, /* set_config */ \
2352 netdev_linux_open, \
2353 netdev_linux_close, \
2357 netdev_linux_recv, \
2358 netdev_linux_recv_wait, \
2359 netdev_linux_drain, \
2361 netdev_linux_send, \
2362 netdev_linux_send_wait, \
2364 netdev_linux_set_etheraddr, \
2365 netdev_linux_get_etheraddr, \
2366 netdev_linux_get_mtu, \
2367 netdev_linux_get_ifindex, \
2368 netdev_linux_get_carrier, \
2369 netdev_linux_set_miimon_interval, \
2370 netdev_linux_get_stats, \
2373 netdev_linux_get_features, \
2374 netdev_linux_set_advertisements, \
2375 netdev_linux_get_vlan_vid, \
2377 netdev_linux_set_policing, \
2378 netdev_linux_get_qos_types, \
2379 netdev_linux_get_qos_capabilities, \
2380 netdev_linux_get_qos, \
2381 netdev_linux_set_qos, \
2382 netdev_linux_get_queue, \
2383 netdev_linux_set_queue, \
2384 netdev_linux_delete_queue, \
2385 netdev_linux_get_queue_stats, \
2386 netdev_linux_dump_queues, \
2387 netdev_linux_dump_queue_stats, \
2389 netdev_linux_get_in4, \
2390 netdev_linux_set_in4, \
2391 netdev_linux_get_in6, \
2392 netdev_linux_add_router, \
2393 netdev_linux_get_next_hop, \
2394 netdev_linux_get_status, \
2395 netdev_linux_arp_lookup, \
2397 netdev_linux_update_flags, \
2399 netdev_linux_poll_add, \
2400 netdev_linux_poll_remove, \
2401 netdev_linux_change_seq \
/* Class definition 'netdev_linux_class': created with netdev_linux_create(),
 * enumerated with netdev_linux_enumerate(), and without a set_stats callback.
 * NOTE(review): the macro invocation line and the class name argument are not
 * visible in this listing. */
2404 const struct netdev_class netdev_linux_class =
2407 netdev_linux_create,
2408 netdev_linux_enumerate,
2409 NULL); /* set_stats */
/* Class definition 'netdev_tap_class': created with
 * netdev_linux_create_tap(), with no enumerate or set_stats callback.
 * NOTE(review): the macro invocation line and the class name argument are not
 * visible in this listing. */
2411 const struct netdev_class netdev_tap_class =
2414 netdev_linux_create_tap,
2415 NULL, /* enumerate */
2416 NULL); /* set_stats */
/* Class definition 'netdev_internal_class': created with
 * netdev_linux_create(), with no enumerate callback, and using
 * netdev_vport_set_stats() as its set_stats callback.
 * NOTE(review): the macro invocation line and the class name argument are not
 * visible in this listing. */
2418 const struct netdev_class netdev_internal_class =
2421 netdev_linux_create,
2422 NULL, /* enumerate */
2423 netdev_vport_set_stats);
2425 /* HTB traffic control class. */
/* Upper bound on HTB class minor numbers that htb_parse_tcmsg__() accepts as
 * queues: minors 1..HTB_N_QUEUES map to queue ids 0..HTB_N_QUEUES-1. */
2427 #define HTB_N_QUEUES 0xf000
/* NOTE(review): this appears to be the 'max_rate' member of 'struct htb' (it
 * is assigned in htb_install__()); the surrounding struct lines are not
 * visible in this listing. */
2431 unsigned int max_rate; /* In bytes/s. */
/* Members describing one HTB class (queue); htb_setup_class__() and
 * htb_parse_tca_options__() access them through 'struct htb_class'.
 * NOTE(review): the struct's opening and closing lines are not visible in this
 * listing. */
2435 struct tc_queue tc_queue;
2436 unsigned int min_rate; /* In bytes/s. */
2437 unsigned int max_rate; /* In bytes/s. */
2438 unsigned int burst; /* In bytes. */
2439 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the 'struct htb' that embeds the traffic-control state
 * ('netdev_dev->tc') of 'netdev', via CONTAINER_OF on its 'tc' member.
 * NOTE(review): presumably only valid while HTB is the installed qdisc --
 * confirm against callers. */
2443 htb_get__(const struct netdev *netdev)
2445 struct netdev_dev_linux *netdev_dev =
2446 netdev_dev_linux_cast(netdev_get_dev(netdev));
2447 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
/* Allocates a 'struct htb', initializes its 'tc' member with 'tc_ops_htb',
 * records 'max_rate', and makes it the traffic-control state of 'netdev'.
 *
 * NOTE(review): 'max_rate' is a uint64_t here while the visible 'max_rate'
 * member is declared 'unsigned int'; confirm that larger values cannot
 * occur. */
2451 htb_install__(struct netdev *netdev, uint64_t max_rate)
2453 struct netdev_dev_linux *netdev_dev =
2454 netdev_dev_linux_cast(netdev_get_dev(netdev));
2457 htb = xmalloc(sizeof *htb);
2458 tc_init(&htb->tc, &tc_ops_htb);
2459 htb->max_rate = max_rate;
2461 netdev_dev->tc = &htb->tc;
2464 /* Create an HTB qdisc.
2466 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
/* Any existing qdisc is deleted first.  The RTM_NEWQDISC request uses
 * NLM_F_EXCL | NLM_F_CREATE, handle 1:0 and parent TC_H_ROOT, and carries a
 * 'struct tc_htb_glob' with 'rate2quantum' set to 10 as TCA_HTB_INIT inside
 * TCA_OPTIONS.  The result of tc_transact() is returned. */
2468 htb_setup_qdisc__(struct netdev *netdev)
2471 struct tc_htb_glob opt;
2472 struct ofpbuf request;
2473 struct tcmsg *tcmsg;
2475 tc_del_qdisc(netdev);
2477 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2478 NLM_F_EXCL | NLM_F_CREATE, &request);
2482 tcmsg->tcm_handle = tc_make_handle(1, 0);
2483 tcmsg->tcm_parent = TC_H_ROOT;
2485 nl_msg_put_string(&request, TCA_KIND, "htb");
2487 memset(&opt, 0, sizeof opt);
2488 opt.rate2quantum = 10;
2492 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2493 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2494 nl_msg_end_nested(&request, opt_offset);
2496 return tc_transact(&request, NULL);
2499 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2500 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
/* The device MTU feeds the rate tables and buffer sizes; a device reporting
 * an MTU of INT_MAX is rejected with a warning.  The RTM_NEWTCLASS request
 * (NLM_F_CREATE) carries a 'struct tc_htb_opt' as TCA_HTB_PARMS plus the rate
 * and ceil tables, nested in TCA_OPTIONS.  A failed transaction is logged
 * with the class parameters. */
2502 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2503 unsigned int parent, struct htb_class *class)
2506 struct tc_htb_opt opt;
2507 struct ofpbuf request;
2508 struct tcmsg *tcmsg;
2512 netdev_get_mtu(netdev, &mtu);
2513 if (mtu == INT_MAX) {
2514 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2515 netdev_get_name(netdev));
2519 memset(&opt, 0, sizeof opt);
2520 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2521 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
/* The same 'class->burst' sizes both the rate buffer and the ceil buffer. */
2522 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2523 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2524 opt.prio = class->priority;
2526 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2530 tcmsg->tcm_handle = handle;
2531 tcmsg->tcm_parent = parent;
2533 nl_msg_put_string(&request, TCA_KIND, "htb");
2534 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2535 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2536 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2537 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2538 nl_msg_end_nested(&request, opt_offset);
2540 error = tc_transact(&request, NULL);
2542 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2543 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2544 netdev_get_name(netdev),
2545 tc_get_major(handle), tc_get_minor(handle),
2546 tc_get_major(parent), tc_get_minor(parent),
2547 class->min_rate, class->max_rate,
2548 class->burst, class->priority, strerror(error));
2553 /* Parses Netlink attributes in 'nl_options' for HTB parameters and stores a
2554 * description of them into 'class'. The description complies with the
2555 * specification given in the vswitch database documentation for linux-htb
 * queue details. */
/* TCA_HTB_PARMS is mandatory and must be at least as large as
 * 'struct tc_htb_opt'; a parse failure is logged.  From it, 'min_rate' is
 * taken from the rate, 'max_rate' from the ceil, 'burst' is derived from the
 * rate and buffer with tc_ticks_to_bytes(), and 'priority' from 'prio'. */
2558 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2560 static const struct nl_policy tca_htb_policy[] = {
2561 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2562 .min_len = sizeof(struct tc_htb_opt) },
2565 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2566 const struct tc_htb_opt *htb;
2568 if (!nl_parse_nested(nl_options, tca_htb_policy,
2569 attrs, ARRAY_SIZE(tca_htb_policy))) {
2570 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2574 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2575 class->min_rate = htb->rate.rate;
2576 class->max_rate = htb->ceil.rate;
2577 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2578 class->priority = htb->prio;
2583 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2584 struct htb_class *options,
2585 struct netdev_queue_stats *stats)
2587 struct nlattr *nl_options;
2588 unsigned int handle;
2591 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2592 if (!error && queue_id) {
2593 unsigned int major = tc_get_major(handle);
2594 unsigned int minor = tc_get_minor(handle);
2595 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2596 *queue_id = minor - 1;
2601 if (!error && options) {
2602 error = htb_parse_tca_options__(nl_options, options);
2608 htb_parse_qdisc_details__(struct netdev *netdev,
2609 const struct shash *details, struct htb_class *hc)
2611 const char *max_rate_s;
2613 max_rate_s = shash_find_data(details, "max-rate");
2614 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2615 if (!hc->max_rate) {
2618 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2619 hc->max_rate = netdev_features_to_bps(current) / 8;
2621 hc->min_rate = hc->max_rate;
2627 htb_parse_class_details__(struct netdev *netdev,
2628 const struct shash *details, struct htb_class *hc)
2630 const struct htb *htb = htb_get__(netdev);
2631 const char *min_rate_s = shash_find_data(details, "min-rate");
2632 const char *max_rate_s = shash_find_data(details, "max-rate");
2633 const char *burst_s = shash_find_data(details, "burst");
2634 const char *priority_s = shash_find_data(details, "priority");
2637 netdev_get_mtu(netdev, &mtu);
2638 if (mtu == INT_MAX) {
2639 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2640 netdev_get_name(netdev));
2644 /* HTB requires at least an mtu sized min-rate to send any traffic even
2645 * on uncongested links. */
2646 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2647 hc->min_rate = MAX(hc->min_rate, mtu);
2648 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2651 hc->max_rate = (max_rate_s
2652 ? strtoull(max_rate_s, NULL, 10) / 8
2654 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2655 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2659 * According to hints in the documentation that I've read, it is important
2660 * that 'burst' be at least as big as the largest frame that might be
2661 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2662 * but having it a bit too small is a problem. Since netdev_get_mtu()
2663 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2664 * the MTU. We actually add 64, instead of 14, as a guard against
2665 * additional headers get tacked on somewhere that we're not aware of. */
2666 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2667 hc->burst = MAX(hc->burst, mtu + 64);
2670 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
2676 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2677 unsigned int parent, struct htb_class *options,
2678 struct netdev_queue_stats *stats)
2680 struct ofpbuf *reply;
2683 error = tc_query_class(netdev, handle, parent, &reply);
2685 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2686 ofpbuf_delete(reply);
2692 htb_tc_install(struct netdev *netdev, const struct shash *details)
2696 error = htb_setup_qdisc__(netdev);
2698 struct htb_class hc;
2700 htb_parse_qdisc_details__(netdev, details, &hc);
2701 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2702 tc_make_handle(1, 0), &hc);
2704 htb_install__(netdev, hc.max_rate);
2710 static struct htb_class *
2711 htb_class_cast__(const struct tc_queue *queue)
2713 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2717 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2718 const struct htb_class *hc)
2720 struct htb *htb = htb_get__(netdev);
2721 size_t hash = hash_int(queue_id, 0);
2722 struct tc_queue *queue;
2723 struct htb_class *hcp;
2725 queue = tc_find_queue__(netdev, queue_id, hash);
2727 hcp = htb_class_cast__(queue);
2729 hcp = xmalloc(sizeof *hcp);
2730 queue = &hcp->tc_queue;
2731 queue->queue_id = queue_id;
2732 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2735 hcp->min_rate = hc->min_rate;
2736 hcp->max_rate = hc->max_rate;
2737 hcp->burst = hc->burst;
2738 hcp->priority = hc->priority;
2742 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2745 struct nl_dump dump;
2746 struct htb_class hc;
2748 /* Get qdisc options. */
2750 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2751 htb_install__(netdev, hc.max_rate);
2754 if (!start_queue_dump(netdev, &dump)) {
2757 while (nl_dump_next(&dump, &msg)) {
2758 unsigned int queue_id;
2760 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2761 htb_update_queue__(netdev, queue_id, &hc);
2764 nl_dump_done(&dump);
2770 htb_tc_destroy(struct tc *tc)
2772 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2773 struct htb_class *hc, *next;
2775 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2776 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2784 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2786 const struct htb *htb = htb_get__(netdev);
2787 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
2792 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2794 struct htb_class hc;
2797 htb_parse_qdisc_details__(netdev, details, &hc);
2798 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2799 tc_make_handle(1, 0), &hc);
2801 htb_get__(netdev)->max_rate = hc.max_rate;
2807 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2808 const struct tc_queue *queue, struct shash *details)
2810 const struct htb_class *hc = htb_class_cast__(queue);
2812 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2813 if (hc->min_rate != hc->max_rate) {
2814 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2816 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2818 shash_add(details, "priority", xasprintf("%u", hc->priority));
2824 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2825 const struct shash *details)
2827 struct htb_class hc;
2830 error = htb_parse_class_details__(netdev, details, &hc);
2835 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2836 tc_make_handle(1, 0xfffe), &hc);
2841 htb_update_queue__(netdev, queue_id, &hc);
2846 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2848 struct htb_class *hc = htb_class_cast__(queue);
2849 struct htb *htb = htb_get__(netdev);
2852 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2854 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2861 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2862 struct netdev_queue_stats *stats)
2864 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2865 tc_make_handle(1, 0xfffe), NULL, stats);
2869 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2870 const struct ofpbuf *nlmsg,
2871 netdev_dump_queue_stats_cb *cb, void *aux)
2873 struct netdev_queue_stats stats;
2874 unsigned int handle, major, minor;
2877 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2882 major = tc_get_major(handle);
2883 minor = tc_get_minor(handle);
2884 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2885 (*cb)(minor - 1, &stats, aux);
2890 static const struct tc_ops tc_ops_htb = {
2891 "htb", /* linux_name */
2892 "linux-htb", /* ovs_name */
2893 HTB_N_QUEUES, /* n_queues */
2902 htb_class_get_stats,
2903 htb_class_dump_stats
2906 /* "linux-hfsc" traffic control class. */
2908 #define HFSC_N_QUEUES 0xf000
2916 struct tc_queue tc_queue;
2921 static struct hfsc *
2922 hfsc_get__(const struct netdev *netdev)
2924 struct netdev_dev_linux *netdev_dev;
2925 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2926 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
2929 static struct hfsc_class *
2930 hfsc_class_cast__(const struct tc_queue *queue)
2932 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
2936 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2938 struct netdev_dev_linux * netdev_dev;
2941 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2942 hfsc = xmalloc(sizeof *hfsc);
2943 tc_init(&hfsc->tc, &tc_ops_hfsc);
2944 hfsc->max_rate = max_rate;
2945 netdev_dev->tc = &hfsc->tc;
2949 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2950 const struct hfsc_class *hc)
2954 struct hfsc_class *hcp;
2955 struct tc_queue *queue;
2957 hfsc = hfsc_get__(netdev);
2958 hash = hash_int(queue_id, 0);
2960 queue = tc_find_queue__(netdev, queue_id, hash);
2962 hcp = hfsc_class_cast__(queue);
2964 hcp = xmalloc(sizeof *hcp);
2965 queue = &hcp->tc_queue;
2966 queue->queue_id = queue_id;
2967 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
2970 hcp->min_rate = hc->min_rate;
2971 hcp->max_rate = hc->max_rate;
2975 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2977 const struct tc_service_curve *rsc, *fsc, *usc;
2978 static const struct nl_policy tca_hfsc_policy[] = {
2980 .type = NL_A_UNSPEC,
2982 .min_len = sizeof(struct tc_service_curve),
2985 .type = NL_A_UNSPEC,
2987 .min_len = sizeof(struct tc_service_curve),
2990 .type = NL_A_UNSPEC,
2992 .min_len = sizeof(struct tc_service_curve),
2995 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
2997 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
2998 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
2999 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3003 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3004 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3005 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3007 if (rsc->m1 != 0 || rsc->d != 0 ||
3008 fsc->m1 != 0 || fsc->d != 0 ||
3009 usc->m1 != 0 || usc->d != 0) {
3010 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3011 "Non-linear service curves are not supported.");
3015 if (rsc->m2 != fsc->m2) {
3016 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3017 "Real-time service curves are not supported ");
3021 if (rsc->m2 > usc->m2) {
3022 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3023 "Min-rate service curve is greater than "
3024 "the max-rate service curve.");
3028 class->min_rate = fsc->m2;
3029 class->max_rate = usc->m2;
3034 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3035 struct hfsc_class *options,
3036 struct netdev_queue_stats *stats)
3039 unsigned int handle;
3040 struct nlattr *nl_options;
3042 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3048 unsigned int major, minor;
3050 major = tc_get_major(handle);
3051 minor = tc_get_minor(handle);
3052 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3053 *queue_id = minor - 1;
3060 error = hfsc_parse_tca_options__(nl_options, options);
/* Asks the kernel about class 'handle' (child of 'parent') on 'netdev' and
 * parses the reply into 'options' and 'stats', either of which is handed
 * unchanged to hfsc_parse_tcmsg__().
 *
 * Returns 0 on success, otherwise the error from the query or the parser. */
static int
hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error = tc_query_class(netdev, handle, parent, &reply);

    if (!error) {
        error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
        ofpbuf_delete(reply);
    }
    return error;
}
3085 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
3086 struct hfsc_class *class)
3089 const char *max_rate_s;
3091 max_rate_s = shash_find_data(details, "max-rate");
3092 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3097 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
3098 max_rate = netdev_features_to_bps(current) / 8;
3101 class->min_rate = max_rate;
3102 class->max_rate = max_rate;
3106 hfsc_parse_class_details__(struct netdev *netdev,
3107 const struct shash *details,
3108 struct hfsc_class * class)
3110 const struct hfsc *hfsc;
3111 uint32_t min_rate, max_rate;
3112 const char *min_rate_s, *max_rate_s;
3114 hfsc = hfsc_get__(netdev);
3115 min_rate_s = shash_find_data(details, "min-rate");
3116 max_rate_s = shash_find_data(details, "max-rate");
3118 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3119 min_rate = MAX(min_rate, 1);
3120 min_rate = MIN(min_rate, hfsc->max_rate);
3122 max_rate = (max_rate_s
3123 ? strtoull(max_rate_s, NULL, 10) / 8
3125 max_rate = MAX(max_rate, min_rate);
3126 max_rate = MIN(max_rate, hfsc->max_rate);
3128 class->min_rate = min_rate;
3129 class->max_rate = max_rate;
3134 /* Create an HFSC qdisc.
3136 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3138 hfsc_setup_qdisc__(struct netdev * netdev)
3140 struct tcmsg *tcmsg;
3141 struct ofpbuf request;
3142 struct tc_hfsc_qopt opt;
3144 tc_del_qdisc(netdev);
3146 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3147 NLM_F_EXCL | NLM_F_CREATE, &request);
3153 tcmsg->tcm_handle = tc_make_handle(1, 0);
3154 tcmsg->tcm_parent = TC_H_ROOT;
3156 memset(&opt, 0, sizeof opt);
3159 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3160 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3162 return tc_transact(&request, NULL);
3165 /* Create an HFSC class.
3167 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3168 * sc rate <min_rate> ul rate <max_rate>" */
3170 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3171 unsigned int parent, struct hfsc_class *class)
3175 struct tcmsg *tcmsg;
3176 struct ofpbuf request;
3177 struct tc_service_curve min, max;
3179 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3185 tcmsg->tcm_handle = handle;
3186 tcmsg->tcm_parent = parent;
3190 min.m2 = class->min_rate;
3194 max.m2 = class->max_rate;
3196 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3197 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3198 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3199 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3200 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3201 nl_msg_end_nested(&request, opt_offset);
3203 error = tc_transact(&request, NULL);
3205 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3206 "min-rate %ubps, max-rate %ubps (%s)",
3207 netdev_get_name(netdev),
3208 tc_get_major(handle), tc_get_minor(handle),
3209 tc_get_major(parent), tc_get_minor(parent),
3210 class->min_rate, class->max_rate, strerror(error));
3217 hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3220 struct hfsc_class class;
3222 error = hfsc_setup_qdisc__(netdev);
3228 hfsc_parse_qdisc_details__(netdev, details, &class);
3229 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3230 tc_make_handle(1, 0), &class);
3236 hfsc_install__(netdev, class.max_rate);
3241 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3244 struct nl_dump dump;
3245 struct hfsc_class hc;
3248 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3249 hfsc_install__(netdev, hc.max_rate);
3251 if (!start_queue_dump(netdev, &dump)) {
3255 while (nl_dump_next(&dump, &msg)) {
3256 unsigned int queue_id;
3258 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3259 hfsc_update_queue__(netdev, queue_id, &hc);
3263 nl_dump_done(&dump);
3268 hfsc_tc_destroy(struct tc *tc)
3271 struct hfsc_class *hc, *next;
3273 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3275 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3276 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3285 hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3287 const struct hfsc *hfsc;
3288 hfsc = hfsc_get__(netdev);
3289 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
3294 hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3297 struct hfsc_class class;
3299 hfsc_parse_qdisc_details__(netdev, details, &class);
3300 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3301 tc_make_handle(1, 0), &class);
3304 hfsc_get__(netdev)->max_rate = class.max_rate;
3311 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3312 const struct tc_queue *queue, struct shash *details)
3314 const struct hfsc_class *hc;
3316 hc = hfsc_class_cast__(queue);
3317 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3318 if (hc->min_rate != hc->max_rate) {
3319 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
3325 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3326 const struct shash *details)
3329 struct hfsc_class class;
3331 error = hfsc_parse_class_details__(netdev, details, &class);
3336 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3337 tc_make_handle(1, 0xfffe), &class);
3342 hfsc_update_queue__(netdev, queue_id, &class);
3347 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3351 struct hfsc_class *hc;
3353 hc = hfsc_class_cast__(queue);
3354 hfsc = hfsc_get__(netdev);
3356 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3358 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3365 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3366 struct netdev_queue_stats *stats)
3368 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3369 tc_make_handle(1, 0xfffe), NULL, stats);
3373 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3374 const struct ofpbuf *nlmsg,
3375 netdev_dump_queue_stats_cb *cb, void *aux)
3377 struct netdev_queue_stats stats;
3378 unsigned int handle, major, minor;
3381 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3386 major = tc_get_major(handle);
3387 minor = tc_get_minor(handle);
3388 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3389 (*cb)(minor - 1, &stats, aux);
3394 static const struct tc_ops tc_ops_hfsc = {
3395 "hfsc", /* linux_name */
3396 "linux-hfsc", /* ovs_name */
3397 HFSC_N_QUEUES, /* n_queues */
3398 hfsc_tc_install, /* tc_install */
3399 hfsc_tc_load, /* tc_load */
3400 hfsc_tc_destroy, /* tc_destroy */
3401 hfsc_qdisc_get, /* qdisc_get */
3402 hfsc_qdisc_set, /* qdisc_set */
3403 hfsc_class_get, /* class_get */
3404 hfsc_class_set, /* class_set */
3405 hfsc_class_delete, /* class_delete */
3406 hfsc_class_get_stats, /* class_get_stats */
3407 hfsc_class_dump_stats /* class_dump_stats */
3410 /* "linux-default" traffic control class.
3412 * This class represents the default, unnamed Linux qdisc. It corresponds to
3413 * the "" (empty string) QoS type in the OVS database. */
3416 default_install__(struct netdev *netdev)
3418 struct netdev_dev_linux *netdev_dev =
3419 netdev_dev_linux_cast(netdev_get_dev(netdev));
3420 static struct tc *tc;
3423 tc = xmalloc(sizeof *tc);
3424 tc_init(tc, &tc_ops_default);
3426 netdev_dev->tc = tc;
3430 default_tc_install(struct netdev *netdev,
3431 const struct shash *details OVS_UNUSED)
3433 default_install__(netdev);
3438 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3440 default_install__(netdev);
3444 static const struct tc_ops tc_ops_default = {
3445 NULL, /* linux_name */
3450 NULL, /* tc_destroy */
3451 NULL, /* qdisc_get */
3452 NULL, /* qdisc_set */
3453 NULL, /* class_get */
3454 NULL, /* class_set */
3455 NULL, /* class_delete */
3456 NULL, /* class_get_stats */
3457 NULL /* class_dump_stats */
3460 /* "linux-other" traffic control class.
3465 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3467 struct netdev_dev_linux *netdev_dev =
3468 netdev_dev_linux_cast(netdev_get_dev(netdev));
3469 static struct tc *tc;
3472 tc = xmalloc(sizeof *tc);
3473 tc_init(tc, &tc_ops_other);
3475 netdev_dev->tc = tc;
3479 static const struct tc_ops tc_ops_other = {
3480 NULL, /* linux_name */
3481 "linux-other", /* ovs_name */
3483 NULL, /* tc_install */
3485 NULL, /* tc_destroy */
3486 NULL, /* qdisc_get */
3487 NULL, /* qdisc_set */
3488 NULL, /* class_get */
3489 NULL, /* class_set */
3490 NULL, /* class_delete */
3491 NULL, /* class_get_stats */
3492 NULL /* class_dump_stats */
3495 /* Traffic control. */
3497 /* Number of kernel "tc" ticks per second. */
3498 static double ticks_per_s;
/* Number of kernel "jiffies" per second.  This is used for the purpose of
 * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
 * one jiffy's worth of data.
 *
 * There are two possibilities here:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
 *      approximate range of 100 to 1024.  That means that we really need to
 *      make sure that the qdisc can buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
 *      has finely granular timers and there's no need to fudge additional room
 *      for buffers.  (There's no extra effort needed to implement that: the
 *      large 'buffer_hz' is used as a divisor, so practically any number will
 *      come out as 0 in the division.  Small integer results in the case of
 *      really high dividends won't have any real effect anyhow.)
 */
3517 static unsigned int buffer_hz;
/* Builds the tc handle 'major':'minor': 'major' goes in the upper 16 bits and
 * 'minor' in the lower 16 bits. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int major_bits = major << 16;

    return TC_H_MAKE(major_bits, minor);
}
/* Extracts the major number (upper 16 bits) from tc handle 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
/* Extracts the minor number (lower 16 bits) from tc handle 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor_bits = TC_H_MIN(handle);

    return minor_bits;
}
3540 static struct tcmsg *
3541 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3542 struct ofpbuf *request)
3544 struct tcmsg *tcmsg;
3548 error = get_ifindex(netdev, &ifindex);
3553 ofpbuf_init(request, 512);
3554 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3555 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3556 tcmsg->tcm_family = AF_UNSPEC;
3557 tcmsg->tcm_ifindex = ifindex;
3558 /* Caller should fill in tcmsg->tcm_handle. */
3559 /* Caller should fill in tcmsg->tcm_parent. */
3565 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3567 int error = nl_sock_transact(rtnl_sock, request, replyp);
3568 ofpbuf_uninit(request);
3575 /* The values in psched are not individually very meaningful, but they are
3576 * important. The tables below show some values seen in the wild.
3580 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3581 * (Before that, there are hints that it was 1000000000.)
3583 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3587 * -----------------------------------
3588 * [1] 000c8000 000f4240 000f4240 00000064
3589 * [2] 000003e8 00000400 000f4240 3b9aca00
3590 * [3] 000003e8 00000400 000f4240 3b9aca00
3591 * [4] 000003e8 00000400 000f4240 00000064
3592 * [5] 000003e8 00000040 000f4240 3b9aca00
3593 * [6] 000003e8 00000040 000f4240 000000f9
3595 * a b c d ticks_per_s buffer_hz
3596 * ------- --------- ---------- ------------- ----------- -------------
3597 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3598 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3599 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3600 * [4] 1,000 1,024 1,000,000 100 976,562 100
3601 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3602 * [6] 1,000 64 1,000,000 249 15,625,000 249
3604 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3605 * [2] 2.6.26-1-686-bigmem from Debian lenny
3606 * [3] 2.6.26-2-sparc64 from Debian lenny
3607 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3608 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3609 * [6] 2.6.34 from kernel.org on KVM
3611 static const char fn[] = "/proc/net/psched";
3612 unsigned int a, b, c, d;
3618 stream = fopen(fn, "r");
3620 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3624 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3625 VLOG_WARN("%s: read failed", fn);
3629 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3633 VLOG_WARN("%s: invalid scheduler parameters", fn);
3637 ticks_per_s = (double) a * c / b;
3641 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3644 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3647 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3648 * rate of 'rate' bytes per second. */
3650 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3655 return (rate * ticks) / ticks_per_s;
3658 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3659 * rate of 'rate' bytes per second. */
3661 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3666 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3669 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3670 * a transmission rate of 'rate' bytes per second. */
3672 tc_buffer_per_jiffy(unsigned int rate)
3677 return rate / buffer_hz;
3680 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3681 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3682 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3683 * stores NULL into it if it is absent.
3685 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3688 * Returns 0 if successful, otherwise a positive errno value. */
3690 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3691 struct nlattr **options)
3693 static const struct nl_policy tca_policy[] = {
3694 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3695 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3697 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3699 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3700 tca_policy, ta, ARRAY_SIZE(ta))) {
3701 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3706 *kind = nl_attr_get_string(ta[TCA_KIND]);
3710 *options = ta[TCA_OPTIONS];
3725 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3726 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3727 * into '*options', and its queue statistics into '*stats'. Any of the output
3728 * arguments may be null.
3730 * Returns 0 if successful, otherwise a positive errno value. */
3732 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3733 struct nlattr **options, struct netdev_queue_stats *stats)
3735 static const struct nl_policy tca_policy[] = {
3736 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3737 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3739 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3741 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3742 tca_policy, ta, ARRAY_SIZE(ta))) {
3743 VLOG_WARN_RL(&rl, "failed to parse class message");
3748 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3749 *handlep = tc->tcm_handle;
3753 *options = ta[TCA_OPTIONS];
3757 const struct gnet_stats_queue *gsq;
3758 struct gnet_stats_basic gsb;
3760 static const struct nl_policy stats_policy[] = {
3761 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3762 .min_len = sizeof gsb },
3763 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3764 .min_len = sizeof *gsq },
3766 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3768 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3769 sa, ARRAY_SIZE(sa))) {
3770 VLOG_WARN_RL(&rl, "failed to parse class stats");
3774 /* Alignment issues screw up the length of struct gnet_stats_basic on
3775 * some arch/bitsize combinations. Newer versions of Linux have a
3776 * struct gnet_stats_basic_packed, but we can't depend on that. The
3777 * easiest thing to do is just to make a copy. */
3778 memset(&gsb, 0, sizeof gsb);
3779 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3780 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3781 stats->tx_bytes = gsb.bytes;
3782 stats->tx_packets = gsb.packets;
3784 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3785 stats->tx_errors = gsq->drops;
3795 memset(stats, 0, sizeof *stats);
3800 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3803 tc_query_class(const struct netdev *netdev,
3804 unsigned int handle, unsigned int parent,
3805 struct ofpbuf **replyp)
3807 struct ofpbuf request;
3808 struct tcmsg *tcmsg;
3811 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3815 tcmsg->tcm_handle = handle;
3816 tcmsg->tcm_parent = parent;
3818 error = tc_transact(&request, replyp);
3820 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3821 netdev_get_name(netdev),
3822 tc_get_major(handle), tc_get_minor(handle),
3823 tc_get_major(parent), tc_get_minor(parent),
3829 /* Equivalent to "tc class del dev <name> handle <handle>". */
3831 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3833 struct ofpbuf request;
3834 struct tcmsg *tcmsg;
3837 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3841 tcmsg->tcm_handle = handle;
3842 tcmsg->tcm_parent = 0;
3844 error = tc_transact(&request, NULL);
3846 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3847 netdev_get_name(netdev),
3848 tc_get_major(handle), tc_get_minor(handle),
3854 /* Equivalent to "tc qdisc del dev <name> root". */
3856 tc_del_qdisc(struct netdev *netdev)
3858 struct netdev_dev_linux *netdev_dev =
3859 netdev_dev_linux_cast(netdev_get_dev(netdev));
3860 struct ofpbuf request;
3861 struct tcmsg *tcmsg;
3864 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
3868 tcmsg->tcm_handle = tc_make_handle(1, 0);
3869 tcmsg->tcm_parent = TC_H_ROOT;
3871 error = tc_transact(&request, NULL);
3872 if (error == EINVAL) {
3873 /* EINVAL probably means that the default qdisc was in use, in which
3874 * case we've accomplished our purpose. */
3877 if (!error && netdev_dev->tc) {
3878 if (netdev_dev->tc->ops->tc_destroy) {
3879 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3881 netdev_dev->tc = NULL;
3886 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3887 * kernel to determine what they are. Returns 0 if successful, otherwise a
3888 * positive errno value.
 *
 * Whether the qdisc is known is tracked through netdev_dev->tc: it is tested
 * before any request is built, and after the selected ops' tc_load() runs it
 * is asserted to be non-null exactly when the load succeeded. */
3890 tc_query_qdisc(const struct netdev *netdev)
3892 struct netdev_dev_linux *netdev_dev =
3893 netdev_dev_linux_cast(netdev_get_dev(netdev));
3894 struct ofpbuf request, *qdisc;
3895 const struct tc_ops *ops;
3896 struct tcmsg *tcmsg;
/* A non-null 'tc' means the qdisc has already been determined. */
3900 if (netdev_dev->tc) {
3904 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3905 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3906 * 2.6.35 without that fix backported to it.
3908 * To avoid the OOPS, we must not make a request that would attempt to dump
3909 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3910 * few others. There are a few ways that I can see to do this, but most of
3911 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3912 * technique chosen here is to assume that any non-default qdisc that we
3913 * create will have a class with handle 1:0. The built-in qdiscs only have
3914 * a class with handle 0:0.
3916 * We could check for Linux 2.6.35+ and use a more straightforward method
3918 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
3922 tcmsg->tcm_handle = tc_make_handle(1, 0);
3923 tcmsg->tcm_parent = 0;
3925 /* Figure out what tc class to instantiate. */
3926 error = tc_transact(&request, &qdisc);
/* Pull the qdisc's kind string out of the reply; the options argument is
 * passed as NULL because only the kind is needed here. */
3930 error = tc_parse_qdisc(qdisc, &kind, NULL);
3932 ops = &tc_ops_other;
/* Map the kernel's name for the qdisc to an ops table.  A name that is not
 * recognized is logged (rate-limited) and handled by tc_ops_other. */
3934 ops = tc_lookup_linux_name(kind);
3936 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3937 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3939 ops = &tc_ops_other;
3942 } else if (error == ENOENT) {
3943 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3944 * other entity that doesn't have a handle 1:0. We will assume
3945 * that it's the system default qdisc. */
3946 ops = &tc_ops_default;
3949 /* Who knows? Maybe the device got deleted. */
3950 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3951 netdev_get_name(netdev), strerror(error));
3952 ops = &tc_ops_other;
3955 /* Instantiate it. */
3956 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
/* Contract check on tc_load(): it installs netdev_dev->tc if and only if it
 * returns 0. */
3957 assert((load_error == 0) == (netdev_dev->tc != NULL));
3958 ofpbuf_delete(qdisc);
/* An error from the query itself takes precedence over one from tc_load(). */
3960 return error ? error : load_error;
3963 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3964 approximate the time to transmit packets of various lengths. For an MTU of
3965 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3966 represents two possible packet lengths; for an MTU of 513 through 1024, four
3967 possible lengths; and so on.
3969 Returns, for the specified 'mtu', the number of bits that packet lengths
3970 need to be shifted right to fit within such a 256-entry table. */
3972 tc_calc_cell_log(unsigned int mtu)
/* NOTE(review): substitute MTU; presumably applied only when the caller did
 * not supply one -- confirm against the guarding condition. */
3977 mtu = ETH_PAYLOAD_MAX;
/* Size the table for whole frames: add the Ethernet and VLAN header lengths
 * to the payload MTU. */
3979 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
/* 'cell_log' counts the iterations performed while 'mtu' is still 256 or
 * more. */
3981 for (cell_log = 0; mtu >= 256; cell_log++) {
3988 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
3991 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
3993 memset(rate, 0, sizeof *rate);
3994 rate->cell_log = tc_calc_cell_log(mtu);
3995 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3996 /* rate->cell_align = 0; */ /* distro headers. */
/* 'mpu' is the floor that tc_put_rtab() applies to the packet size of each
 * table entry.  ETH_TOTAL_MIN is defined elsewhere; presumably the minimum
 * Ethernet frame length. */
3997 rate->mpu = ETH_TOTAL_MIN;
4001 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4002 * attribute of the specified "type".
4004 * See tc_calc_cell_log() above for a description of "rtab"s. */
4006 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
/* Reserve TC_RTAB_SIZE bytes of attribute payload in 'msg'.  The loop below
 * assigns every one of its TC_RTAB_SIZE / sizeof *rtab elements. */
4011 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4012 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
/* Packet size charged for entry 'i': (i + 1) scaled by the cell size
 * (1 << cell_log), but never less than 'rate->mpu'. */
4013 unsigned packet_size = (i + 1) << rate->cell_log;
4014 if (packet_size < rate->mpu) {
4015 packet_size = rate->mpu;
/* The entry is that size converted to ticks at 'rate->rate' by
 * tc_bytes_to_ticks(). */
4017 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* Implementation note for tc_calc_buffer(): the burst that is converted to
 * ticks is the larger of 'burst_bytes' and tc_buffer_per_jiffy(Bps) + mtu, so
 * a small or zero 'burst_bytes' is raised to that minimum. */
4021 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
4022 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
4023 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
4026 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
4028 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
4029 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
4032 /* Public utility functions. */
/* Copies, member by member, the 21 counters listed below from '*src' to
 * '*dst'.
 *
 * The expansion refers to variables named 'src' and 'dst', which must be
 * pointers in scope at the point of use, to structures that each have all of
 * these members.  The two structures need not have the same type: every
 * member is assigned individually, so the usual arithmetic conversions apply
 * between differing counter widths.
 *
 * The body is wrapped in do { ... } while (0) so that the macro behaves as a
 * single statement.  Without the wrapper, "if (cond) COPY_NETDEV_STATS;"
 * would make only the first assignment conditional.  Use it as a statement,
 * followed by a semicolon. */
#define COPY_NETDEV_STATS                                       \
    do {                                                        \
        dst->rx_packets = src->rx_packets;                      \
        dst->tx_packets = src->tx_packets;                      \
        dst->rx_bytes = src->rx_bytes;                          \
        dst->tx_bytes = src->tx_bytes;                          \
        dst->rx_errors = src->rx_errors;                        \
        dst->tx_errors = src->tx_errors;                        \
        dst->rx_dropped = src->rx_dropped;                      \
        dst->tx_dropped = src->tx_dropped;                      \
        dst->multicast = src->multicast;                        \
        dst->collisions = src->collisions;                      \
        dst->rx_length_errors = src->rx_length_errors;          \
        dst->rx_over_errors = src->rx_over_errors;              \
        dst->rx_crc_errors = src->rx_crc_errors;                \
        dst->rx_frame_errors = src->rx_frame_errors;            \
        dst->rx_fifo_errors = src->rx_fifo_errors;              \
        dst->rx_missed_errors = src->rx_missed_errors;          \
        dst->tx_aborted_errors = src->tx_aborted_errors;        \
        dst->tx_carrier_errors = src->tx_carrier_errors;        \
        dst->tx_fifo_errors = src->tx_fifo_errors;              \
        dst->tx_heartbeat_errors = src->tx_heartbeat_errors;    \
        dst->tx_window_errors = src->tx_window_errors;          \
    } while (0)
4057 /* Copies 'src' into 'dst', performing format conversion in the process.
 *
 * 'src' is in the struct rtnl_link_stats layout and is not modified; 'dst' is
 * this library's struct netdev_stats. */
4059 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4060 const struct rtnl_link_stats *src)
4065 /* Copies 'src' into 'dst', performing format conversion in the process.
 *
 * 'src' is in the struct rtnl_link_stats64 layout and is not modified; 'dst'
 * is this library's struct netdev_stats. */
4067 netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
4068 const struct rtnl_link_stats64 *src)
4073 /* Copies 'src' into 'dst', performing format conversion in the process.
 *
 * This is the reverse direction: 'src' is a struct netdev_stats and 'dst' is
 * in the struct rtnl_link_stats64 layout. */
4075 netdev_stats_to_rtnl_link_stats64(struct rtnl_link_stats64 *dst,
4076 const struct netdev_stats *src)
/* These two members of 'dst' are not taken from 'src'; they are set to 0. */
4079 dst->rx_compressed = 0;
4080 dst->tx_compressed = 0;
4083 /* Utility functions. */
/* Obtains statistics for the network device whose interface index is
 * 'ifindex' by sending an RTM_GETLINK request on 'rtnl_sock' and converting
 * the IFLA_STATS attribute of the reply into '*stats' with
 * netdev_stats_from_rtnl_link_stats().
 *
 * The reply buffer is released with ofpbuf_delete() on the parse-failure
 * path, on the missing-stats path, and after a successful conversion. */
4086 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4088 /* Policy for RTNLGRP_LINK messages.
4090 * There are *many* more fields in these messages, but currently we only
4091 * care about these fields. */
4092 static const struct nl_policy rtnlgrp_link_policy[] = {
4093 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
/* IFLA_STATS may be absent, but if present it must be at least as large as
 * struct rtnl_link_stats, which is what gets read from it below. */
4094 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4095 .min_len = sizeof(struct rtnl_link_stats) },
4098 struct ofpbuf request;
4099 struct ofpbuf *reply;
4100 struct ifinfomsg *ifi;
4101 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
/* Build the request: a netlink header followed by a zeroed struct ifinfomsg
 * that selects the device by index. */
4104 ofpbuf_init(&request, 0);
4105 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4106 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4107 ifi->ifi_family = PF_UNSPEC;
4108 ifi->ifi_index = ifindex;
4109 error = nl_sock_transact(rtnl_sock, &request, &reply);
4110 ofpbuf_uninit(&request);
/* Attributes in the reply start after the netlink header and the struct
 * ifinfomsg. */
4115 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4116 rtnlgrp_link_policy,
4117 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4118 ofpbuf_delete(reply);
/* The policy marks IFLA_STATS optional, so its absence has to be checked
 * separately. */
4122 if (!attrs[IFLA_STATS]) {
4123 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4124 ofpbuf_delete(reply);
4128 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4130 ofpbuf_delete(reply);
/* Obtains statistics for the device named 'netdev_name' by scanning
 * /proc/net/dev line by line and parsing each line's counters into '*stats'.
 *
 * Only 14 counters are read from a line (two further columns are skipped
 * with '%*u'); for the matching device, the counters that the file does not
 * supply are set to UINT64_MAX.
 *
 * Note that each successfully parsed line is written straight into '*stats'
 * before its device name is compared with 'netdev_name', so '*stats' may
 * hold another device's counters if no line matches. */
4136 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4138 static const char fn[] = "/proc/net/dev";
4143 stream = fopen(fn, "r");
4145 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4150 while (fgets(line, sizeof line, stream)) {
4153 #define X64 "%"SCNu64
4156 X64 X64 X64 X64 X64 X64 X64 "%*u"
4157 X64 X64 X64 X64 X64 X64 X64 "%*u",
4163 &stats->rx_fifo_errors,
4164 &stats->rx_frame_errors,
4170 &stats->tx_fifo_errors,
4172 &stats->tx_carrier_errors) != 15) {
4173 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4174 } else if (!strcmp(devname, netdev_name)) {
/* This is the requested device.  The remaining counters were not among the
 * values parsed from the line; set them to UINT64_MAX. */
4175 stats->rx_length_errors = UINT64_MAX;
4176 stats->rx_over_errors = UINT64_MAX;
4177 stats->rx_crc_errors = UINT64_MAX;
4178 stats->rx_missed_errors = UINT64_MAX;
4179 stats->tx_aborted_errors = UINT64_MAX;
4180 stats->tx_heartbeat_errors = UINT64_MAX;
4181 stats->tx_window_errors = UINT64_MAX;
/* Reached when the scan ends without a line for 'netdev_name'. */
4187 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Queries 'netdev''s interface flags with the SIOCGIFFLAGS ioctl, issued
 * through netdev_linux_do_ioctl(), and stores the 'ifr_flags' value from the
 * request structure in '*flags'. */
4193 get_flags(const struct netdev *netdev, int *flags)
4198 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
4200 *flags = ifr.ifr_flags;
/* Sets 'netdev''s interface flags to 'flags' with the SIOCSIFFLAGS ioctl and
 * returns whatever netdev_linux_do_ioctl() returns. */
4205 set_flags(struct netdev *netdev, int flags)
4209 ifr.ifr_flags = flags;
4210 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
/* Asks the kernel, with the SIOCGIFINDEX ioctl on 'af_inet_sock', for the
 * interface index of the device named 'netdev_name'.  On success returns the
 * index reported in 'ifr_ifindex'.  A failed ioctl is logged as a
 * rate-limited warning.  Every call, cached or not by the caller, bumps the
 * netdev_get_ifindex coverage counter. */
4215 do_get_ifindex(const char *netdev_name)
4219 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4220 COVERAGE_INC(netdev_get_ifindex);
4221 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4222 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4223 netdev_name, strerror(errno));
4226 return ifr.ifr_ifindex;
/* Stores the interface index of 'netdev_' in '*ifindexp'.
 *
 * The index is cached in the underlying netdev_dev_linux: do_get_ifindex()
 * is consulted only while the VALID_IFINDEX bit is clear in 'cache_valid',
 * and its result is then saved in 'ifindex' and the bit set. */
4230 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4232 struct netdev_dev_linux *netdev_dev =
4233 netdev_dev_linux_cast(netdev_get_dev(netdev_));
4235 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4236 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4240 netdev_dev->cache_valid |= VALID_IFINDEX;
4241 netdev_dev->ifindex = ifindex;
4243 *ifindexp = netdev_dev->ifindex;
/* Reads the hardware address of the device named 'netdev_name' with the
 * SIOCGIFHWADDR ioctl on 'af_inet_sock' and copies its first ETH_ADDR_LEN
 * bytes into 'ea'.  A failed ioctl is logged with VLOG_ERR. */
4248 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4253 memset(&ifr, 0, sizeof ifr);
4254 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4255 COVERAGE_INC(netdev_get_hwaddr);
4256 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4257 VLOG_ERR("ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4258 netdev_name, strerror(errno));
/* An address family other than AF_UNSPEC or ARPHRD_ETHER is reported with a
 * warning. */
4261 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4262 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4263 VLOG_WARN("%s device has unknown hardware address family %d",
4264 netdev_name, hwaddr_family);
4266 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware address of the device named 'netdev_name' to the
 * ETH_ADDR_LEN bytes in 'mac', with address family 'hwaddr_family', using
 * the SIOCSIFHWADDR ioctl on 'af_inet_sock'.  A failed ioctl is logged with
 * VLOG_ERR. */
4271 set_etheraddr(const char *netdev_name, int hwaddr_family,
4272 const uint8_t mac[ETH_ADDR_LEN])
/* Zero the whole request first: only the name, the family, and the first
 * ETH_ADDR_LEN bytes of 'sa_data' are filled in below. */
4276 memset(&ifr, 0, sizeof ifr);
4277 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4278 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4279 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4280 COVERAGE_INC(netdev_set_hwaddr);
4281 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4282 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4283 netdev_name, strerror(errno));
/* Issues an ethtool request for the device named 'name' with the
 * SIOCETHTOOL ioctl on 'af_inet_sock', passing 'ecmd' to the kernel through
 * 'ifr_data'.  'cmd_name' is a human-readable name for the command, used
 * only in the log message.
 *
 * A failure other than EOPNOTSUPP is logged as a rate-limited warning. */
4290 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4291 int cmd, const char *cmd_name)
4295 memset(&ifr, 0, sizeof ifr);
4296 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4297 ifr.ifr_data = (caddr_t) ecmd;
4300 COVERAGE_INC(netdev_ethtool);
4301 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4304 if (errno != EOPNOTSUPP) {
4305 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4306 "failed: %s", cmd_name, name, strerror(errno));
4308 /* The device doesn't support this operation. That's pretty
4309 * common, so there's no point in logging anything. */
/* Performs ioctl 'cmd' on 'af_inet_sock' for the device named 'name'.
 *
 * Only 'ifr->ifr_name' is filled in here; any other member that 'cmd' needs
 * must already have been set by the caller, and results are left in '*ifr'
 * for the caller to read.  'cmd_name' is used only in the debug-level,
 * rate-limited log message emitted when the ioctl fails. */
4316 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4317 const char *cmd_name)
4319 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4320 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4321 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
/* Runs ioctl 'cmd' (called 'cmd_name' in log messages) on 'netdev' through
 * netdev_linux_do_ioctl(), with the request's address family preset to
 * AF_INET, and copies the IPv4 address found in 'ifr_addr' to '*ip'. */
4329 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4330 int cmd, const char *cmd_name)
4335 ifr.ifr_addr.sa_family = AF_INET;
4336 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
/* The address is read back by viewing 'ifr_addr' as a struct sockaddr_in. */
4338 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4339 *ip = sin->sin_addr;
4344 /* Returns an AF_PACKET raw socket or a negative errno value. */
4346 af_packet_sock(void)
4348 static int sock = INT_MIN;
4350 if (sock == INT_MIN) {
4351 sock = socket(AF_PACKET, SOCK_RAW, 0);
4353 set_nonblocking(sock);
4356 VLOG_ERR("failed to create packet socket: %s", strerror(errno));