2 * Copyright (c) 2009, 2010, 2011 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
24 #include <arpa/inet.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_sched.h>
33 #include <linux/rtnetlink.h>
34 #include <linux/sockios.h>
35 #include <linux/version.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <netpacket/packet.h>
40 #include <net/ethernet.h>
42 #include <linux/if_tunnel.h>
43 #include <net/if_arp.h>
44 #include <net/if_packet.h>
45 #include <net/route.h>
46 #include <netinet/in.h>
53 #include "dpif-linux.h"
54 #include "dynamic-string.h"
55 #include "fatal-signal.h"
58 #include "netdev-provider.h"
59 #include "netdev-vport.h"
61 #include "netlink-socket.h"
63 #include "openflow/openflow.h"
65 #include "poll-loop.h"
66 #include "rtnetlink.h"
67 #include "rtnetlink-link.h"
68 #include "socket-util.h"
73 VLOG_DEFINE_THIS_MODULE(netdev_linux);
75 COVERAGE_DEFINE(netdev_get_vlan_vid);
76 COVERAGE_DEFINE(netdev_set_policing);
77 COVERAGE_DEFINE(netdev_arp_lookup);
78 COVERAGE_DEFINE(netdev_get_ifindex);
79 COVERAGE_DEFINE(netdev_get_hwaddr);
80 COVERAGE_DEFINE(netdev_set_hwaddr);
81 COVERAGE_DEFINE(netdev_ethtool);
83 /* These were introduced in Linux 2.6.14, so they might be missing if we have
85 #ifndef ADVERTISED_Pause
86 #define ADVERTISED_Pause (1 << 13)
88 #ifndef ADVERTISED_Asym_Pause
89 #define ADVERTISED_Asym_Pause (1 << 14)
92 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
95 #define TC_RTAB_SIZE 1024
98 static struct rtnetlink_notifier netdev_linux_cache_notifier;
99 static int cache_notifier_refcount;
102 VALID_IFINDEX = 1 << 0,
103 VALID_ETHERADDR = 1 << 1,
107 VALID_CARRIER = 1 << 5,
108 VALID_IS_PSEUDO = 1 << 6, /* Represents is_internal and is_tap. */
109 VALID_POLICING = 1 << 7,
110 VALID_HAVE_VPORT_STATS = 1 << 8
118 /* Traffic control. */
120 /* An instance of a traffic control class. Always associated with a particular
123 * Each TC implementation subclasses this with whatever additional data it
126 const struct tc_ops *ops;
127 struct hmap queues; /* Contains "struct tc_queue"s.
128 * Read by generic TC layer.
129 * Written only by TC implementation. */
132 /* One traffic control queue.
134 * Each TC implementation subclasses this with whatever additional data it
137 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
138 unsigned int queue_id; /* OpenFlow queue ID. */
141 /* A particular kind of traffic control. Each implementation generally maps to
142 * one particular Linux qdisc class.
144 * The functions below return 0 if successful or a positive errno value on
145 * failure, except where otherwise noted. All of them must be provided, except
146 * where otherwise noted. */
148 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
149 * This is null for tc_ops_default and tc_ops_other, for which there are no
150 * appropriate values. */
151 const char *linux_name;
153 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
154 const char *ovs_name;
156 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
157 * queues. The queues are numbered 0 through n_queues - 1. */
158 unsigned int n_queues;
160 /* Called to install this TC class on 'netdev'. The implementation should
161 * make the Netlink calls required to set up 'netdev' with the right qdisc
162 * and configure it according to 'details'. The implementation may assume
163 * that the current qdisc is the default; that is, there is no need for it
164 * to delete the current qdisc before installing itself.
166 * The contents of 'details' should be documented as valid for 'ovs_name'
167 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
168 * (which is built as ovs-vswitchd.conf.db(8)).
170 * This function must return 0 if and only if it sets 'netdev->tc' to an
171 * initialized 'struct tc'.
173 * (This function is null for tc_ops_other, which cannot be installed. For
174 * other TC classes it should always be nonnull.) */
175 int (*tc_install)(struct netdev *netdev, const struct shash *details);
177 /* Called when the netdev code determines (through a Netlink query) that
178 * this TC class's qdisc is installed on 'netdev', but we didn't install
179 * it ourselves and so don't know any of the details.
181 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
182 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
183 * implementation should parse the other attributes of 'nlmsg' as
184 * necessary to determine its configuration. If necessary it should also
185 * use Netlink queries to determine the configuration of queues on
188 * This function must return 0 if and only if it sets 'netdev->tc' to an
189 * initialized 'struct tc'. */
190 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
192 /* Destroys the data structures allocated by the implementation as part of
193 * 'tc'. (This includes destroying 'tc->queues' by calling
196 * The implementation should not need to perform any Netlink calls. If
197 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
198 * (But it may not be desirable.)
200 * This function may be null if 'tc' is trivial. */
201 void (*tc_destroy)(struct tc *tc);
203 /* Retrieves details of 'netdev->tc' configuration into 'details'.
205 * The implementation should not need to perform any Netlink calls, because
206 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
207 * cached the configuration.
209 * The contents of 'details' should be documented as valid for 'ovs_name'
210 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
211 * (which is built as ovs-vswitchd.conf.db(8)).
213 * This function may be null if 'tc' is not configurable.
215 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
217 /* Reconfigures 'netdev->tc' according to 'details', performing any
218 * required Netlink calls to complete the reconfiguration.
220 * The contents of 'details' should be documented as valid for 'ovs_name'
221 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
222 * (which is built as ovs-vswitchd.conf.db(8)).
224 * This function may be null if 'tc' is not configurable.
226 int (*qdisc_set)(struct netdev *, const struct shash *details);
228 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
229 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
231 * The contents of 'details' should be documented as valid for 'ovs_name'
232 * in the "other_config" column in the "Queue" table in
233 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
235 * The implementation should not need to perform any Netlink calls, because
236 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
237 * cached the queue configuration.
239 * This function may be null if 'tc' does not have queues ('n_queues' is
241 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
242 struct shash *details);
244 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
245 * 'details', performing any required Netlink calls to complete the
246 * reconfiguration. The caller ensures that 'queue_id' is less than
249 * The contents of 'details' should be documented as valid for 'ovs_name'
250 * in the "other_config" column in the "Queue" table in
251 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
253 * This function may be null if 'tc' does not have queues or its queues are
254 * not configurable. */
255 int (*class_set)(struct netdev *, unsigned int queue_id,
256 const struct shash *details);
258 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
259 * tc_queue's within 'netdev->tc->queues'.
261 * This function may be null if 'tc' does not have queues or its queues
262 * cannot be deleted. */
263 int (*class_delete)(struct netdev *, struct tc_queue *queue);
265 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
266 * 'struct tc_queue's within 'netdev->tc->queues'.
268 * On success, initializes '*stats'.
270 * This function may be null if 'tc' does not have queues or if it cannot
271 * report queue statistics. */
272 int (*class_get_stats)(const struct netdev *netdev,
273 const struct tc_queue *queue,
274 struct netdev_queue_stats *stats);
276 /* Extracts queue stats from 'nlmsg', which is a response to a
277 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
279 * This function may be null if 'tc' does not have queues or if it cannot
280 * report queue statistics. */
281 int (*class_dump_stats)(const struct netdev *netdev,
282 const struct ofpbuf *nlmsg,
283 netdev_dump_queue_stats_cb *cb, void *aux);
287 tc_init(struct tc *tc, const struct tc_ops *ops)
290 hmap_init(&tc->queues);
294 tc_destroy(struct tc *tc)
296 hmap_destroy(&tc->queues);
299 static const struct tc_ops tc_ops_htb;
300 static const struct tc_ops tc_ops_hfsc;
301 static const struct tc_ops tc_ops_default;
302 static const struct tc_ops tc_ops_other;
304 static const struct tc_ops *tcs[] = {
305 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
306 &tc_ops_hfsc, /* Hierarchical fair service curve. */
307 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
308 &tc_ops_other, /* Some other qdisc. */
312 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
313 static unsigned int tc_get_major(unsigned int handle);
314 static unsigned int tc_get_minor(unsigned int handle);
316 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
317 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
318 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
320 static struct tcmsg *tc_make_request(const struct netdev *, int type,
321 unsigned int flags, struct ofpbuf *);
322 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
324 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
325 struct nlattr **options);
326 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
327 struct nlattr **options,
328 struct netdev_queue_stats *);
329 static int tc_query_class(const struct netdev *,
330 unsigned int handle, unsigned int parent,
331 struct ofpbuf **replyp);
332 static int tc_delete_class(const struct netdev *, unsigned int handle);
334 static int tc_del_qdisc(struct netdev *netdev);
335 static int tc_query_qdisc(const struct netdev *netdev);
337 static int tc_calc_cell_log(unsigned int mtu);
338 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
339 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
340 const struct tc_ratespec *rate);
341 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
343 struct netdev_dev_linux {
344 struct netdev_dev netdev_dev;
346 struct shash_node *shash_node;
347 unsigned int cache_valid;
349 /* The following are figured out "on demand" only. They are only valid
350 * when the corresponding VALID_* bit in 'cache_valid' is set. */
352 uint8_t etheraddr[ETH_ADDR_LEN];
353 struct in_addr address, netmask;
357 bool is_internal; /* Is this an openvswitch internal device? */
358 bool is_tap; /* Is this a tuntap device? */
359 uint32_t kbits_rate; /* Policing data. */
360 uint32_t kbits_burst;
361 bool have_vport_stats;
365 struct tap_state tap;
369 struct netdev_linux {
370 struct netdev netdev;
374 /* Sockets used for ioctl operations. */
375 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
376 static int af_packet_sock = -1; /* AF_PACKET, SOCK_RAW. */
378 /* A Netlink routing socket that is not subscribed to any multicast groups. */
379 static struct nl_sock *rtnl_sock;
381 struct netdev_linux_notifier {
382 struct netdev_notifier notifier;
386 static struct shash netdev_linux_notifiers =
387 SHASH_INITIALIZER(&netdev_linux_notifiers);
388 static struct rtnetlink_notifier netdev_linux_poll_notifier;
390 /* This is set pretty low because we probably won't learn anything from the
391 * additional log messages. */
392 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
394 static int netdev_linux_init(void);
396 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
397 int cmd, const char *cmd_name);
398 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
399 const char *cmd_name);
400 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
401 int cmd, const char *cmd_name);
402 static int get_flags(const struct netdev *, int *flagsp);
403 static int set_flags(struct netdev *, int flags);
404 static int do_get_ifindex(const char *netdev_name);
405 static int get_ifindex(const struct netdev *, int *ifindexp);
406 static int do_set_addr(struct netdev *netdev,
407 int ioctl_nr, const char *ioctl_name,
408 struct in_addr addr);
409 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
410 static int set_etheraddr(const char *netdev_name, int hwaddr_family,
411 const uint8_t[ETH_ADDR_LEN]);
412 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
413 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
416 is_netdev_linux_class(const struct netdev_class *netdev_class)
418 return netdev_class->init == netdev_linux_init;
421 static struct netdev_dev_linux *
422 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
424 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
425 assert(is_netdev_linux_class(netdev_class));
427 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
430 static struct netdev_linux *
431 netdev_linux_cast(const struct netdev *netdev)
433 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
434 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
435 assert(is_netdev_linux_class(netdev_class));
437 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
441 netdev_linux_init(void)
443 static int status = -1;
445 /* Create AF_INET socket. */
446 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
447 status = af_inet_sock >= 0 ? 0 : errno;
449 VLOG_ERR("failed to create inet socket: %s", strerror(status));
451 /* Create AF_PACKET socket. */
452 af_packet_sock = socket(AF_PACKET, SOCK_RAW, 0);
453 status = af_packet_sock >= 0 ? 0 : errno;
455 VLOG_ERR("failed to create packet socket: %s",
458 set_nonblocking(af_packet_sock);
461 /* Create rtnetlink socket. */
463 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
465 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
474 netdev_linux_run(void)
476 rtnetlink_link_notifier_run();
480 netdev_linux_wait(void)
482 rtnetlink_link_notifier_wait();
486 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
487 void *aux OVS_UNUSED)
489 struct netdev_dev_linux *dev;
491 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
493 const struct netdev_class *netdev_class =
494 netdev_dev_get_class(base_dev);
496 if (is_netdev_linux_class(netdev_class)) {
497 dev = netdev_dev_linux_cast(base_dev);
498 dev->cache_valid = 0;
502 struct shash device_shash;
503 struct shash_node *node;
505 shash_init(&device_shash);
506 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
507 SHASH_FOR_EACH (node, &device_shash) {
509 dev->cache_valid = 0;
511 shash_destroy(&device_shash);
515 /* Creates system and internal devices. */
517 netdev_linux_create(const struct netdev_class *class,
518 const char *name, const struct shash *args,
519 struct netdev_dev **netdev_devp)
521 struct netdev_dev_linux *netdev_dev;
524 if (!shash_is_empty(args)) {
525 VLOG_WARN("%s: arguments for %s devices should be empty",
529 if (!cache_notifier_refcount) {
530 error = rtnetlink_link_notifier_register(&netdev_linux_cache_notifier,
531 netdev_linux_cache_cb, NULL);
536 cache_notifier_refcount++;
538 netdev_dev = xzalloc(sizeof *netdev_dev);
539 netdev_dev_init(&netdev_dev->netdev_dev, name, args, class);
541 *netdev_devp = &netdev_dev->netdev_dev;
545 /* For most types of netdevs we open the device for each call of
546 * netdev_open(). However, this is not the case with tap devices,
547 * since it is only possible to open the device once. In this
548 * situation we share a single file descriptor, and consequently
549 * buffers, across all readers. Therefore once data is read it will
550 * be unavailable to other reads for tap devices. */
552 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
553 const char *name, const struct shash *args,
554 struct netdev_dev **netdev_devp)
556 struct netdev_dev_linux *netdev_dev;
557 struct tap_state *state;
558 static const char tap_dev[] = "/dev/net/tun";
562 if (!shash_is_empty(args)) {
563 VLOG_WARN("%s: arguments for TAP devices should be empty", name);
566 netdev_dev = xzalloc(sizeof *netdev_dev);
567 state = &netdev_dev->state.tap;
569 /* Open tap device. */
570 state->fd = open(tap_dev, O_RDWR);
573 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
577 /* Create tap device. */
578 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
579 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
580 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
581 VLOG_WARN("%s: creating tap device failed: %s", name,
587 /* Make non-blocking. */
588 error = set_nonblocking(state->fd);
593 netdev_dev_init(&netdev_dev->netdev_dev, name, args, &netdev_tap_class);
594 *netdev_devp = &netdev_dev->netdev_dev;
603 destroy_tap(struct netdev_dev_linux *netdev_dev)
605 struct tap_state *state = &netdev_dev->state.tap;
607 if (state->fd >= 0) {
612 /* Destroys the netdev device 'netdev_dev_'. */
614 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
616 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
617 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
619 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
620 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
623 if (class == &netdev_linux_class || class == &netdev_internal_class) {
624 cache_notifier_refcount--;
626 if (!cache_notifier_refcount) {
627 rtnetlink_link_notifier_unregister(&netdev_linux_cache_notifier);
629 } else if (class == &netdev_tap_class) {
630 destroy_tap(netdev_dev);
639 netdev_linux_open(struct netdev_dev *netdev_dev_, int ethertype,
640 struct netdev **netdevp)
642 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
643 struct netdev_linux *netdev;
644 enum netdev_flags flags;
647 /* Allocate network device. */
648 netdev = xzalloc(sizeof *netdev);
650 netdev_init(&netdev->netdev, netdev_dev_);
652 /* Verify that the device really exists, by attempting to read its flags.
653 * (The flags might be cached, in which case this won't actually do an
656 * Don't do this for "internal" netdevs, though, because those have to be
657 * created as netdev objects before they exist in the kernel, because
658 * creating them in the kernel happens by passing a netdev object to
659 * dpif_port_add(). */
660 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
661 error = netdev_get_flags(&netdev->netdev, &flags);
662 if (error == ENODEV) {
667 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
668 !netdev_dev->state.tap.opened) {
670 /* We assume that the first user of the tap device is the primary user
671 * and give them the tap FD. Subsequent users probably just expect
672 * this to be a system device so open it normally to avoid send/receive
673 * directions appearing to be reversed. */
674 netdev->fd = netdev_dev->state.tap.fd;
675 netdev_dev->state.tap.opened = true;
676 } else if (ethertype != NETDEV_ETH_TYPE_NONE) {
677 struct sockaddr_ll sll;
681 /* Create file descriptor. */
682 protocol = (ethertype == NETDEV_ETH_TYPE_ANY ? ETH_P_ALL
683 : ethertype == NETDEV_ETH_TYPE_802_2 ? ETH_P_802_2
685 netdev->fd = socket(PF_PACKET, SOCK_RAW, htons(protocol));
686 if (netdev->fd < 0) {
691 /* Set non-blocking mode. */
692 error = set_nonblocking(netdev->fd);
697 /* Get ethernet device index. */
698 error = get_ifindex(&netdev->netdev, &ifindex);
703 /* Bind to specific ethernet device. */
704 memset(&sll, 0, sizeof sll);
705 sll.sll_family = AF_PACKET;
706 sll.sll_ifindex = ifindex;
708 (struct sockaddr *) &sll, sizeof sll) < 0) {
710 VLOG_ERR("bind to %s failed: %s", netdev_dev_get_name(netdev_dev_),
715 /* Between the socket() and bind() calls above, the socket receives all
716 * packets of the requested type on all system interfaces. We do not
717 * want to receive that data, but there is no way to avoid it. So we
718 * must now drain out the receive queue. */
719 error = drain_rcvbuf(netdev->fd);
725 *netdevp = &netdev->netdev;
729 netdev_uninit(&netdev->netdev, true);
733 /* Closes and destroys 'netdev'. */
735 netdev_linux_close(struct netdev *netdev_)
737 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* Descriptor 0 is valid; only a negative 'fd' means "no descriptor", matching
 * the 'fd < 0' tests used by the other functions in this file.  A tap
 * device's descriptor is not closed here. */
739 if (netdev->fd >= 0 && strcmp(netdev_get_type(netdev_), "tap")) {
745 /* Initializes 'sset' with a list of the names of all known network devices. */
747 netdev_linux_enumerate(struct sset *sset)
749 struct if_nameindex *names;
751 names = if_nameindex();
755 for (i = 0; names[i].if_name != NULL; i++) {
756 sset_add(sset, names[i].if_name);
758 if_freenameindex(names);
761 VLOG_WARN("could not obtain list of network device names: %s",
768 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
770 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
772 if (netdev->fd < 0) {
773 /* Device was opened with NETDEV_ETH_TYPE_NONE. */
778 ssize_t retval = read(netdev->fd, data, size);
781 } else if (errno != EINTR) {
782 if (errno != EAGAIN) {
/* Arguments follow the format string order: device name first, then the
 * error text (same order as the warning in netdev_linux_send()). */
783 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
784 netdev_get_name(netdev_), strerror(errno));
791 /* Registers with the poll loop to wake up from the next call to poll_block()
792 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
794 netdev_linux_recv_wait(struct netdev *netdev_)
796 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
797 if (netdev->fd >= 0) {
798 poll_fd_wait(netdev->fd, POLLIN);
802 /* Discards all packets waiting to be received from 'netdev'. */
804 netdev_linux_drain(struct netdev *netdev_)
806 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
807 if (netdev->fd < 0) {
809 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
811 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
812 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
816 drain_fd(netdev->fd, ifr.ifr_qlen);
819 return drain_rcvbuf(netdev->fd);
823 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
824 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
825 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
826 * the packet is too big or too small to transmit on the device.
828 * The caller retains ownership of 'buffer' in all cases.
830 * The kernel maintains a packet transmission queue, so the caller is not
831 * expected to do additional queuing of packets. */
833 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
835 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
839 if (netdev->fd < 0) {
840 /* Use our AF_PACKET socket to send to this device. */
841 struct sockaddr_ll sll;
847 error = get_ifindex(netdev_, &ifindex);
852 /* We don't bother setting most fields in sockaddr_ll because the
853 * kernel ignores them for SOCK_RAW. */
854 memset(&sll, 0, sizeof sll);
855 sll.sll_family = AF_PACKET;
856 sll.sll_ifindex = ifindex;
858 iov.iov_base = (void *) data;
862 msg.msg_namelen = sizeof sll;
865 msg.msg_control = NULL;
866 msg.msg_controllen = 0;
869 retval = sendmsg(af_packet_sock, &msg, 0);
871 /* Use the netdev's own fd to send to this device. This is
872 * essential for tap devices, because packets sent to a tap device
873 * with an AF_PACKET socket will loop back to be *received* again
874 * on the tap device. */
875 retval = write(netdev->fd, data, size);
879 /* The Linux AF_PACKET implementation never blocks waiting for room
880 * for packets, instead returning ENOBUFS. Translate this into
881 * EAGAIN for the caller. */
882 if (errno == ENOBUFS) {
884 } else if (errno == EINTR) {
886 } else if (errno != EAGAIN) {
887 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
888 netdev_get_name(netdev_), strerror(errno));
891 } else if (retval != size) {
892 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
893 "%zu) on %s", retval, size, netdev_get_name(netdev_));
901 /* Registers with the poll loop to wake up from the next call to poll_block()
902 * when the packet transmission queue has sufficient room to transmit a packet
903 * with netdev_send().
905 * The kernel maintains a packet transmission queue, so the client is not
906 * expected to do additional queuing of packets. Thus, this function is
907 * unlikely to ever be used. It is included for completeness. */
909 netdev_linux_send_wait(struct netdev *netdev_)
911 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
912 if (netdev->fd < 0) {
914 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
915 poll_fd_wait(netdev->fd, POLLOUT);
917 /* TAP device always accepts packets.*/
918 poll_immediate_wake();
922 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
923 * otherwise a positive errno value. */
925 netdev_linux_set_etheraddr(struct netdev *netdev_,
926 const uint8_t mac[ETH_ADDR_LEN])
928 struct netdev_dev_linux *netdev_dev =
929 netdev_dev_linux_cast(netdev_get_dev(netdev_));
932 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
933 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
934 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
936 netdev_dev->cache_valid |= VALID_ETHERADDR;
937 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
945 /* Copies 'netdev''s MAC address into 'mac', first refreshing the cached
946 * address if it is not currently marked valid. */
948 netdev_linux_get_etheraddr(const struct netdev *netdev_,
949 uint8_t mac[ETH_ADDR_LEN])
951 struct netdev_dev_linux *netdev_dev =
952 netdev_dev_linux_cast(netdev_get_dev(netdev_));
953 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
954 int error = get_etheraddr(netdev_get_name(netdev_),
955 netdev_dev->etheraddr);
959 netdev_dev->cache_valid |= VALID_ETHERADDR;
961 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
965 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
966 * in bytes, not including the hardware header; thus, this is typically 1500
967 * bytes for Ethernet devices. */
969 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
971 struct netdev_dev_linux *netdev_dev =
972 netdev_dev_linux_cast(netdev_get_dev(netdev_));
973 if (!(netdev_dev->cache_valid & VALID_MTU)) {
977 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
978 SIOCGIFMTU, "SIOCGIFMTU");
982 netdev_dev->mtu = ifr.ifr_mtu;
983 netdev_dev->cache_valid |= VALID_MTU;
985 *mtup = netdev_dev->mtu;
989 /* Returns the ifindex of 'netdev', if successful, as a positive number.
990 * On failure, returns a negative errno value. */
992 netdev_linux_get_ifindex(const struct netdev *netdev)
996 error = get_ifindex(netdev, &ifindex);
997 return error ? -error : ifindex;
1001 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1003 struct netdev_dev_linux *netdev_dev =
1004 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1009 if (!(netdev_dev->cache_valid & VALID_CARRIER)) {
1013 fn = xasprintf("/sys/class/net/%s/carrier",
1014 netdev_get_name(netdev_));
1015 fd = open(fn, O_RDONLY);
1018 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
1022 retval = read(fd, line, sizeof line);
1025 if (error == EINVAL) {
1026 /* This is the normal return value when we try to check carrier
1027 * if the network device is not up. */
1029 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
1032 } else if (retval == 0) {
1034 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
1038 if (line[0] != '0' && line[0] != '1') {
1040 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)",
1044 netdev_dev->carrier = line[0] != '0';
1045 netdev_dev->cache_valid |= VALID_CARRIER;
1047 *carrier = netdev_dev->carrier;
1059 netdev_linux_do_miimon(const struct netdev *netdev, int cmd,
1060 const char *cmd_name, struct mii_ioctl_data *data)
1065 memset(&ifr, 0, sizeof ifr);
1066 memcpy(&ifr.ifr_data, data, sizeof *data);
1067 error = netdev_linux_do_ioctl(netdev_get_name(netdev),
1068 &ifr, cmd, cmd_name);
1069 memcpy(data, &ifr.ifr_data, sizeof *data);
1075 netdev_linux_get_miimon(const struct netdev *netdev, bool *miimon)
1077 const char *name = netdev_get_name(netdev);
1078 struct mii_ioctl_data data;
1083 memset(&data, 0, sizeof data);
1084 error = netdev_linux_do_miimon(netdev, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1086 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1087 data.reg_num = MII_BMSR;
1088 error = netdev_linux_do_miimon(netdev, SIOCGMIIREG, "SIOCGMIIREG",
1092 *miimon = !!(data.val_out & BMSR_LSTATUS);
1094 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1097 struct ethtool_cmd ecmd;
1099 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1102 memset(&ecmd, 0, sizeof ecmd);
1103 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1106 struct ethtool_value eval;
1108 memcpy(&eval, &ecmd, sizeof eval);
1109 *miimon = !!eval.data;
1111 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1118 /* Check whether we can use RTM_GETLINK to get network device statistics.
1119 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1122 check_for_working_netlink_stats(void)
1124 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1125 * preferable, so if that works, we'll use it. */
1126 int ifindex = do_get_ifindex("lo");
1128 VLOG_WARN("failed to get ifindex for lo, "
1129 "obtaining netdev stats from proc");
1132 struct netdev_stats stats;
1133 int error = get_stats_via_netlink(ifindex, &stats);
1135 VLOG_DBG("obtaining netdev stats via rtnetlink");
1138 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1139 "via proc (you are probably running a pre-2.6.19 "
1140 "kernel)", strerror(error));
1146 /* Brings the 'is_internal' and 'is_tap' members of 'netdev_dev' up-to-date. */
1148 netdev_linux_update_is_pseudo(struct netdev_dev_linux *netdev_dev)
1150 if (!(netdev_dev->cache_valid & VALID_IS_PSEUDO)) {
1151 const char *name = netdev_dev_get_name(&netdev_dev->netdev_dev);
1152 const char *type = netdev_dev_get_type(&netdev_dev->netdev_dev);
1154 netdev_dev->is_tap = !strcmp(type, "tap");
1155 netdev_dev->is_internal = (!netdev_dev->is_tap
1156 && dpif_linux_is_internal_device(name));
1157 netdev_dev->cache_valid |= VALID_IS_PSEUDO;
1162 swap_uint64(uint64_t *a, uint64_t *b)
1169 /* Retrieves current device stats for 'netdev'. */
1171 netdev_linux_get_stats(const struct netdev *netdev_,
1172 struct netdev_stats *stats)
1174 struct netdev_dev_linux *netdev_dev =
1175 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1176 static int use_netlink_stats = -1;
1179 if (netdev_dev->have_vport_stats ||
1180 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1182 error = netdev_vport_get_stats(netdev_, stats);
1183 netdev_dev->have_vport_stats = !error;
1184 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
1187 if (!netdev_dev->have_vport_stats) {
1188 if (use_netlink_stats < 0) {
1189 use_netlink_stats = check_for_working_netlink_stats();
1191 if (use_netlink_stats) {
1194 error = get_ifindex(netdev_, &ifindex);
1196 error = get_stats_via_netlink(ifindex, stats);
1199 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1203 /* If this port is an internal port then the transmit and receive stats
1204 * will appear to be swapped relative to the other ports since we are the
1205 * one sending the data, not a remote computer. For consistency, we swap
1206 * them back here. This does not apply if we are getting stats from the
1207 * vport layer because it always tracks stats from the perspective of the
1209 netdev_linux_update_is_pseudo(netdev_dev);
1210 if (!error && !netdev_dev->have_vport_stats &&
1211 (netdev_dev->is_internal || netdev_dev->is_tap)) {
1212 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1213 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1214 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1215 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1216 stats->rx_length_errors = 0;
1217 stats->rx_over_errors = 0;
1218 stats->rx_crc_errors = 0;
1219 stats->rx_frame_errors = 0;
1220 stats->rx_fifo_errors = 0;
1221 stats->rx_missed_errors = 0;
1222 stats->tx_aborted_errors = 0;
1223 stats->tx_carrier_errors = 0;
1224 stats->tx_fifo_errors = 0;
1225 stats->tx_heartbeat_errors = 0;
1226 stats->tx_window_errors = 0;
1232 /* Stores the features supported by 'netdev' into each of '*current',
1233 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1234 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
1235 * successful, otherwise a positive errno value. */
1237 netdev_linux_get_features(const struct netdev *netdev,
1238 uint32_t *current, uint32_t *advertised,
1239 uint32_t *supported, uint32_t *peer)
1241 struct ethtool_cmd ecmd;
1244 memset(&ecmd, 0, sizeof ecmd);
1245 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1246 ETHTOOL_GSET, "ETHTOOL_GSET");
1251 /* Supported features. */
1253 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1254 *supported |= OFPPF_10MB_HD;
1256 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1257 *supported |= OFPPF_10MB_FD;
1259 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1260 *supported |= OFPPF_100MB_HD;
1262 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1263 *supported |= OFPPF_100MB_FD;
1265 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1266 *supported |= OFPPF_1GB_HD;
1268 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1269 *supported |= OFPPF_1GB_FD;
1271 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1272 *supported |= OFPPF_10GB_FD;
1274 if (ecmd.supported & SUPPORTED_TP) {
1275 *supported |= OFPPF_COPPER;
1277 if (ecmd.supported & SUPPORTED_FIBRE) {
1278 *supported |= OFPPF_FIBER;
1280 if (ecmd.supported & SUPPORTED_Autoneg) {
1281 *supported |= OFPPF_AUTONEG;
1283 if (ecmd.supported & SUPPORTED_Pause) {
1284 *supported |= OFPPF_PAUSE;
1286 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1287 *supported |= OFPPF_PAUSE_ASYM;
1290 /* Advertised features. */
1292 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1293 *advertised |= OFPPF_10MB_HD;
1295 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1296 *advertised |= OFPPF_10MB_FD;
1298 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1299 *advertised |= OFPPF_100MB_HD;
1301 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1302 *advertised |= OFPPF_100MB_FD;
1304 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1305 *advertised |= OFPPF_1GB_HD;
1307 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1308 *advertised |= OFPPF_1GB_FD;
1310 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1311 *advertised |= OFPPF_10GB_FD;
1313 if (ecmd.advertising & ADVERTISED_TP) {
1314 *advertised |= OFPPF_COPPER;
1316 if (ecmd.advertising & ADVERTISED_FIBRE) {
1317 *advertised |= OFPPF_FIBER;
1319 if (ecmd.advertising & ADVERTISED_Autoneg) {
1320 *advertised |= OFPPF_AUTONEG;
1322 if (ecmd.advertising & ADVERTISED_Pause) {
1323 *advertised |= OFPPF_PAUSE;
1325 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1326 *advertised |= OFPPF_PAUSE_ASYM;
1329 /* Current settings. */
1330 if (ecmd.speed == SPEED_10) {
1331 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1332 } else if (ecmd.speed == SPEED_100) {
1333 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1334 } else if (ecmd.speed == SPEED_1000) {
1335 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1336 } else if (ecmd.speed == SPEED_10000) {
1337 *current = OFPPF_10GB_FD;
1342 if (ecmd.port == PORT_TP) {
1343 *current |= OFPPF_COPPER;
1344 } else if (ecmd.port == PORT_FIBRE) {
1345 *current |= OFPPF_FIBER;
1349 *current |= OFPPF_AUTONEG;
1352 /* Peer advertisements. */
1353 *peer = 0; /* XXX */
1358 /* Set the features advertised by 'netdev' to 'advertise'. */
1360 netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1362 struct ethtool_cmd ecmd;
1365 memset(&ecmd, 0, sizeof ecmd);
1366 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1367 ETHTOOL_GSET, "ETHTOOL_GSET");
1372 ecmd.advertising = 0;
1373 if (advertise & OFPPF_10MB_HD) {
1374 ecmd.advertising |= ADVERTISED_10baseT_Half;
1376 if (advertise & OFPPF_10MB_FD) {
1377 ecmd.advertising |= ADVERTISED_10baseT_Full;
1379 if (advertise & OFPPF_100MB_HD) {
1380 ecmd.advertising |= ADVERTISED_100baseT_Half;
1382 if (advertise & OFPPF_100MB_FD) {
1383 ecmd.advertising |= ADVERTISED_100baseT_Full;
1385 if (advertise & OFPPF_1GB_HD) {
1386 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1388 if (advertise & OFPPF_1GB_FD) {
1389 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1391 if (advertise & OFPPF_10GB_FD) {
1392 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1394 if (advertise & OFPPF_COPPER) {
1395 ecmd.advertising |= ADVERTISED_TP;
1397 if (advertise & OFPPF_FIBER) {
1398 ecmd.advertising |= ADVERTISED_FIBRE;
1400 if (advertise & OFPPF_AUTONEG) {
1401 ecmd.advertising |= ADVERTISED_Autoneg;
1403 if (advertise & OFPPF_PAUSE) {
1404 ecmd.advertising |= ADVERTISED_Pause;
1406 if (advertise & OFPPF_PAUSE_ASYM) {
1407 ecmd.advertising |= ADVERTISED_Asym_Pause;
1409 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1410 ETHTOOL_SSET, "ETHTOOL_SSET");
1413 /* If 'netdev_name' is the name of a VLAN network device (e.g. one created with
1414 * vconfig(8)), sets '*vlan_vid' to the VLAN VID associated with that device
1415 * and returns 0. Otherwise returns a errno value (specifically ENOENT if
1416 * 'netdev_name' is the name of a network device that is not a VLAN device) and
1417 * sets '*vlan_vid' to -1. */
1419 netdev_linux_get_vlan_vid(const struct netdev *netdev, int *vlan_vid)
1421 const char *netdev_name = netdev_get_name(netdev);
1422 struct ds line = DS_EMPTY_INITIALIZER;
1423 FILE *stream = NULL;
1427 COVERAGE_INC(netdev_get_vlan_vid);
1428 fn = xasprintf("/proc/net/vlan/%s", netdev_name);
1429 stream = fopen(fn, "r");
1435 if (ds_get_line(&line, stream)) {
1436 if (ferror(stream)) {
1438 VLOG_ERR_RL(&rl, "error reading \"%s\": %s", fn, strerror(errno));
1441 VLOG_ERR_RL(&rl, "unexpected end of file reading \"%s\"", fn);
1446 if (!sscanf(ds_cstr(&line), "%*s VID: %d", vlan_vid)) {
1448 VLOG_ERR_RL(&rl, "parse error reading \"%s\" line 1: \"%s\"",
1449 fn, ds_cstr(&line));
1467 #define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress"
1468 #define POLICE_CONFIG_CMD "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 u32 match ip src 0.0.0.0/0 police rate %dkbit burst %dk mtu 65535 drop flowid :1"
1470 /* Remove ingress policing from 'netdev'. Returns 0 if successful, otherwise a
1471 * positive errno value.
1473 * This function is equivalent to running
1474 * /sbin/tc qdisc del dev %s handle ffff: ingress
1475 * but it is much, much faster.
1478 netdev_linux_remove_policing(struct netdev *netdev)
1480 struct netdev_dev_linux *netdev_dev =
1481 netdev_dev_linux_cast(netdev_get_dev(netdev));
1482 const char *netdev_name = netdev_get_name(netdev);
1484 struct ofpbuf request;
1485 struct tcmsg *tcmsg;
1488 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
1492 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
1493 tcmsg->tcm_parent = TC_H_INGRESS;
1494 nl_msg_put_string(&request, TCA_KIND, "ingress");
1495 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
1497 error = tc_transact(&request, NULL);
1498 if (error && error != ENOENT && error != EINVAL) {
1499 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1500 netdev_name, strerror(error));
1504 netdev_dev->kbits_rate = 0;
1505 netdev_dev->kbits_burst = 0;
1506 netdev_dev->cache_valid |= VALID_POLICING;
1510 /* Attempts to set input rate limiting (policing) policy. */
1512 netdev_linux_set_policing(struct netdev *netdev,
1513 uint32_t kbits_rate, uint32_t kbits_burst)
1515 struct netdev_dev_linux *netdev_dev =
1516 netdev_dev_linux_cast(netdev_get_dev(netdev));
1517 const char *netdev_name = netdev_get_name(netdev);
1520 COVERAGE_INC(netdev_set_policing);
1522 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1523 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1524 : kbits_burst); /* Stick with user-specified value. */
1526 if (netdev_dev->cache_valid & VALID_POLICING
1527 && netdev_dev->kbits_rate == kbits_rate
1528 && netdev_dev->kbits_burst == kbits_burst) {
1529 /* Assume that settings haven't changed since we last set them. */
1533 netdev_linux_remove_policing(netdev);
1535 snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
1536 if (system(command) != 0) {
1537 VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
1541 snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
1542 kbits_rate, kbits_burst);
1543 if (system(command) != 0) {
1544 VLOG_WARN_RL(&rl, "%s: problem configuring policing",
1549 netdev_dev->kbits_rate = kbits_rate;
1550 netdev_dev->kbits_burst = kbits_burst;
1551 netdev_dev->cache_valid |= VALID_POLICING;
1558 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1561 const struct tc_ops **opsp;
1563 for (opsp = tcs; *opsp != NULL; opsp++) {
1564 const struct tc_ops *ops = *opsp;
1565 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1566 sset_add(types, ops->ovs_name);
1572 static const struct tc_ops *
1573 tc_lookup_ovs_name(const char *name)
1575 const struct tc_ops **opsp;
1577 for (opsp = tcs; *opsp != NULL; opsp++) {
1578 const struct tc_ops *ops = *opsp;
1579 if (!strcmp(name, ops->ovs_name)) {
1586 static const struct tc_ops *
1587 tc_lookup_linux_name(const char *name)
1589 const struct tc_ops **opsp;
1591 for (opsp = tcs; *opsp != NULL; opsp++) {
1592 const struct tc_ops *ops = *opsp;
1593 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1600 static struct tc_queue *
1601 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1604 struct netdev_dev_linux *netdev_dev =
1605 netdev_dev_linux_cast(netdev_get_dev(netdev));
1606 struct tc_queue *queue;
1608 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1609 if (queue->queue_id == queue_id) {
/* Returns the queue of 'netdev' with ID 'queue_id', hashing the ID with
 * hash_int(queue_id, 0) to select the bucket to search. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
}
1623 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1625 struct netdev_qos_capabilities *caps)
1627 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1631 caps->n_queues = ops->n_queues;
1636 netdev_linux_get_qos(const struct netdev *netdev,
1637 const char **typep, struct shash *details)
1639 struct netdev_dev_linux *netdev_dev =
1640 netdev_dev_linux_cast(netdev_get_dev(netdev));
1643 error = tc_query_qdisc(netdev);
1648 *typep = netdev_dev->tc->ops->ovs_name;
1649 return (netdev_dev->tc->ops->qdisc_get
1650 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
1655 netdev_linux_set_qos(struct netdev *netdev,
1656 const char *type, const struct shash *details)
1658 struct netdev_dev_linux *netdev_dev =
1659 netdev_dev_linux_cast(netdev_get_dev(netdev));
1660 const struct tc_ops *new_ops;
1663 new_ops = tc_lookup_ovs_name(type);
1664 if (!new_ops || !new_ops->tc_install) {
1668 error = tc_query_qdisc(netdev);
1673 if (new_ops == netdev_dev->tc->ops) {
1674 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1676 /* Delete existing qdisc. */
1677 error = tc_del_qdisc(netdev);
1681 assert(netdev_dev->tc == NULL);
1683 /* Install new qdisc. */
1684 error = new_ops->tc_install(netdev, details);
1685 assert((error == 0) == (netdev_dev->tc != NULL));
1692 netdev_linux_get_queue(const struct netdev *netdev,
1693 unsigned int queue_id, struct shash *details)
1695 struct netdev_dev_linux *netdev_dev =
1696 netdev_dev_linux_cast(netdev_get_dev(netdev));
1699 error = tc_query_qdisc(netdev);
1703 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1705 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
1711 netdev_linux_set_queue(struct netdev *netdev,
1712 unsigned int queue_id, const struct shash *details)
1714 struct netdev_dev_linux *netdev_dev =
1715 netdev_dev_linux_cast(netdev_get_dev(netdev));
1718 error = tc_query_qdisc(netdev);
1721 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1722 || !netdev_dev->tc->ops->class_set) {
1726 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
1730 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1732 struct netdev_dev_linux *netdev_dev =
1733 netdev_dev_linux_cast(netdev_get_dev(netdev));
1736 error = tc_query_qdisc(netdev);
1739 } else if (!netdev_dev->tc->ops->class_delete) {
1742 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1744 ? netdev_dev->tc->ops->class_delete(netdev, queue)
1750 netdev_linux_get_queue_stats(const struct netdev *netdev,
1751 unsigned int queue_id,
1752 struct netdev_queue_stats *stats)
1754 struct netdev_dev_linux *netdev_dev =
1755 netdev_dev_linux_cast(netdev_get_dev(netdev));
1758 error = tc_query_qdisc(netdev);
1761 } else if (!netdev_dev->tc->ops->class_get_stats) {
1764 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1766 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
1772 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1774 struct ofpbuf request;
1775 struct tcmsg *tcmsg;
1777 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1781 tcmsg->tcm_parent = 0;
1782 nl_dump_start(dump, rtnl_sock, &request);
1783 ofpbuf_uninit(&request);
1788 netdev_linux_dump_queues(const struct netdev *netdev,
1789 netdev_dump_queues_cb *cb, void *aux)
1791 struct netdev_dev_linux *netdev_dev =
1792 netdev_dev_linux_cast(netdev_get_dev(netdev));
1793 struct tc_queue *queue;
1794 struct shash details;
1798 error = tc_query_qdisc(netdev);
1801 } else if (!netdev_dev->tc->ops->class_get) {
1806 shash_init(&details);
1807 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
1808 shash_clear(&details);
1810 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
1812 (*cb)(queue->queue_id, &details, aux);
1817 shash_destroy(&details);
1823 netdev_linux_dump_queue_stats(const struct netdev *netdev,
1824 netdev_dump_queue_stats_cb *cb, void *aux)
1826 struct netdev_dev_linux *netdev_dev =
1827 netdev_dev_linux_cast(netdev_get_dev(netdev));
1828 struct nl_dump dump;
1833 error = tc_query_qdisc(netdev);
1836 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1841 if (!start_queue_dump(netdev, &dump)) {
1844 while (nl_dump_next(&dump, &msg)) {
1845 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1851 error = nl_dump_done(&dump);
1852 return error ? error : last_error;
1856 netdev_linux_get_in4(const struct netdev *netdev_,
1857 struct in_addr *address, struct in_addr *netmask)
1859 struct netdev_dev_linux *netdev_dev =
1860 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1862 if (!(netdev_dev->cache_valid & VALID_IN4)) {
1865 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
1866 SIOCGIFADDR, "SIOCGIFADDR");
1871 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
1872 SIOCGIFNETMASK, "SIOCGIFNETMASK");
1877 netdev_dev->cache_valid |= VALID_IN4;
1879 *address = netdev_dev->address;
1880 *netmask = netdev_dev->netmask;
1881 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
1885 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
1886 struct in_addr netmask)
1888 struct netdev_dev_linux *netdev_dev =
1889 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1892 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
1894 netdev_dev->cache_valid |= VALID_IN4;
1895 netdev_dev->address = address;
1896 netdev_dev->netmask = netmask;
1897 if (address.s_addr != INADDR_ANY) {
1898 error = do_set_addr(netdev_, SIOCSIFNETMASK,
1899 "SIOCSIFNETMASK", netmask);
/* Parses 'line', one line in the format of /proc/net/if_inet6: 32 hex digits
 * of IPv6 address, four hex fields that are skipped, and an interface name of
 * at most 16 characters.
 *
 * Stores the address in '*in6' and the name in 'ifname'.  Returns true only
 * if all 16 address bytes and the name were parsed. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *s6 = in6->s6_addr;
    bool ok;

#define X8 "%2"SCNx8
    ok = sscanf(line,
                " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                "%*x %*x %*x %*x %16s\n",
                &s6[0], &s6[1], &s6[2], &s6[3],
                &s6[4], &s6[5], &s6[6], &s6[7],
                &s6[8], &s6[9], &s6[10], &s6[11],
                &s6[12], &s6[13], &s6[14], &s6[15],
                ifname) == 17;
#undef X8   /* Keep the helper macro from leaking into the rest of the file. */
    return ok;
}
1921 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
1922 * 'in6' is non-null) and returns true. Otherwise, returns false. */
1924 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
1926 struct netdev_dev_linux *netdev_dev =
1927 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1928 if (!(netdev_dev->cache_valid & VALID_IN6)) {
1932 netdev_dev->in6 = in6addr_any;
1934 file = fopen("/proc/net/if_inet6", "r");
1936 const char *name = netdev_get_name(netdev_);
1937 while (fgets(line, sizeof line, file)) {
1938 struct in6_addr in6_tmp;
1939 char ifname[16 + 1];
1940 if (parse_if_inet6_line(line, &in6_tmp, ifname)
1941 && !strcmp(name, ifname))
1943 netdev_dev->in6 = in6_tmp;
1949 netdev_dev->cache_valid |= VALID_IN6;
1951 *in6 = netdev_dev->in6;
/* Initializes '*sa' as an AF_INET socket address for 'addr' with port 0.
 * All other bytes of '*sa' are zeroed. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in sin;
    memset(&sin, 0, sizeof sin);
    sin.sin_family = AF_INET;
    sin.sin_addr = addr;
    sin.sin_port = 0;

    /* Build the address in a properly typed local, then copy its bytes, so
     * that '*sa' is never accessed through a 'struct sockaddr_in' pointer. */
    memset(sa, 0, sizeof *sa);
    memcpy(sa, &sin, sizeof sin);
}
1969 do_set_addr(struct netdev *netdev,
1970 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
1973 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
1974 make_in4_sockaddr(&ifr.ifr_addr, addr);
1976 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
1980 /* Adds 'router' as a default IP gateway. */
1982 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
1984 struct in_addr any = { INADDR_ANY };
1988 memset(&rt, 0, sizeof rt);
1989 make_in4_sockaddr(&rt.rt_dst, any);
1990 make_in4_sockaddr(&rt.rt_gateway, router);
1991 make_in4_sockaddr(&rt.rt_genmask, any);
1992 rt.rt_flags = RTF_UP | RTF_GATEWAY;
1993 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
1995 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
2001 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2004 static const char fn[] = "/proc/net/route";
2009 *netdev_name = NULL;
2010 stream = fopen(fn, "r");
2011 if (stream == NULL) {
2012 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2017 while (fgets(line, sizeof line, stream)) {
2020 uint32_t dest, gateway, mask;
2021 int refcnt, metric, mtu;
2022 unsigned int flags, use, window, irtt;
2025 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2027 iface, &dest, &gateway, &flags, &refcnt,
2028 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2030 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2034 if (!(flags & RTF_UP)) {
2035 /* Skip routes that aren't up. */
2039 /* The output of 'dest', 'mask', and 'gateway' were given in
2040 * network byte order, so we don't need need any endian
2041 * conversions here. */
2042 if ((dest & mask) == (host->s_addr & mask)) {
2044 /* The host is directly reachable. */
2045 next_hop->s_addr = 0;
2047 /* To reach the host, we must go through a gateway. */
2048 next_hop->s_addr = gateway;
2050 *netdev_name = xstrdup(iface);
2062 netdev_linux_get_status(const struct netdev *netdev, struct shash *sh)
2064 struct ethtool_drvinfo drvinfo;
2067 memset(&drvinfo, 0, sizeof drvinfo);
2068 error = netdev_linux_do_ethtool(netdev_get_name(netdev),
2069 (struct ethtool_cmd *)&drvinfo,
2071 "ETHTOOL_GDRVINFO");
2073 shash_add(sh, "driver_name", xstrdup(drvinfo.driver));
2074 shash_add(sh, "driver_version", xstrdup(drvinfo.version));
2075 shash_add(sh, "firmware_version", xstrdup(drvinfo.fw_version));
2081 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2082 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2083 * returns 0. Otherwise, it returns a positive errno value; in particular,
2084 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2086 netdev_linux_arp_lookup(const struct netdev *netdev,
2087 uint32_t ip, uint8_t mac[ETH_ADDR_LEN])
2090 struct sockaddr_in sin;
2093 memset(&r, 0, sizeof r);
2094 memset(&sin, 0, sizeof sin);
2095 sin.sin_family = AF_INET;
2096 sin.sin_addr.s_addr = ip;
2098 memcpy(&r.arp_pa, &sin, sizeof sin);
2099 r.arp_ha.sa_family = ARPHRD_ETHER;
2101 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2102 COVERAGE_INC(netdev_arp_lookup);
2103 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2105 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2106 } else if (retval != ENXIO) {
2107 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2108 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
2114 nd_to_iff_flags(enum netdev_flags nd)
2117 if (nd & NETDEV_UP) {
2120 if (nd & NETDEV_PROMISC) {
2127 iff_to_nd_flags(int iff)
2129 enum netdev_flags nd = 0;
2133 if (iff & IFF_PROMISC) {
2134 nd |= NETDEV_PROMISC;
2140 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2141 enum netdev_flags on, enum netdev_flags *old_flagsp)
2143 int old_flags, new_flags;
2146 error = get_flags(netdev, &old_flags);
2148 *old_flagsp = iff_to_nd_flags(old_flags);
2149 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2150 if (new_flags != old_flags) {
2151 error = set_flags(netdev, new_flags);
2158 poll_notify(struct list *list)
2160 struct netdev_linux_notifier *notifier;
2161 LIST_FOR_EACH (notifier, node, list) {
2162 struct netdev_notifier *n = ¬ifier->notifier;
2168 netdev_linux_poll_cb(const struct rtnetlink_link_change *change,
2169 void *aux OVS_UNUSED)
2172 struct list *list = shash_find_data(&netdev_linux_notifiers,
2178 struct shash_node *node;
2179 SHASH_FOR_EACH (node, &netdev_linux_notifiers) {
2180 poll_notify(node->data);
2186 netdev_linux_poll_add(struct netdev *netdev,
2187 void (*cb)(struct netdev_notifier *), void *aux,
2188 struct netdev_notifier **notifierp)
2190 const char *netdev_name = netdev_get_name(netdev);
2191 struct netdev_linux_notifier *notifier;
2194 if (shash_is_empty(&netdev_linux_notifiers)) {
2196 error = rtnetlink_link_notifier_register(&netdev_linux_poll_notifier,
2197 netdev_linux_poll_cb, NULL);
2203 list = shash_find_data(&netdev_linux_notifiers, netdev_name);
2205 list = xmalloc(sizeof *list);
2207 shash_add(&netdev_linux_notifiers, netdev_name, list);
2210 notifier = xmalloc(sizeof *notifier);
2211 netdev_notifier_init(¬ifier->notifier, netdev, cb, aux);
2212 list_push_back(list, ¬ifier->node);
2213 *notifierp = ¬ifier->notifier;
2218 netdev_linux_poll_remove(struct netdev_notifier *notifier_)
2220 struct netdev_linux_notifier *notifier =
2221 CONTAINER_OF(notifier_, struct netdev_linux_notifier, notifier);
2224 /* Remove 'notifier' from its list. */
2225 list = list_remove(¬ifier->node);
2226 if (list_is_empty(list)) {
2227 /* The list is now empty. Remove it from the hash and free it. */
2228 const char *netdev_name = netdev_get_name(notifier->notifier.netdev);
2229 shash_delete(&netdev_linux_notifiers,
2230 shash_find(&netdev_linux_notifiers, netdev_name));
2235 /* If that was the last notifier, unregister. */
2236 if (shash_is_empty(&netdev_linux_notifiers)) {
2237 rtnetlink_link_notifier_unregister(&netdev_linux_poll_notifier);
2241 #define NETDEV_LINUX_CLASS(NAME, CREATE, ENUMERATE, SET_STATS) \
2245 netdev_linux_init, \
2247 netdev_linux_wait, \
2250 netdev_linux_destroy, \
2251 NULL, /* set_config */ \
2253 netdev_linux_open, \
2254 netdev_linux_close, \
2258 netdev_linux_recv, \
2259 netdev_linux_recv_wait, \
2260 netdev_linux_drain, \
2262 netdev_linux_send, \
2263 netdev_linux_send_wait, \
2265 netdev_linux_set_etheraddr, \
2266 netdev_linux_get_etheraddr, \
2267 netdev_linux_get_mtu, \
2268 netdev_linux_get_ifindex, \
2269 netdev_linux_get_carrier, \
2270 netdev_linux_get_miimon, \
2271 netdev_linux_get_stats, \
2274 netdev_linux_get_features, \
2275 netdev_linux_set_advertisements, \
2276 netdev_linux_get_vlan_vid, \
2278 netdev_linux_set_policing, \
2279 netdev_linux_get_qos_types, \
2280 netdev_linux_get_qos_capabilities, \
2281 netdev_linux_get_qos, \
2282 netdev_linux_set_qos, \
2283 netdev_linux_get_queue, \
2284 netdev_linux_set_queue, \
2285 netdev_linux_delete_queue, \
2286 netdev_linux_get_queue_stats, \
2287 netdev_linux_dump_queues, \
2288 netdev_linux_dump_queue_stats, \
2290 netdev_linux_get_in4, \
2291 netdev_linux_set_in4, \
2292 netdev_linux_get_in6, \
2293 netdev_linux_add_router, \
2294 netdev_linux_get_next_hop, \
2295 netdev_linux_get_status, \
2296 netdev_linux_arp_lookup, \
2298 netdev_linux_update_flags, \
2300 netdev_linux_poll_add, \
2301 netdev_linux_poll_remove \
2304 const struct netdev_class netdev_linux_class =
2307 netdev_linux_create,
2308 netdev_linux_enumerate,
2309 NULL); /* set_stats */
2311 const struct netdev_class netdev_tap_class =
2314 netdev_linux_create_tap,
2315 NULL, /* enumerate */
2316 NULL); /* set_stats */
2318 const struct netdev_class netdev_internal_class =
2321 netdev_linux_create,
2322 NULL, /* enumerate */
2323 netdev_vport_set_stats);
2325 /* HTB traffic control class. */
2327 #define HTB_N_QUEUES 0xf000
2331 unsigned int max_rate; /* In bytes/s. */
2335 struct tc_queue tc_queue;
2336 unsigned int min_rate; /* In bytes/s. */
2337 unsigned int max_rate; /* In bytes/s. */
2338 unsigned int burst; /* In bytes. */
2339 unsigned int priority; /* Lower values are higher priorities. */
2343 htb_get__(const struct netdev *netdev)
2345 struct netdev_dev_linux *netdev_dev =
2346 netdev_dev_linux_cast(netdev_get_dev(netdev));
2347 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
2351 htb_install__(struct netdev *netdev, uint64_t max_rate)
2353 struct netdev_dev_linux *netdev_dev =
2354 netdev_dev_linux_cast(netdev_get_dev(netdev));
2357 htb = xmalloc(sizeof *htb);
2358 tc_init(&htb->tc, &tc_ops_htb);
2359 htb->max_rate = max_rate;
2361 netdev_dev->tc = &htb->tc;
2364 /* Create an HTB qdisc.
2366 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2368 htb_setup_qdisc__(struct netdev *netdev)
2371 struct tc_htb_glob opt;
2372 struct ofpbuf request;
2373 struct tcmsg *tcmsg;
2375 tc_del_qdisc(netdev);
2377 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2378 NLM_F_EXCL | NLM_F_CREATE, &request);
2382 tcmsg->tcm_handle = tc_make_handle(1, 0);
2383 tcmsg->tcm_parent = TC_H_ROOT;
2385 nl_msg_put_string(&request, TCA_KIND, "htb");
2387 memset(&opt, 0, sizeof opt);
2388 opt.rate2quantum = 10;
2392 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2393 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2394 nl_msg_end_nested(&request, opt_offset);
2396 return tc_transact(&request, NULL);
2399 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2400 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2402 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2403 unsigned int parent, struct htb_class *class)
2406 struct tc_htb_opt opt;
2407 struct ofpbuf request;
2408 struct tcmsg *tcmsg;
2412 netdev_get_mtu(netdev, &mtu);
2413 if (mtu == INT_MAX) {
2414 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2415 netdev_get_name(netdev));
2419 memset(&opt, 0, sizeof opt);
2420 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2421 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2422 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2423 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2424 opt.prio = class->priority;
2426 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2430 tcmsg->tcm_handle = handle;
2431 tcmsg->tcm_parent = parent;
2433 nl_msg_put_string(&request, TCA_KIND, "htb");
2434 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2435 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2436 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2437 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2438 nl_msg_end_nested(&request, opt_offset);
2440 error = tc_transact(&request, NULL);
2442 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2443 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2444 netdev_get_name(netdev),
2445 tc_get_major(handle), tc_get_minor(handle),
2446 tc_get_major(parent), tc_get_minor(parent),
2447 class->min_rate, class->max_rate,
2448 class->burst, class->priority, strerror(error));
2453 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2454 * description of them into 'class'. The description complies with the
2455 * specification given in the vswitch database documentation for linux-htb
2458 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2460 static const struct nl_policy tca_htb_policy[] = {
2461 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2462 .min_len = sizeof(struct tc_htb_opt) },
2465 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2466 const struct tc_htb_opt *htb;
2468 if (!nl_parse_nested(nl_options, tca_htb_policy,
2469 attrs, ARRAY_SIZE(tca_htb_policy))) {
2470 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2474 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2475 class->min_rate = htb->rate.rate;
2476 class->max_rate = htb->ceil.rate;
2477 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2478 class->priority = htb->prio;
2483 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2484 struct htb_class *options,
2485 struct netdev_queue_stats *stats)
2487 struct nlattr *nl_options;
2488 unsigned int handle;
2491 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2492 if (!error && queue_id) {
2493 unsigned int major = tc_get_major(handle);
2494 unsigned int minor = tc_get_minor(handle);
2495 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2496 *queue_id = minor - 1;
2501 if (!error && options) {
2502 error = htb_parse_tca_options__(nl_options, options);
2508 htb_parse_qdisc_details__(struct netdev *netdev,
2509 const struct shash *details, struct htb_class *hc)
2511 const char *max_rate_s;
2513 max_rate_s = shash_find_data(details, "max-rate");
2514 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2515 if (!hc->max_rate) {
2518 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2519 hc->max_rate = netdev_features_to_bps(current) / 8;
2521 hc->min_rate = hc->max_rate;
/* Fills in '*hc' for one HTB class from the per-queue 'details' keys
 * "min-rate", "max-rate", "burst" and "priority", clamping against the device
 * MTU and the max_rate recorded in the HTB state of 'netdev':
 *
 *   - min_rate: at least 'mtu', at most the qdisc max_rate.
 *   - max_rate: at least min_rate, at most the qdisc max_rate.
 *   - burst: at least 'mtu' + 64.
 *   - priority: parsed with strtoul(), 0 when the key is absent.
 *
 * The rate and burst strings are parsed as decimal and divided by 8.  A
 * device whose MTU comes back as INT_MAX is reported with a warning as
 * lacking an MTU.
 *
 * NOTE(review): the fallback used for max_rate when "max-rate" is absent, the
 * statements after the MTU warning, and the return statement are not visible
 * in this listing.  The text starting "According to hints" further down is
 * the tail of a comment whose opening line is also missing here. */
2527 htb_parse_class_details__(struct netdev *netdev,
2528 const struct shash *details, struct htb_class *hc)
2530 const struct htb *htb = htb_get__(netdev);
2531 const char *min_rate_s = shash_find_data(details, "min-rate");
2532 const char *max_rate_s = shash_find_data(details, "max-rate");
2533 const char *burst_s = shash_find_data(details, "burst");
2534 const char *priority_s = shash_find_data(details, "priority");
2537 netdev_get_mtu(netdev, &mtu);
2538 if (mtu == INT_MAX) {
2539 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2540 netdev_get_name(netdev));
2544 /* HTB requires at least an mtu sized min-rate to send any traffic even
2545 * on uncongested links. */
2546 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2547 hc->min_rate = MAX(hc->min_rate, mtu);
2548 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2551 hc->max_rate = (max_rate_s
2552 ? strtoull(max_rate_s, NULL, 10) / 8
2554 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2555 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2559 * According to hints in the documentation that I've read, it is important
2560 * that 'burst' be at least as big as the largest frame that might be
2561 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2562 * but having it a bit too small is a problem. Since netdev_get_mtu()
2563 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2564 * the MTU. We actually add 64, instead of 14, as a guard against
2565 * additional headers get tacked on somewhere that we're not aware of. */
2566 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2567 hc->burst = MAX(hc->burst, mtu + 64);
2570 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Asks the kernel, through tc_query_class(), for class 'handle' under
 * 'parent' on 'netdev', then decodes the reply with htb_parse_tcmsg__() into
 * '*options' and '*stats'.  No queue ID is requested from the parser (NULL is
 * passed for it).  The reply buffer is released with ofpbuf_delete() after
 * parsing.
 *
 * NOTE(review): the condition guarding the parse step and the return
 * statement are not visible in this listing. */
2576 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2577 unsigned int parent, struct htb_class *options,
2578 struct netdev_queue_stats *stats)
2580 struct ofpbuf *reply;
2583 error = tc_query_class(netdev, handle, parent, &reply);
2585 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2586 ofpbuf_delete(reply);
/* Installs HTB on 'netdev': sets up the qdisc with htb_setup_qdisc__(),
 * configures class 1:fffe under parent 1:0 using the parameters parsed from
 * 'details', and records the resulting max_rate through htb_install__().
 *
 * NOTE(review): the error checks between these steps are not visible in this
 * listing; presumably each step runs only if the previous one succeeded --
 * confirm. */
2592 htb_tc_install(struct netdev *netdev, const struct shash *details)
2596 error = htb_setup_qdisc__(netdev);
2598 struct htb_class hc;
2600 htb_parse_qdisc_details__(netdev, details, &hc);
2601 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2602 tc_make_handle(1, 0), &hc);
2604 htb_install__(netdev, hc.max_rate);
/* Returns the struct htb_class that embeds 'queue' as its 'tc_queue' member.
 * The const qualifier on 'queue' is not carried over to the returned
 * pointer. */
2610 static struct htb_class *
2611 htb_class_cast__(const struct tc_queue *queue)
2613 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Records the parameters in '*hc' for queue 'queue_id' in the HTB state of
 * 'netdev'.  The queue is looked up with tc_find_queue__() using a hash of
 * 'queue_id'.  The visible code also allocates a new struct htb_class and
 * inserts it into 'htb->tc.queues' (presumably only when the lookup found
 * nothing; the branch lines are not visible in this listing).  On either path
 * min_rate, max_rate, burst and priority are then copied from '*hc'. */
2617 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2618 const struct htb_class *hc)
2620 struct htb *htb = htb_get__(netdev);
2621 size_t hash = hash_int(queue_id, 0);
2622 struct tc_queue *queue;
2623 struct htb_class *hcp;
2625 queue = tc_find_queue__(netdev, queue_id, hash);
2627 hcp = htb_class_cast__(queue);
2629 hcp = xmalloc(sizeof *hcp);
2630 queue = &hcp->tc_queue;
2631 queue->queue_id = queue_id;
2632 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2635 hcp->min_rate = hc->min_rate;
2636 hcp->max_rate = hc->max_rate;
2637 hcp->burst = hc->burst;
2638 hcp->priority = hc->priority;
/* Rebuilds the in-memory HTB state of 'netdev' from the kernel.  Class 1:fffe
 * is queried to obtain the qdisc-wide max_rate, which is handed to
 * htb_install__(); then each class message produced by the queue dump is
 * parsed and stored with htb_update_queue__().  'nlmsg' is unused.
 *
 * NOTE(review): the result of htb_query_class__() is ignored in the visible
 * code, so 'hc.max_rate' needs a defined value before the query; whether it
 * is initialized is not visible in this listing -- confirm. */
2642 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2645 struct nl_dump dump;
2646 struct htb_class hc;
2648 /* Get qdisc options. */
2650 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2651 htb_install__(netdev, hc.max_rate);
2654 if (!start_queue_dump(netdev, &dump)) {
2657 while (nl_dump_next(&dump, &msg)) {
2658 unsigned int queue_id;
2660 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2661 htb_update_queue__(netdev, queue_id, &hc);
2664 nl_dump_done(&dump);
/* Tears down the struct htb that contains 'tc'.  Each htb_class is unlinked
 * from 'htb->tc.queues' while iterating with HMAP_FOR_EACH_SAFE.
 *
 * NOTE(review): the statements that release each class and the htb itself
 * are not visible in this listing -- confirm that they are freed. */
2670 htb_tc_destroy(struct tc *tc)
2672 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2673 struct htb_class *hc, *next;
2675 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2676 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Adds "max-rate" to 'details', formatted as 8 times the max_rate stored in
 * the HTB state of 'netdev'.  The 8ULL factor makes the product unsigned long
 * long, matching the "%llu" conversion.  The value string is produced by
 * xasprintf(). */
2684 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2686 const struct htb *htb = htb_get__(netdev);
2687 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
/* Applies new qdisc-level 'details' to 'netdev': parses them into a class
 * description, pushes it to class 1:fffe (parent 1:0) with
 * htb_setup_class__(), and stores the new max_rate in the HTB state.
 *
 * NOTE(review): the check on 'error' that presumably guards the final store
 * is not visible in this listing. */
2692 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2694 struct htb_class hc;
2697 htb_parse_qdisc_details__(netdev, details, &hc);
2698 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2699 tc_make_handle(1, 0), &hc);
2701 htb_get__(netdev)->max_rate = hc.max_rate;
/* Reports the settings of 'queue' into 'details'.  "min-rate" is always
 * added, and "max-rate" only when it differs from min_rate.  "burst" and
 * "priority" are added as well (any condition guarding them is not visible in
 * this listing).  Rates and burst are multiplied by 8 on the way out,
 * mirroring the division by 8 applied when the details are parsed.  'netdev'
 * is unused. */
2707 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2708 const struct tc_queue *queue, struct shash *details)
2710 const struct htb_class *hc = htb_class_cast__(queue);
2712 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2713 if (hc->min_rate != hc->max_rate) {
2714 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2716 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2718 shash_add(details, "priority", xasprintf("%u", hc->priority));
/* Creates or reconfigures queue 'queue_id' on 'netdev' from 'details': the
 * details are parsed with htb_parse_class_details__(), kernel class
 * 1:(queue_id + 1) is set up under parent 1:fffe, and the result is mirrored
 * into local state with htb_update_queue__().
 *
 * NOTE(review): the early exits taken when parsing or setup fails are not
 * visible in this listing. */
2724 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2725 const struct shash *details)
2727 struct htb_class hc;
2730 error = htb_parse_class_details__(netdev, details, &hc);
2735 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2736 tc_make_handle(1, 0xfffe), &hc);
2741 htb_update_queue__(netdev, queue_id, &hc);
/* Deletes kernel class 1:(queue_id + 1) for 'queue' on 'netdev' and unlinks
 * the corresponding htb_class from 'htb->tc.queues'.
 *
 * NOTE(review): the check on 'error' that presumably guards the removal, and
 * the release of the class memory, are not visible in this listing. */
2746 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2748 struct htb_class *hc = htb_class_cast__(queue);
2749 struct htb *htb = htb_get__(netdev);
2752 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2754 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Retrieves the statistics of 'queue' into '*stats' by querying kernel class
 * 1:(queue_id + 1) under parent 1:fffe.  Class options are not requested
 * (NULL is passed).  Returns whatever htb_query_class__() returns. */
2761 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2762 struct netdev_queue_stats *stats)
2764 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2765 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one class message 'nlmsg': the handle and statistics are parsed
 * with tc_parse_class() (options are not requested) and, for a handle 1:N
 * with 1 <= N <= HTB_N_QUEUES, 'cb' is invoked with queue ID N - 1, the
 * statistics and 'aux'.  'netdev' is unused.
 *
 * NOTE(review): the handling of a parse error and the return statement are
 * not visible in this listing. */
2769 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2770 const struct ofpbuf *nlmsg,
2771 netdev_dump_queue_stats_cb *cb, void *aux)
2773 struct netdev_queue_stats stats;
2774 unsigned int handle, major, minor;
2777 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2782 major = tc_get_major(handle);
2783 minor = tc_get_minor(handle);
2784 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2785 (*cb)(minor - 1, &stats, aux);
/* Operations table for the HTB traffic-control class.  The initializer is
 * positional, so entries must follow the member order of struct tc_ops; the
 * first three are the Linux qdisc name, the OVS type name and the queue
 * count.
 *
 * NOTE(review): the entries between n_queues and htb_class_get_stats are not
 * visible in this listing. */
2790 static const struct tc_ops tc_ops_htb = {
2791 "htb", /* linux_name */
2792 "linux-htb", /* ovs_name */
2793 HTB_N_QUEUES, /* n_queues */
2802 htb_class_get_stats,
2803 htb_class_dump_stats
/* HFSC_N_QUEUES is the upper bound applied when class minor numbers are
 * mapped to queue IDs (1 <= minor <= HFSC_N_QUEUES).  The 'tc_queue' member
 * shown here is the one hfsc_class_cast__() uses to recover a struct
 * hfsc_class; the surrounding struct definitions are not visible in this
 * listing. */
2806 /* "linux-hfsc" traffic control class. */
2808 #define HFSC_N_QUEUES 0xf000
2816 struct tc_queue tc_queue;
/* Returns the struct hfsc that embeds the 'tc' currently attached to the
 * device of 'netdev'.  No check that the attached tc really is an HFSC one is
 * visible here, so callers are presumably expected to guarantee it. */
2821 static struct hfsc *
2822 hfsc_get__(const struct netdev *netdev)
2824 struct netdev_dev_linux *netdev_dev;
2825 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2826 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
/* Returns the struct hfsc_class that embeds 'queue' as its 'tc_queue' member.
 * The const qualifier on 'queue' is not carried over to the returned
 * pointer. */
2829 static struct hfsc_class *
2830 hfsc_class_cast__(const struct tc_queue *queue)
2832 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates a struct hfsc, initializes its embedded tc with tc_ops_hfsc,
 * records 'max_rate' in it, and attaches it as the tc of the device of
 * 'netdev'.  The previous value of 'netdev_dev->tc' is overwritten without
 * being released here. */
2836 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2838 struct netdev_dev_linux * netdev_dev;
2841 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2842 hfsc = xmalloc(sizeof *hfsc);
2843 tc_init(&hfsc->tc, &tc_ops_hfsc);
2844 hfsc->max_rate = max_rate;
2845 netdev_dev->tc = &hfsc->tc;
/* Records the parameters in '*hc' for queue 'queue_id' in the HFSC state of
 * 'netdev'.  The queue is looked up with tc_find_queue__() using a hash of
 * 'queue_id'.  The visible code also allocates a new struct hfsc_class and
 * inserts it into 'hfsc->tc.queues' (presumably only when the lookup found
 * nothing; the branch lines are not visible in this listing).  On either path
 * min_rate and max_rate are then copied from '*hc'. */
2849 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2850 const struct hfsc_class *hc)
2854 struct hfsc_class *hcp;
2855 struct tc_queue *queue;
2857 hfsc = hfsc_get__(netdev);
2858 hash = hash_int(queue_id, 0);
2860 queue = tc_find_queue__(netdev, queue_id, hash);
2862 hcp = hfsc_class_cast__(queue);
2864 hcp = xmalloc(sizeof *hcp);
2865 queue = &hcp->tc_queue;
2866 queue->queue_id = queue_id;
2867 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
2870 hcp->min_rate = hc->min_rate;
2871 hcp->max_rate = hc->max_rate;
/* Decodes the nested HFSC class options 'nl_options' into '*class'.
 *
 * Three service-curve attributes are read (TCA_HFSC_RSC, TCA_HFSC_FSC and
 * TCA_HFSC_USC), each required by the policy to be at least
 * sizeof(struct tc_service_curve) bytes long.  A warning is logged when:
 *
 *   - the nested attributes cannot be parsed,
 *   - any curve has a nonzero m1 or d (reported as non-linear),
 *   - rsc->m2 differs from fsc->m2, or
 *   - rsc->m2 exceeds usc->m2.
 *
 * Otherwise min_rate is taken from fsc->m2 and max_rate from usc->m2.
 *
 * NOTE(review): the designators of the policy entries, the value returned on
 * each path, and whether the attributes are optional are not visible in this
 * listing. */
2875 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2877 const struct tc_service_curve *rsc, *fsc, *usc;
2878 static const struct nl_policy tca_hfsc_policy[] = {
2880 .type = NL_A_UNSPEC,
2882 .min_len = sizeof(struct tc_service_curve),
2885 .type = NL_A_UNSPEC,
2887 .min_len = sizeof(struct tc_service_curve),
2890 .type = NL_A_UNSPEC,
2892 .min_len = sizeof(struct tc_service_curve),
2895 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
2897 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
2898 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
2899 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
2903 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
2904 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
2905 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
2907 if (rsc->m1 != 0 || rsc->d != 0 ||
2908 fsc->m1 != 0 || fsc->d != 0 ||
2909 usc->m1 != 0 || usc->d != 0) {
2910 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2911 "Non-linear service curves are not supported.");
2915 if (rsc->m2 != fsc->m2) {
2916 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2917 "Real-time service curves are not supported ");
2921 if (rsc->m2 > usc->m2) {
2922 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2923 "Min-rate service curve is greater than "
2924 "the max-rate service curve.");
2928 class->min_rate = fsc->m2;
2929 class->max_rate = usc->m2;
/* Parses the Netlink class message 'tcmsg' with tc_parse_class(), obtaining
 * the class handle, the nested options attribute and, through 'stats', the
 * class statistics.  A handle of the form 1:N with 1 <= N <= HFSC_N_QUEUES is
 * translated to queue ID N - 1 in '*queue_id', and the nested options are
 * decoded into '*options' with hfsc_parse_tca_options__().
 *
 * NOTE(review): the conditions guarding those two steps (presumably null
 * checks on 'queue_id' and 'options' plus error checks), the out-of-range
 * case, and the return statements are not visible in this listing. */
2934 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2935 struct hfsc_class *options,
2936 struct netdev_queue_stats *stats)
2939 unsigned int handle;
2940 struct nlattr *nl_options;
2942 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2948 unsigned int major, minor;
2950 major = tc_get_major(handle);
2951 minor = tc_get_minor(handle);
2952 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
2953 *queue_id = minor - 1;
2960 error = hfsc_parse_tca_options__(nl_options, options);
/* Asks the kernel, through tc_query_class(), for class 'handle' under
 * 'parent' on 'netdev', then decodes the reply with hfsc_parse_tcmsg__() into
 * '*options' and '*stats'.  No queue ID is requested from the parser (NULL is
 * passed for it).  The reply buffer is released with ofpbuf_delete() after
 * parsing.
 *
 * NOTE(review): the error check after the query and the return statement are
 * not visible in this listing. */
2967 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
2968 unsigned int parent, struct hfsc_class *options,
2969 struct netdev_queue_stats *stats)
2972 struct ofpbuf *reply;
2974 error = tc_query_class(netdev, handle, parent, &reply);
2979 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
2980 ofpbuf_delete(reply);
/* Fills in '*class' for the HFSC root class from the qdisc-level 'details'.
 *
 * "max-rate" is parsed as a decimal number and divided by 8; a rate derived
 * from netdev_get_features() and netdev_features_to_bps() is also computed
 * (presumably only as a fallback when the parsed rate is zero -- the
 * condition is not visible in this listing).  Both min_rate and max_rate of
 * '*class' are set to the resulting value.
 *
 * NOTE(review): "¤t" in the netdev_get_features() call looks like a
 * mis-encoded "&current" (the following line reads 'current'); as written the
 * line cannot compile.  strtoull() is called with a null end pointer, so
 * malformed input is not detected. */
2985 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
2986 struct hfsc_class *class)
2989 const char *max_rate_s;
2991 max_rate_s = shash_find_data(details, "max-rate");
2992 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2997 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2998 max_rate = netdev_features_to_bps(current) / 8;
3001 class->min_rate = max_rate;
3002 class->max_rate = max_rate;
/* Fills in '*class' for one HFSC class from the per-queue 'details' keys
 * "min-rate" and "max-rate".  Both strings are parsed as decimal and divided
 * by 8, then clamped against the max_rate recorded in the HFSC state of
 * 'netdev':
 *
 *   - min_rate: at least 1, at most the qdisc max_rate.
 *   - max_rate: at least min_rate, at most the qdisc max_rate.
 *
 * NOTE(review): the fallback used for max_rate when "max-rate" is absent and
 * the return statement are not visible in this listing.  The parsed values
 * are stored in uint32_t locals, so results above 32 bits are truncated before
 * clamping. */
3006 hfsc_parse_class_details__(struct netdev *netdev,
3007 const struct shash *details,
3008 struct hfsc_class * class)
3010 const struct hfsc *hfsc;
3011 uint32_t min_rate, max_rate;
3012 const char *min_rate_s, *max_rate_s;
3014 hfsc = hfsc_get__(netdev);
3015 min_rate_s = shash_find_data(details, "min-rate");
3016 max_rate_s = shash_find_data(details, "max-rate");
3018 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3019 min_rate = MAX(min_rate, 1);
3020 min_rate = MIN(min_rate, hfsc->max_rate);
3022 max_rate = (max_rate_s
3023 ? strtoull(max_rate_s, NULL, 10) / 8
3025 max_rate = MAX(max_rate, min_rate);
3026 max_rate = MIN(max_rate, hfsc->max_rate);
3028 class->min_rate = min_rate;
3029 class->max_rate = max_rate;
/* hfsc_setup_qdisc__(): removes any existing root qdisc with tc_del_qdisc(),
 * then builds an RTM_NEWQDISC request (NLM_F_EXCL | NLM_F_CREATE) of kind
 * "hfsc" with handle 1:0 at TC_H_ROOT and sends it.  'opt' is zeroed before
 * being attached as TCA_OPTIONS; any field set after the memset is not
 * visible in this listing.  Returns the result of tc_transact().
 *
 * NOTE(review): the result of tc_del_qdisc() is not used in the visible code,
 * and the handling of a failed tc_make_request() is not visible. */
3034 /* Create an HFSC qdisc.
3036 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3038 hfsc_setup_qdisc__(struct netdev * netdev)
3040 struct tcmsg *tcmsg;
3041 struct ofpbuf request;
3042 struct tc_hfsc_qopt opt;
3044 tc_del_qdisc(netdev);
3046 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3047 NLM_F_EXCL | NLM_F_CREATE, &request);
3053 tcmsg->tcm_handle = tc_make_handle(1, 0);
3054 tcmsg->tcm_parent = TC_H_ROOT;
3056 memset(&opt, 0, sizeof opt);
3059 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3060 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3062 return tc_transact(&request, NULL);
/* hfsc_setup_class__(): builds an RTM_NEWTCLASS request (NLM_F_CREATE) for
 * class 'handle' under 'parent' on 'netdev' and sends it with tc_transact().
 * Two service curves are used: 'min', with m2 set to class->min_rate, is sent
 * as both TCA_HFSC_RSC and TCA_HFSC_FSC, and 'max', with m2 set to
 * class->max_rate, is sent as TCA_HFSC_USC, all nested inside TCA_OPTIONS.
 * A rate-limited warning naming the device, handles and rates is logged
 * (presumably only when the transaction fails; the condition is not visible
 * in this listing).
 *
 * NOTE(review): the initialization of the remaining fields of 'min' and
 * 'max', the handling of a failed tc_make_request(), and the return statement
 * are not visible in this listing. */
3065 /* Create an HFSC class.
3067 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3068 * sc rate <min_rate> ul rate <max_rate>" */
3070 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3071 unsigned int parent, struct hfsc_class *class)
3075 struct tcmsg *tcmsg;
3076 struct ofpbuf request;
3077 struct tc_service_curve min, max;
3079 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3085 tcmsg->tcm_handle = handle;
3086 tcmsg->tcm_parent = parent;
3090 min.m2 = class->min_rate;
3094 max.m2 = class->max_rate;
3096 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3097 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3098 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3099 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3100 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3101 nl_msg_end_nested(&request, opt_offset);
3103 error = tc_transact(&request, NULL);
3105 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3106 "min-rate %ubps, max-rate %ubps (%s)",
3107 netdev_get_name(netdev),
3108 tc_get_major(handle), tc_get_minor(handle),
3109 tc_get_major(parent), tc_get_minor(parent),
3110 class->min_rate, class->max_rate, strerror(error));
/* Installs HFSC on 'netdev': sets up the qdisc with hfsc_setup_qdisc__(),
 * configures class 1:fffe under parent 1:0 using the parameters parsed from
 * 'details', and records the resulting max_rate through hfsc_install__().
 *
 * NOTE(review): the error checks between these steps and the return
 * statements are not visible in this listing. */
3117 hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3120 struct hfsc_class class;
3122 error = hfsc_setup_qdisc__(netdev);
3128 hfsc_parse_qdisc_details__(netdev, details, &class);
3129 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3130 tc_make_handle(1, 0), &class);
3136 hfsc_install__(netdev, class.max_rate);
/* Rebuilds the in-memory HFSC state of 'netdev' from the kernel.  Class
 * 1:fffe is queried to obtain the qdisc-wide max_rate, which is handed to
 * hfsc_install__(); then each class message produced by the queue dump is
 * parsed and stored with hfsc_update_queue__().  'nlmsg' is unused.
 *
 * NOTE(review): the result of hfsc_query_class__() is ignored in the visible
 * code, so 'hc.max_rate' needs a defined value before the query; whether it
 * is initialized is not visible in this listing -- confirm. */
3141 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3144 struct nl_dump dump;
3145 struct hfsc_class hc;
3148 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3149 hfsc_install__(netdev, hc.max_rate);
3151 if (!start_queue_dump(netdev, &dump)) {
3155 while (nl_dump_next(&dump, &msg)) {
3156 unsigned int queue_id;
3158 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3159 hfsc_update_queue__(netdev, queue_id, &hc);
3163 nl_dump_done(&dump);
/* Tears down the struct hfsc that contains 'tc'.  Each hfsc_class is unlinked
 * from 'hfsc->tc.queues' while iterating with HMAP_FOR_EACH_SAFE.
 *
 * NOTE(review): the statements that release each class and the hfsc itself
 * are not visible in this listing -- confirm that they are freed. */
3168 hfsc_tc_destroy(struct tc *tc)
3171 struct hfsc_class *hc, *next;
3173 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3175 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3176 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Adds "max-rate" to 'details', formatted as 8 times the max_rate stored in
 * the HFSC state of 'netdev'.  The 8ULL factor makes the product unsigned
 * long long, matching the "%llu" conversion. */
3185 hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3187 const struct hfsc *hfsc;
3188 hfsc = hfsc_get__(netdev);
3189 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
/* Applies new qdisc-level 'details' to 'netdev': parses them into a class
 * description, pushes it to class 1:fffe (parent 1:0) with
 * hfsc_setup_class__(), and stores the new max_rate in the HFSC state.
 *
 * NOTE(review): the check on 'error' that presumably guards the final store
 * is not visible in this listing. */
3194 hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3197 struct hfsc_class class;
3199 hfsc_parse_qdisc_details__(netdev, details, &class);
3200 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3201 tc_make_handle(1, 0), &class);
3204 hfsc_get__(netdev)->max_rate = class.max_rate;
/* Reports the settings of 'queue' into 'details'.  "min-rate" is always
 * added, and "max-rate" only when it differs from min_rate.  Both are
 * multiplied by 8 on the way out, mirroring the division by 8 applied when
 * the details are parsed.  'netdev' is unused. */
3211 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3212 const struct tc_queue *queue, struct shash *details)
3214 const struct hfsc_class *hc;
3216 hc = hfsc_class_cast__(queue);
3217 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3218 if (hc->min_rate != hc->max_rate) {
3219 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
/* Creates or reconfigures queue 'queue_id' on 'netdev' from 'details': the
 * details are parsed with hfsc_parse_class_details__(), kernel class
 * 1:(queue_id + 1) is set up under parent 1:fffe, and the result is mirrored
 * into local state with hfsc_update_queue__().
 *
 * NOTE(review): the early exits taken when parsing or setup fails are not
 * visible in this listing. */
3225 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3226 const struct shash *details)
3229 struct hfsc_class class;
3231 error = hfsc_parse_class_details__(netdev, details, &class);
3236 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3237 tc_make_handle(1, 0xfffe), &class);
3242 hfsc_update_queue__(netdev, queue_id, &class);
/* Deletes kernel class 1:(queue_id + 1) for 'queue' on 'netdev' and unlinks
 * the corresponding hfsc_class from 'hfsc->tc.queues'.
 *
 * NOTE(review): the check on 'error' that presumably guards the removal, and
 * the release of the class memory, are not visible in this listing. */
3247 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3251 struct hfsc_class *hc;
3253 hc = hfsc_class_cast__(queue);
3254 hfsc = hfsc_get__(netdev);
3256 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3258 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Retrieves the statistics of 'queue' into '*stats' by querying kernel class
 * 1:(queue_id + 1) under parent 1:fffe.  Class options are not requested
 * (NULL is passed).  Returns whatever hfsc_query_class__() returns. */
3265 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3266 struct netdev_queue_stats *stats)
3268 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3269 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one class message 'nlmsg': the handle and statistics are parsed
 * with tc_parse_class() (options are not requested) and, for a handle 1:N
 * with 1 <= N <= HFSC_N_QUEUES, 'cb' is invoked with queue ID N - 1, the
 * statistics and 'aux'.  'netdev' is unused.
 *
 * NOTE(review): the handling of a parse error and the return statement are
 * not visible in this listing. */
3273 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3274 const struct ofpbuf *nlmsg,
3275 netdev_dump_queue_stats_cb *cb, void *aux)
3277 struct netdev_queue_stats stats;
3278 unsigned int handle, major, minor;
3281 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3286 major = tc_get_major(handle);
3287 minor = tc_get_minor(handle);
3288 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3289 (*cb)(minor - 1, &stats, aux);
/* Operations table for the HFSC traffic-control class.  The initializer is
 * positional, so the entries must stay in the member order of struct tc_ops;
 * each entry is labelled with the member it fills. */
3294 static const struct tc_ops tc_ops_hfsc = {
3295 "hfsc", /* linux_name */
3296 "linux-hfsc", /* ovs_name */
3297 HFSC_N_QUEUES, /* n_queues */
3298 hfsc_tc_install, /* tc_install */
3299 hfsc_tc_load, /* tc_load */
3300 hfsc_tc_destroy, /* tc_destroy */
3301 hfsc_qdisc_get, /* qdisc_get */
3302 hfsc_qdisc_set, /* qdisc_set */
3303 hfsc_class_get, /* class_get */
3304 hfsc_class_set, /* class_set */
3305 hfsc_class_delete, /* class_delete */
3306 hfsc_class_get_stats, /* class_get_stats */
3307 hfsc_class_dump_stats /* class_dump_stats */
/* default_install__() attaches a struct tc initialized with tc_ops_default to
 * the device of 'netdev'.  'tc' is a function-local static, so its value
 * persists across calls and the same object can end up attached to several
 * devices.
 *
 * NOTE(review): the guard that presumably restricts the xmalloc() and
 * tc_init() to the first call is not visible in this listing -- confirm. */
3310 /* "linux-default" traffic control class.
3312 * This class represents the default, unnamed Linux qdisc. It corresponds to
3313 * the "" (empty string) QoS type in the OVS database. */
3316 default_install__(struct netdev *netdev)
3318 struct netdev_dev_linux *netdev_dev =
3319 netdev_dev_linux_cast(netdev_get_dev(netdev));
3320 static struct tc *tc;
3323 tc = xmalloc(sizeof *tc);
3324 tc_init(tc, &tc_ops_default);
3326 netdev_dev->tc = tc;
/* tc_install hook for the default class: delegates to default_install__().
 * 'details' is unused.  The return statement is not visible in this
 * listing. */
3330 default_tc_install(struct netdev *netdev,
3331 const struct shash *details OVS_UNUSED)
3333 default_install__(netdev);
/* tc_load hook for the default class: delegates to default_install__().
 * 'nlmsg' is unused.  The return statement is not visible in this listing. */
3338 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3340 default_install__(netdev);
/* Operations table for the default qdisc.  The initializer is positional.
 * linux_name is NULL and every hook from tc_destroy onward is NULL, so callers
 * must check a hook for NULL before invoking it (tc_del_qdisc() does so for
 * tc_destroy).
 *
 * NOTE(review): the entries between linux_name and tc_destroy are not visible
 * in this listing. */
3344 static const struct tc_ops tc_ops_default = {
3345 NULL, /* linux_name */
3350 NULL, /* tc_destroy */
3351 NULL, /* qdisc_get */
3352 NULL, /* qdisc_set */
3353 NULL, /* class_get */
3354 NULL, /* class_set */
3355 NULL, /* class_delete */
3356 NULL, /* class_get_stats */
3357 NULL /* class_dump_stats */
/* other_tc_load() attaches a struct tc initialized with tc_ops_other to the
 * device of 'netdev'.  'tc' is a function-local static, so its value persists
 * across calls.  'nlmsg' is unused.
 *
 * NOTE(review): the guard that presumably restricts the xmalloc() and
 * tc_init() to the first call, the return statement, and the end of the
 * section comment below are not visible in this listing. */
3360 /* "linux-other" traffic control class.
3365 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3367 struct netdev_dev_linux *netdev_dev =
3368 netdev_dev_linux_cast(netdev_get_dev(netdev));
3369 static struct tc *tc;
3372 tc = xmalloc(sizeof *tc);
3373 tc_init(tc, &tc_ops_other);
3375 netdev_dev->tc = tc;
/* Operations table for "linux-other".  The initializer is positional.
 * linux_name and tc_install are NULL, as is every hook from tc_destroy
 * onward.
 *
 * NOTE(review): the n_queues and tc_load entries are not visible in this
 * listing. */
3379 static const struct tc_ops tc_ops_other = {
3380 NULL, /* linux_name */
3381 "linux-other", /* ovs_name */
3383 NULL, /* tc_install */
3385 NULL, /* tc_destroy */
3386 NULL, /* qdisc_get */
3387 NULL, /* qdisc_set */
3388 NULL, /* class_get */
3389 NULL, /* class_set */
3390 NULL, /* class_delete */
3391 NULL, /* class_get_stats */
3392 NULL /* class_dump_stats */
/* File-scope state shared by the tick and buffer conversion helpers below:
 * 'ticks_per_s' is the divisor or multiplier in tc_ticks_to_bytes() and
 * tc_bytes_to_ticks(), and 'buffer_hz' is the divisor in
 * tc_buffer_per_jiffy().  'ticks_per_s' is computed from the contents of
 * /proc/net/psched.
 *
 * NOTE(review): the closing line of the 'buffer_hz' comment is not visible in
 * this listing. */
3395 /* Traffic control. */
3397 /* Number of kernel "tc" ticks per second. */
3398 static double ticks_per_s;
3400 /* Number of kernel "jiffies" per second. This is used for the purpose of
3401 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3402 * one jiffy's worth of data.
3404 * There are two possibilities here:
3406 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3407 * approximate range of 100 to 1024. That means that we really need to
3408 * make sure that the qdisc can buffer that much data.
3410 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3411 * has finely granular timers and there's no need to fudge additional room
3412 * for buffers. (There's no extra effort needed to implement that: the
3413 * large 'buffer_hz' is used as a divisor, so practically any number will
3414 * come out as 0 in the division. Small integer results in the case of
3415 * really high dividends won't have any real effect anyhow.)
3417 static unsigned int buffer_hz;
/* tc_make_handle(): 'major' is shifted into the upper 16 bits and combined
 * with 'minor' through TC_H_MAKE.  This is the inverse of tc_get_major() and
 * tc_get_minor(). */
3419 /* Returns tc handle 'major':'minor'. */
3421 tc_make_handle(unsigned int major, unsigned int minor)
3423 return TC_H_MAKE(major << 16, minor);
/* tc_get_major(): extracts the major part of 'handle' with TC_H_MAJ and
 * shifts it down by 16 bits, undoing the shift applied by tc_make_handle(). */
3426 /* Returns the major number from 'handle'. */
3428 tc_get_major(unsigned int handle)
3430 return TC_H_MAJ(handle) >> 16;
/* tc_get_minor(): extracts the minor part of 'handle' with TC_H_MIN; no shift
 * is applied. */
3433 /* Returns the minor number from 'handle'. */
3435 tc_get_minor(unsigned int handle)
3437 return TC_H_MIN(handle);
/* Begins a tc Netlink request of message type 'type' for 'netdev' in
 * 'request'.  The interface index is looked up with get_ifindex(); 'request'
 * is initialized with ofpbuf_init() (size argument 512), then a Netlink
 * header with flags NLM_F_REQUEST | 'flags' and a zeroed struct tcmsg are
 * appended.  The tcmsg gets tcm_family AF_UNSPEC and the looked-up ifindex;
 * tcm_handle and tcm_parent are left for the caller to fill in.
 *
 * NOTE(review): the handling of a get_ifindex() failure and the return
 * statement are not visible in this listing.  Callers in this file use the
 * return value as the tcmsg to fill in. */
3440 static struct tcmsg *
3441 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3442 struct ofpbuf *request)
3444 struct tcmsg *tcmsg;
3448 error = get_ifindex(netdev, &ifindex);
3453 ofpbuf_init(request, 512);
3454 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3455 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3456 tcmsg->tcm_family = AF_UNSPEC;
3457 tcmsg->tcm_ifindex = ifindex;
3458 /* Caller should fill in tcmsg->tcm_handle. */
3459 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' over 'rtnl_sock' with nl_sock_transact(), passing 'replyp'
 * through unchanged, and then uninitializes 'request' whatever the outcome.
 * The caller must therefore not reuse 'request' afterwards.  The return
 * statement is not visible in this listing; callers treat the result as an
 * error code. */
3465 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3467 int error = nl_sock_transact(rtnl_sock, request, replyp);
3468 ofpbuf_uninit(request);
/* Body of the routine that loads the scheduler clock parameters.  It opens
 * /proc/net/psched, reads four hexadecimal fields (a, b, c, d) with fscanf()
 * and derives 'ticks_per_s' as a * c / b in double precision.  Open and read
 * failures, invalid parameters and unexpected parameters are each reported
 * with VLOG_WARN; the parsed values and the results are logged at debug
 * level.
 *
 * NOTE(review): the function header, the conditions behind the "invalid" and
 * "unexpected" warnings, the assignment of 'buffer_hz', the closing of
 * 'stream', and the end of the explanatory comment below are not visible in
 * this listing. */
3475 /* The values in psched are not individually very meaningful, but they are
3476 * important. The tables below show some values seen in the wild.
3480 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3481 * (Before that, there are hints that it was 1000000000.)
3483 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3487 * -----------------------------------
3488 * [1] 000c8000 000f4240 000f4240 00000064
3489 * [2] 000003e8 00000400 000f4240 3b9aca00
3490 * [3] 000003e8 00000400 000f4240 3b9aca00
3491 * [4] 000003e8 00000400 000f4240 00000064
3492 * [5] 000003e8 00000040 000f4240 3b9aca00
3493 * [6] 000003e8 00000040 000f4240 000000f9
3495 * a b c d ticks_per_s buffer_hz
3496 * ------- --------- ---------- ------------- ----------- -------------
3497 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3498 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3499 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3500 * [4] 1,000 1,024 1,000,000 100 976,562 100
3501 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3502 * [6] 1,000 64 1,000,000 249 15,625,000 249
3504 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3505 * [2] 2.6.26-1-686-bigmem from Debian lenny
3506 * [3] 2.6.26-2-sparc64 from Debian lenny
3507 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3508 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3509 * [6] 2.6.34 from kernel.org on KVM
3511 static const char fn[] = "/proc/net/psched";
3512 unsigned int a, b, c, d;
3518 stream = fopen(fn, "r");
3520 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3524 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3525 VLOG_WARN("%s: read failed", fn);
3529 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3533 VLOG_WARN("%s: invalid scheduler parameters", fn);
3537 ticks_per_s = (double) a * c / b;
3541 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3544 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
/* tc_ticks_to_bytes(): inverse of tc_bytes_to_ticks().
 *
 * NOTE(review): 'rate' and 'ticks' are both unsigned int, so 'rate * ticks'
 * is evaluated in unsigned int arithmetic and can wrap before it is divided
 * by the double 'ticks_per_s'.  Widening one operand (for example to double
 * or unsigned long long) before the multiplication would avoid that.  Any
 * lazy initialization of 'ticks_per_s' before the return is not visible in
 * this listing. */
3547 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3548 * rate of 'rate' bytes per second. */
3550 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3555 return (rate * ticks) / ticks_per_s;
/* tc_bytes_to_ticks(): returns 0 when 'rate' is 0 rather than dividing by
 * zero.  The cast binds to 'ticks_per_s' alone, so its fractional part is
 * discarded before the multiplication by 'size'; the product and division are
 * then carried out in unsigned long long.  Any lazy initialization of
 * 'ticks_per_s' before the return is not visible in this listing. */
3558 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3559 * rate of 'rate' bytes per second. */
3561 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3566 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
/* tc_buffer_per_jiffy(): integer division of 'rate' by 'buffer_hz'; a very
 * large 'buffer_hz' therefore yields 0.
 *
 * NOTE(review): 'buffer_hz' must be nonzero at the division.  Whatever
 * ensures that (presumably a lazy load of the psched parameters) is not
 * visible in this listing -- confirm. */
3569 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3570 * a transmission rate of 'rate' bytes per second. */
3572 tc_buffer_per_jiffy(unsigned int rate)
3577 return rate / buffer_hz;
/* tc_parse_qdisc(): the attributes are parsed starting after the Netlink
 * header and the struct tcmsg.  The policy requires TCA_KIND (a string) and
 * treats TCA_OPTIONS (nested) as optional.  A parse failure is logged with a
 * rate-limited warning.
 *
 * NOTE(review): the null checks around the stores to '*kind' and '*options'
 * and the error path are not visible in this listing. */
3580 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3581 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3582 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3583 * stores NULL into it if it is absent.
3585 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3588 * Returns 0 if successful, otherwise a positive errno value. */
3590 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3591 struct nlattr **options)
3593 static const struct nl_policy tca_policy[] = {
3594 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3595 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3597 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3599 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3600 tca_policy, ta, ARRAY_SIZE(ta))) {
3601 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3606 *kind = nl_attr_get_string(ta[TCA_KIND]);
3610 *options = ta[TCA_OPTIONS];
/* tc_parse_class(): both TCA_OPTIONS and TCA_STATS2 are required by the
 * top-level policy.  '*handlep' receives the raw tcm_handle from the struct
 * tcmsg that follows the Netlink header.  Within TCA_STATS2, both
 * TCA_STATS_BASIC and TCA_STATS_QUEUE are required; tx_bytes and tx_packets
 * come from a zero-filled local copy of the basic stats (at most sizeof gsb
 * bytes are copied) and tx_errors from the queue drop counter.  '*stats' is
 * zeroed on a path whose condition is not visible in this listing (presumably
 * the error path).
 *
 * NOTE(review): the comment below speaks of '*queue_id', but the parameter is
 * named 'handlep' and receives the whole handle, not the minor number.  The
 * null checks on the output arguments and the return statements are not
 * visible in this listing. */
3625 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3626 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3627 * into '*options', and its queue statistics into '*stats'. Any of the output
3628 * arguments may be null.
3630 * Returns 0 if successful, otherwise a positive errno value. */
3632 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3633 struct nlattr **options, struct netdev_queue_stats *stats)
3635 static const struct nl_policy tca_policy[] = {
3636 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3637 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3639 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3641 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3642 tca_policy, ta, ARRAY_SIZE(ta))) {
3643 VLOG_WARN_RL(&rl, "failed to parse class message");
3648 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3649 *handlep = tc->tcm_handle;
3653 *options = ta[TCA_OPTIONS];
3657 const struct gnet_stats_queue *gsq;
3658 struct gnet_stats_basic gsb;
3660 static const struct nl_policy stats_policy[] = {
3661 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3662 .min_len = sizeof gsb },
3663 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3664 .min_len = sizeof *gsq },
3666 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3668 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3669 sa, ARRAY_SIZE(sa))) {
3670 VLOG_WARN_RL(&rl, "failed to parse class stats");
3674 /* Alignment issues screw up the length of struct gnet_stats_basic on
3675 * some arch/bitsize combinations. Newer versions of Linux have a
3676 * struct gnet_stats_basic_packed, but we can't depend on that. The
3677 * easiest thing to do is just to make a copy. */
3678 memset(&gsb, 0, sizeof gsb);
3679 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3680 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3681 stats->tx_bytes = gsb.bytes;
3682 stats->tx_packets = gsb.packets;
3684 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3685 stats->tx_errors = gsq->drops;
3695 memset(stats, 0, sizeof *stats);
/* tc_query_class(): builds an RTM_GETTCLASS request with NLM_F_ECHO for
 * 'netdev', sets tcm_handle to 'handle' and tcm_parent to 'parent', and sends
 * it with tc_transact(), which stores the reply in '*replyp'.  A rate-limited
 * warning naming the device and both handles is logged (presumably only on
 * failure; the condition is not visible in this listing).
 *
 * NOTE(review): the handling of a failed tc_make_request(), the return
 * statement, and the end of the comment below are not visible in this
 * listing. */
3700 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3703 tc_query_class(const struct netdev *netdev,
3704 unsigned int handle, unsigned int parent,
3705 struct ofpbuf **replyp)
3707 struct ofpbuf request;
3708 struct tcmsg *tcmsg;
3711 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3715 tcmsg->tcm_handle = handle;
3716 tcmsg->tcm_parent = parent;
3718 error = tc_transact(&request, replyp);
3720 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3721 netdev_get_name(netdev),
3722 tc_get_major(handle), tc_get_minor(handle),
3723 tc_get_major(parent), tc_get_minor(parent),
/* tc_delete_class(): builds an RTM_DELTCLASS request with no extra flags for
 * 'netdev', sets tcm_handle to 'handle' and tcm_parent to 0, and sends it
 * with tc_transact() without asking for a reply.  A rate-limited warning
 * naming the device and handle is logged (presumably only on failure; the
 * condition is not visible in this listing).
 *
 * NOTE(review): the handling of a failed tc_make_request() and the return
 * statement are not visible in this listing. */
3729 /* Equivalent to "tc class del dev <name> handle <handle>". */
3731 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3733 struct ofpbuf request;
3734 struct tcmsg *tcmsg;
3737 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3741 tcmsg->tcm_handle = handle;
3742 tcmsg->tcm_parent = 0;
3744 error = tc_transact(&request, NULL);
3746 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3747 netdev_get_name(netdev),
3748 tc_get_major(handle), tc_get_minor(handle),
/* tc_del_qdisc(): sends an RTM_DELQDISC request for handle 1:0 at TC_H_ROOT.
 * EINVAL from the kernel is singled out; per the inline comment it is taken
 * to mean the default qdisc was in use (the statement in that branch is not
 * visible in this listing, presumably it clears 'error').  When there is no
 * error and a tc is attached to the device, its tc_destroy hook is called if
 * one is provided and the device's 'tc' pointer is reset to NULL.
 *
 * NOTE(review): the handling of a failed tc_make_request() and the return
 * statement are not visible in this listing. */
3754 /* Equivalent to "tc qdisc del dev <name> root". */
3756 tc_del_qdisc(struct netdev *netdev)
3758 struct netdev_dev_linux *netdev_dev =
3759 netdev_dev_linux_cast(netdev_get_dev(netdev));
3760 struct ofpbuf request;
3761 struct tcmsg *tcmsg;
3764 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
3768 tcmsg->tcm_handle = tc_make_handle(1, 0);
3769 tcmsg->tcm_parent = TC_H_ROOT;
3771 error = tc_transact(&request, NULL);
3772 if (error == EINVAL) {
3773 /* EINVAL probably means that the default qdisc was in use, in which
3774 * case we've accomplished our purpose. */
3777 if (!error && netdev_dev->tc) {
3778 if (netdev_dev->tc->ops->tc_destroy) {
3779 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3781 netdev_dev->tc = NULL;
/* tc_query_qdisc(): outline of the visible logic.
 *
 *   1. If a tc is already attached to the device, an early path is taken
 *      (its body is not visible in this listing).
 *   2. An RTM_GETQDISC request with NLM_F_ECHO is sent for handle 1:0 with
 *      parent 0.
 *   3. The ops table is chosen from the outcome.  On success the qdisc kind
 *      is parsed and looked up with tc_lookup_linux_name(); tc_ops_other is
 *      used as the fallback, with an "unknown qdisc" info message for
 *      unrecognized kinds.  ENOENT selects tc_ops_default.  Any other error
 *      is logged as a warning and selects tc_ops_other.
 *   4. The tc_load hook of the chosen table is invoked, an assertion checks
 *      that a tc is attached exactly when loading succeeded, and the reply is
 *      deleted.
 *   5. The query error is returned if nonzero, otherwise the load error.
 *
 * NOTE(review): 'netdev' is const here but is cast to a non-const pointer for
 * tc_load.  On the failure branches 'qdisc' is still passed to tc_load and
 * ofpbuf_delete(), so it must hold a defined value there; whether it is set
 * is not visible in this listing -- confirm.  The end of the long comment
 * about RTM_GETQDISC is also missing here. */
3786 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3787 * kernel to determine what they are. Returns 0 if successful, otherwise a
3788 * positive errno value. */
3790 tc_query_qdisc(const struct netdev *netdev)
3792 struct netdev_dev_linux *netdev_dev =
3793 netdev_dev_linux_cast(netdev_get_dev(netdev));
3794 struct ofpbuf request, *qdisc;
3795 const struct tc_ops *ops;
3796 struct tcmsg *tcmsg;
3800 if (netdev_dev->tc) {
3804 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3805 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3806 * 2.6.35 without that fix backported to it.
3808 * To avoid the OOPS, we must not make a request that would attempt to dump
3809 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3810 * few others. There are a few ways that I can see to do this, but most of
3811 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3812 * technique chosen here is to assume that any non-default qdisc that we
3813 * create will have a class with handle 1:0. The built-in qdiscs only have
3814 * a class with handle 0:0.
3816 * We could check for Linux 2.6.35+ and use a more straightforward method
3818 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
3822 tcmsg->tcm_handle = tc_make_handle(1, 0);
3823 tcmsg->tcm_parent = 0;
3825 /* Figure out what tc class to instantiate. */
3826 error = tc_transact(&request, &qdisc);
3830 error = tc_parse_qdisc(qdisc, &kind, NULL);
3832 ops = &tc_ops_other;
3834 ops = tc_lookup_linux_name(kind);
3836 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3837 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3839 ops = &tc_ops_other;
3842 } else if (error == ENOENT) {
3843 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3844 * other entity that doesn't have a handle 1:0. We will assume
3845 * that it's the system default qdisc. */
3846 ops = &tc_ops_default;
3849 /* Who knows? Maybe the device got deleted. */
3850 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3851 netdev_get_name(netdev), strerror(error));
3852 ops = &tc_ops_other;
3855 /* Instantiate it. */
3856 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
3857 assert((load_error == 0) == (netdev_dev->tc != NULL));
3858 ofpbuf_delete(qdisc);
3860 return error ? error : load_error;
/* tc_calc_cell_log(): ETH_HEADER_LEN + VLAN_HEADER_LEN is added to 'mtu'
 * before the shift count is computed, so the result covers the full frame.
 * The loop increments 'cell_log' while 'mtu' is at least 256.  The statement
 * that shrinks 'mtu' on each iteration, the condition under which 'mtu' is
 * replaced by ETH_PAYLOAD_MAX, and the return statement are not visible in
 * this listing. */
3863 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3864 approximate the time to transmit packets of various lengths. For an MTU of
3865 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3866 represents two possible packet lengths; for a MTU of 513 through 1024, four
3867 possible lengths; and so on.
3869 Returns, for the specified 'mtu', the number of bits that packet lengths
3870 need to be shifted right to fit within such a 256-entry table. */
3872 tc_calc_cell_log(unsigned int mtu)
3877 mtu = ETH_PAYLOAD_MAX;
3879 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
3881 for (cell_log = 0; mtu >= 256; cell_log++) {
/* tc_fill_rate(): zeroes '*rate', then sets cell_log from tc_calc_cell_log()
 * and mpu to ETH_TOTAL_MIN.  The overhead and cell_align fields are left at
 * zero by the memset; their explicit assignments are commented out because
 * some distribution headers lack them.
 *
 * NOTE(review): the statement that stores 'Bps' into '*rate' and the end of
 * the comment below are not visible in this listing.  'Bps' is a uint64_t, so
 * confirm that the destination field is wide enough for it. */
3888 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
3891 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
3893 memset(rate, 0, sizeof *rate);
3894 rate->cell_log = tc_calc_cell_log(mtu);
3895 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3896 /* rate->cell_align = 0; */ /* distro headers. */
3897 rate->mpu = ETH_TOTAL_MIN;
/* tc_put_rtab(): reserves TC_RTAB_SIZE uninitialized bytes in 'msg' for the
 * attribute and fills every entry.  Entry 'i' holds the number of ticks
 * needed, at 'rate->rate', to send a packet of (i + 1) << rate->cell_log
 * bytes, with the packet size raised to at least rate->mpu.  The declarations
 * of 'rtab' and 'i' are not visible in this listing. */
3901 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
3902 * attribute of the specified "type".
3904 * See tc_calc_cell_log() above for a description of "rtab"s. */
3906 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
3911 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
3912 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
3913 unsigned packet_size = (i + 1) << rate->cell_log;
3914 if (packet_size < rate->mpu) {
3915 packet_size = rate->mpu;
3917 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
3921 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
3922 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
3923 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
 * 0 is fine, because the larger of 'burst_bytes' and a computed minimum burst
 * is what gets used.) */
3926 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
/* Smallest burst accepted: whatever tc_buffer_per_jiffy() returns for 'Bps',
 * plus one 'mtu'. */
3928 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
/* Take the larger of the requested burst and that minimum, and return it
 * converted by tc_bytes_to_ticks() at rate 'Bps'. */
3929 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
3932 /* Public utility functions. */
/* Copies each of the packet, byte, error, drop, multicast and collision
 * counters named below from '*src' to '*dst', one field at a time.
 *
 * The macro takes no arguments: it expects pointers named 'dst' and 'src' to
 * be in scope where it is expanded, each pointing to a structure that has all
 * of these members.  Member-by-member assignment lets the two structures use
 * different integer widths for the same counter.
 *
 * The body is wrapped in do { ... } while (0) so that the expansion is a
 * single statement.  Written as "COPY_NETDEV_STATS;" it therefore behaves
 * correctly as the body of an unbraced 'if', 'else' or loop, instead of only
 * its first assignment being controlled by the condition. */
#define COPY_NETDEV_STATS                                       \
    do {                                                        \
        dst->rx_packets = src->rx_packets;                      \
        dst->tx_packets = src->tx_packets;                      \
        dst->rx_bytes = src->rx_bytes;                          \
        dst->tx_bytes = src->tx_bytes;                          \
        dst->rx_errors = src->rx_errors;                        \
        dst->tx_errors = src->tx_errors;                        \
        dst->rx_dropped = src->rx_dropped;                      \
        dst->tx_dropped = src->tx_dropped;                      \
        dst->multicast = src->multicast;                        \
        dst->collisions = src->collisions;                      \
        dst->rx_length_errors = src->rx_length_errors;          \
        dst->rx_over_errors = src->rx_over_errors;              \
        dst->rx_crc_errors = src->rx_crc_errors;                \
        dst->rx_frame_errors = src->rx_frame_errors;            \
        dst->rx_fifo_errors = src->rx_fifo_errors;              \
        dst->rx_missed_errors = src->rx_missed_errors;          \
        dst->tx_aborted_errors = src->tx_aborted_errors;        \
        dst->tx_carrier_errors = src->tx_carrier_errors;        \
        dst->tx_fifo_errors = src->tx_fifo_errors;              \
        dst->tx_heartbeat_errors = src->tx_heartbeat_errors;    \
        dst->tx_window_errors = src->tx_window_errors;          \
    } while (0)
3957 /* Copies 'src' into 'dst', performing format conversion in the process.
 *
 * 'src' is a 'struct rtnl_link_stats' and is only read; 'dst' is the
 * 'struct netdev_stats' that receives the values.
 *
 * NOTE(review): the function body is not visible in this copy; presumably it
 * expands COPY_NETDEV_STATS -- confirm. */
3959 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
3960 const struct rtnl_link_stats *src)
3965 /* Copies 'src' into 'dst', performing format conversion in the process.
 *
 * 'src' is a 'struct rtnl_link_stats64' and is only read; 'dst' is the
 * 'struct netdev_stats' that receives the values.
 *
 * NOTE(review): the function body is not visible in this copy; presumably it
 * expands COPY_NETDEV_STATS -- confirm. */
3967 netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
3968 const struct rtnl_link_stats64 *src)
3973 /* Copies 'src' into 'dst', performing format conversion in the process.
 *
 * This is the reverse direction of the two functions above: 'src' is a
 * 'struct netdev_stats' and is only read; 'dst' is the
 * 'struct rtnl_link_stats64' that receives the values.
 *
 * NOTE(review): the function body is not visible in this copy; presumably it
 * expands COPY_NETDEV_STATS -- confirm. */
3975 netdev_stats_to_rtnl_link_stats64(struct rtnl_link_stats64 *dst,
3976 const struct netdev_stats *src)
3981 /* Utility functions. */
/* Fetches statistics for the interface whose index is 'ifindex' by sending an
 * RTM_GETLINK request on 'rtnl_sock' and converting the IFLA_STATS attribute
 * of the reply into '*stats'.
 *
 * The reply buffer is deleted on every path visible here.  NOTE(review): the
 * return statements are not visible in this copy, so the exact error values
 * are not documented here -- confirm against the full file. */
3984 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
3986 /* Policy for RTNLGRP_LINK messages.
3988 * There are *many* more fields in these messages, but currently we only
3989 * care about these fields. */
3990 static const struct nl_policy rtnlgrp_link_policy[] = {
/* The interface name is required; the statistics blob is optional but, when
 * present, must be at least as large as 'struct rtnl_link_stats'. */
3991 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
3992 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
3993 .min_len = sizeof(struct rtnl_link_stats) },
3996 struct ofpbuf request;
3997 struct ofpbuf *reply;
3998 struct ifinfomsg *ifi;
3999 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
/* Build the request: a Netlink header followed by a zeroed 'struct
 * ifinfomsg' that identifies the link by index. */
4002 ofpbuf_init(&request, 0);
4003 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4004 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4005 ifi->ifi_family = PF_UNSPEC;
4006 ifi->ifi_index = ifindex;
/* Send it and collect the reply; the request buffer is released right away
 * whether or not the transaction succeeded. */
4007 error = nl_sock_transact(rtnl_sock, &request, &reply);
4008 ofpbuf_uninit(&request);
/* Parse the attributes that follow the Netlink header and the 'struct
 * ifinfomsg' in the reply; a reply that does not match the policy is
 * discarded. */
4013 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4014 rtnlgrp_link_policy,
4015 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4016 ofpbuf_delete(reply);
/* IFLA_STATS is optional in the policy, so its absence has to be checked
 * separately. */
4020 if (!attrs[IFLA_STATS]) {
4021 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4022 ofpbuf_delete(reply);
/* Convert the kernel's 'struct rtnl_link_stats' payload into '*stats'. */
4026 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4028 ofpbuf_delete(reply);
/* Fills '*stats' for the device named 'netdev_name' by reading the text file
 * /proc/net/dev line by line and parsing the line whose device name matches.
 *
 * Warnings are logged (rate-limited) when the file cannot be opened, when a
 * line fails to parse, and when no line for 'netdev_name' is found.
 * NOTE(review): the return statements and several scanf arguments are not
 * visible in this copy -- confirm return values against the full file. */
4034 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4036 static const char fn[] = "/proc/net/dev";
4041 stream = fopen(fn, "r");
4043 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4048 while (fgets(line, sizeof line, stream)) {
/* X64 is the scanf conversion for one uint64_t counter.  Each of the two
 * format rows below reads seven counters and then skips one unsigned field
 * ("%*u"). */
4051 #define X64 "%"SCNu64
4054 X64 X64 X64 X64 X64 X64 X64 "%*u"
4055 X64 X64 X64 X64 X64 X64 X64 "%*u",
4061 &stats->rx_fifo_errors,
4062 &stats->rx_frame_errors,
4068 &stats->tx_fifo_errors,
/* 15 successful conversions are required for a line to count as parsed. */
4070 &stats->tx_carrier_errors) != 15) {
4071 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4072 } else if (!strcmp(devname, netdev_name)) {
/* This is the requested device.  The counters below are not among the fields
 * scanned above, so they are set to UINT64_MAX (presumably the "not
 * available" marker for netdev_stats -- confirm). */
4073 stats->rx_length_errors = UINT64_MAX;
4074 stats->rx_over_errors = UINT64_MAX;
4075 stats->rx_crc_errors = UINT64_MAX;
4076 stats->rx_missed_errors = UINT64_MAX;
4077 stats->tx_aborted_errors = UINT64_MAX;
4078 stats->tx_heartbeat_errors = UINT64_MAX;
4079 stats->tx_window_errors = UINT64_MAX;
/* No line in the file matched 'netdev_name'. */
4085 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Reads the interface flags of 'netdev' with the SIOCGIFFLAGS ioctl and
 * stores them in '*flags'.  NOTE(review): the return statement is not visible
 * in this copy; presumably the ioctl's error code is returned -- confirm. */
4091 get_flags(const struct netdev *netdev, int *flags)
/* The request is issued by device name through netdev_linux_do_ioctl(). */
4096 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
/* Pass back whatever the request structure holds after the call. */
4098 *flags = ifr.ifr_flags;
/* Sets the interface flags of 'netdev' to 'flags' with the SIOCSIFFLAGS ioctl
 * and returns the result of netdev_linux_do_ioctl(). */
4103 set_flags(struct netdev *netdev, int flags)
/* Load the new flags into the request before issuing it by device name. */
4107 ifr.ifr_flags = flags;
4108 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
/* Looks up the interface index of the device named 'netdev_name' with the
 * SIOCGIFINDEX ioctl on 'af_inet_sock' and returns it on success.
 *
 * On failure a rate-limited warning is logged.  NOTE(review): the value
 * returned on failure is not visible in this copy -- confirm. */
4113 do_get_ifindex(const char *netdev_name)
/* Copy the name into the request; ovs_strzcpy() is given the size of the
 * destination field. */
4117 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4118 COVERAGE_INC(netdev_get_ifindex);
4119 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4120 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4121 netdev_name, strerror(errno));
4124 return ifr.ifr_ifindex;
/* Stores the interface index of 'netdev_' in '*ifindexp', using the value
 * cached in the underlying netdev_dev_linux when one is present.
 * NOTE(review): the return statements are not visible in this copy --
 * confirm return values against the full file. */
4128 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4130 struct netdev_dev_linux *netdev_dev =
4131 netdev_dev_linux_cast(netdev_get_dev(netdev_));
/* Only ask the kernel when the VALID_IFINDEX bit says nothing is cached. */
4133 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4134 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
/* NOTE(review): the lines between the lookup and the cache update are
 * missing from this copy; presumably they handle a failed lookup -- confirm.
 * What is visible records the result and marks the cache entry valid. */
4138 netdev_dev->cache_valid |= VALID_IFINDEX;
4139 netdev_dev->ifindex = ifindex;
/* Hand back the cached value. */
4141 *ifindexp = netdev_dev->ifindex;
/* Reads the hardware address of the device named 'netdev_name' with the
 * SIOCGIFHWADDR ioctl on 'af_inet_sock' and copies its first ETH_ADDR_LEN
 * bytes into 'ea'.
 *
 * An ioctl failure is logged with VLOG_ERR (not rate-limited).
 * NOTE(review): the return statements are not visible in this copy --
 * confirm return values against the full file. */
4146 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4151 memset(&ifr, 0, sizeof ifr);
4152 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4153 COVERAGE_INC(netdev_get_hwaddr);
4154 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4155 VLOG_ERR("ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4156 netdev_name, strerror(errno));
/* A family other than AF_UNSPEC or ARPHRD_ETHER only produces a warning; the
 * address bytes are still copied out below.  NOTE(review): this compares one
 * value against both an AF_* and an ARPHRD_* constant -- confirm that is
 * intended. */
4159 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4160 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4161 VLOG_WARN("%s device has unknown hardware address family %d",
4162 netdev_name, hwaddr_family);
4164 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware address of the device named 'netdev_name' to the
 * ETH_ADDR_LEN bytes in 'mac', tagged with address family 'hwaddr_family',
 * using the SIOCSIFHWADDR ioctl on 'af_inet_sock'.
 *
 * An ioctl failure is logged with VLOG_ERR (not rate-limited).
 * NOTE(review): the return statements are not visible in this copy --
 * confirm return values against the full file. */
4169 set_etheraddr(const char *netdev_name, int hwaddr_family,
4170 const uint8_t mac[ETH_ADDR_LEN])
/* Zero the request first so that the bytes of 'sa_data' beyond the address
 * are 0. */
4174 memset(&ifr, 0, sizeof ifr);
4175 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4176 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4177 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4178 COVERAGE_INC(netdev_set_hwaddr);
4179 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4180 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4181 netdev_name, strerror(errno));
/* Issues an ethtool request for the device named 'name' through the
 * SIOCETHTOOL ioctl on 'af_inet_sock', with 'ecmd' as the request/response
 * structure.  'cmd_name' is used only in the log message.
 *
 * A failure is logged (rate-limited) unless errno is EOPNOTSUPP.
 * NOTE(review): no use of 'cmd' and no return statement is visible in this
 * copy; presumably 'cmd' is stored into 'ecmd' on a missing line -- confirm. */
4188 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4189 int cmd, const char *cmd_name)
4193 memset(&ifr, 0, sizeof ifr);
4194 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
/* The ioctl receives the ethtool structure through the request's data
 * pointer. */
4195 ifr.ifr_data = (caddr_t) ecmd;
4198 COVERAGE_INC(netdev_ethtool);
4199 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4202 if (errno != EOPNOTSUPP) {
4203 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4204 "failed: %s", cmd_name, name, strerror(errno));
4206 /* The device doesn't support this operation. That's pretty
4207 * common, so there's no point in logging anything. */
/* Issues ioctl 'cmd' on 'af_inet_sock' for the device named 'name', using the
 * caller-supplied request '*ifr'.  'cmd_name' is used only in the log
 * message.
 *
 * The caller fills in any request fields other than the name; this function
 * overwrites ifr->ifr_name.  A failure is logged at debug level
 * (rate-limited).  NOTE(review): the return statements are not visible in
 * this copy -- confirm return values against the full file. */
4214 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4215 const char *cmd_name)
4217 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4218 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4219 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4227 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4228 int cmd, const char *cmd_name)
4233 ifr.ifr_addr.sa_family = AF_INET;
4234 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4236 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4237 *ip = sin->sin_addr;