2 * Copyright (c) 2009, 2010, 2011 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
21 #include <arpa/inet.h>
23 #include <linux/gen_stats.h>
24 #include <linux/if_tun.h>
26 #include <linux/types.h>
27 #include <linux/ethtool.h>
28 #include <linux/mii.h>
29 #include <linux/pkt_sched.h>
30 #include <linux/rtnetlink.h>
31 #include <linux/sockios.h>
32 #include <linux/version.h>
33 #include <sys/types.h>
34 #include <sys/ioctl.h>
35 #include <sys/socket.h>
36 #include <netpacket/packet.h>
37 #include <net/ethernet.h>
39 #include <linux/if_tunnel.h>
40 #include <net/if_arp.h>
41 #include <net/if_packet.h>
42 #include <net/route.h>
43 #include <netinet/in.h>
50 #include "dpif-linux.h"
51 #include "dynamic-string.h"
52 #include "fatal-signal.h"
55 #include "netdev-provider.h"
56 #include "netdev-vport.h"
58 #include "netlink-socket.h"
60 #include "openflow/openflow.h"
62 #include "poll-loop.h"
63 #include "rtnetlink.h"
64 #include "rtnetlink-link.h"
65 #include "socket-util.h"
70 VLOG_DEFINE_THIS_MODULE(netdev_linux);
72 COVERAGE_DEFINE(netdev_get_vlan_vid);
73 COVERAGE_DEFINE(netdev_set_policing);
74 COVERAGE_DEFINE(netdev_arp_lookup);
75 COVERAGE_DEFINE(netdev_get_ifindex);
76 COVERAGE_DEFINE(netdev_get_hwaddr);
77 COVERAGE_DEFINE(netdev_set_hwaddr);
78 COVERAGE_DEFINE(netdev_ethtool);
80 /* These were introduced in Linux 2.6.14, so they might be missing if we have
82 #ifndef ADVERTISED_Pause
83 #define ADVERTISED_Pause (1 << 13)
85 #ifndef ADVERTISED_Asym_Pause
86 #define ADVERTISED_Asym_Pause (1 << 14)
89 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
92 #define TC_RTAB_SIZE 1024
95 static struct rtnetlink_notifier netdev_linux_cache_notifier;
96 static int cache_notifier_refcount;
99 VALID_IFINDEX = 1 << 0,
100 VALID_ETHERADDR = 1 << 1,
104 VALID_CARRIER = 1 << 5,
105 VALID_IS_PSEUDO = 1 << 6, /* Represents is_internal and is_tap. */
106 VALID_POLICING = 1 << 7,
107 VALID_HAVE_VPORT_STATS = 1 << 8
115 /* Traffic control. */
117 /* An instance of a traffic control class. Always associated with a particular
120 * Each TC implementation subclasses this with whatever additional data it
123 const struct tc_ops *ops;
124 struct hmap queues; /* Contains "struct tc_queue"s.
125 * Read by generic TC layer.
126 * Written only by TC implementation. */
129 /* One traffic control queue.
131 * Each TC implementation subclasses this with whatever additional data it
134 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
135 unsigned int queue_id; /* OpenFlow queue ID. */
138 /* A particular kind of traffic control. Each implementation generally maps to
139 * one particular Linux qdisc class.
141 * The functions below return 0 if successful or a positive errno value on
142 * failure, except where otherwise noted. All of them must be provided, except
143 * where otherwise noted. */
145 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
146 * This is null for tc_ops_default and tc_ops_other, for which there are no
147 * appropriate values. */
148 const char *linux_name;
150 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
151 const char *ovs_name;
153 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
154 * queues. The queues are numbered 0 through n_queues - 1. */
155 unsigned int n_queues;
157 /* Called to install this TC class on 'netdev'. The implementation should
158 * make the Netlink calls required to set up 'netdev' with the right qdisc
159 * and configure it according to 'details'. The implementation may assume
160 * that the current qdisc is the default; that is, there is no need for it
161 * to delete the current qdisc before installing itself.
163 * The contents of 'details' should be documented as valid for 'ovs_name'
164 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
165 * (which is built as ovs-vswitchd.conf.db(8)).
167 * This function must return 0 if and only if it sets 'netdev->tc' to an
168 * initialized 'struct tc'.
170 * (This function is null for tc_ops_other, which cannot be installed. For
171 * other TC classes it should always be nonnull.) */
172 int (*tc_install)(struct netdev *netdev, const struct shash *details);
174 /* Called when the netdev code determines (through a Netlink query) that
175 * this TC class's qdisc is installed on 'netdev', but we didn't install
176 * it ourselves and so don't know any of the details.
178 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
179 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
180 * implementation should parse the other attributes of 'nlmsg' as
181 * necessary to determine its configuration. If necessary it should also
182 * use Netlink queries to determine the configuration of queues on
185 * This function must return 0 if and only if it sets 'netdev->tc' to an
186 * initialized 'struct tc'. */
187 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
189 /* Destroys the data structures allocated by the implementation as part of
190 * 'tc'. (This includes destroying 'tc->queues' by calling
193 * The implementation should not need to perform any Netlink calls. If
194 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
195 * (But it may not be desirable.)
197 * This function may be null if 'tc' is trivial. */
198 void (*tc_destroy)(struct tc *tc);
200 /* Retrieves details of 'netdev->tc' configuration into 'details'.
202 * The implementation should not need to perform any Netlink calls, because
203 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
204 * cached the configuration.
206 * The contents of 'details' should be documented as valid for 'ovs_name'
207 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
208 * (which is built as ovs-vswitchd.conf.db(8)).
210 * This function may be null if 'tc' is not configurable.
212 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
214 /* Reconfigures 'netdev->tc' according to 'details', performing any
215 * required Netlink calls to complete the reconfiguration.
217 * The contents of 'details' should be documented as valid for 'ovs_name'
218 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
219 * (which is built as ovs-vswitchd.conf.db(8)).
221 * This function may be null if 'tc' is not configurable.
223 int (*qdisc_set)(struct netdev *, const struct shash *details);
225 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
226 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
228 * The contents of 'details' should be documented as valid for 'ovs_name'
229 * in the "other_config" column in the "Queue" table in
230 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
232 * The implementation should not need to perform any Netlink calls, because
233 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
234 * cached the queue configuration.
236 * This function may be null if 'tc' does not have queues ('n_queues' is
238 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
239 struct shash *details);
241 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
242 * 'details', performing any required Netlink calls to complete the
243 * reconfiguration. The caller ensures that 'queue_id' is less than
246 * The contents of 'details' should be documented as valid for 'ovs_name'
247 * in the "other_config" column in the "Queue" table in
248 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
250 * This function may be null if 'tc' does not have queues or its queues are
251 * not configurable. */
252 int (*class_set)(struct netdev *, unsigned int queue_id,
253 const struct shash *details);
255 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
256 * tc_queue's within 'netdev->tc->queues'.
258 * This function may be null if 'tc' does not have queues or its queues
259 * cannot be deleted. */
260 int (*class_delete)(struct netdev *, struct tc_queue *queue);
262 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
263 * 'struct tc_queue's within 'netdev->tc->queues'.
265 * On success, initializes '*stats'.
267 * This function may be null if 'tc' does not have queues or if it cannot
268 * report queue statistics. */
269 int (*class_get_stats)(const struct netdev *netdev,
270 const struct tc_queue *queue,
271 struct netdev_queue_stats *stats);
273 /* Extracts queue stats from 'nlmsg', which is a response to a
274 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
276 * This function may be null if 'tc' does not have queues or if it cannot
277 * report queue statistics. */
278 int (*class_dump_stats)(const struct netdev *netdev,
279 const struct ofpbuf *nlmsg,
280 netdev_dump_queue_stats_cb *cb, void *aux);
284 tc_init(struct tc *tc, const struct tc_ops *ops)
287 hmap_init(&tc->queues);
291 tc_destroy(struct tc *tc)
293 hmap_destroy(&tc->queues);
296 static const struct tc_ops tc_ops_htb;
297 static const struct tc_ops tc_ops_hfsc;
298 static const struct tc_ops tc_ops_default;
299 static const struct tc_ops tc_ops_other;
301 static const struct tc_ops *tcs[] = {
302 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
303 &tc_ops_hfsc, /* Hierarchical fair service curve. */
304 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
305 &tc_ops_other, /* Some other qdisc. */
309 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
310 static unsigned int tc_get_major(unsigned int handle);
311 static unsigned int tc_get_minor(unsigned int handle);
313 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
314 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
315 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
317 static struct tcmsg *tc_make_request(const struct netdev *, int type,
318 unsigned int flags, struct ofpbuf *);
319 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
321 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
322 struct nlattr **options);
323 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
324 struct nlattr **options,
325 struct netdev_queue_stats *);
326 static int tc_query_class(const struct netdev *,
327 unsigned int handle, unsigned int parent,
328 struct ofpbuf **replyp);
329 static int tc_delete_class(const struct netdev *, unsigned int handle);
331 static int tc_del_qdisc(struct netdev *netdev);
332 static int tc_query_qdisc(const struct netdev *netdev);
334 static int tc_calc_cell_log(unsigned int mtu);
335 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
336 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
337 const struct tc_ratespec *rate);
338 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
340 struct netdev_dev_linux {
341 struct netdev_dev netdev_dev;
343 struct shash_node *shash_node;
344 unsigned int cache_valid;
346 /* The following are figured out "on demand" only. They are only valid
347 * when the corresponding VALID_* bit in 'cache_valid' is set. */
349 uint8_t etheraddr[ETH_ADDR_LEN];
350 struct in_addr address, netmask;
354 bool is_internal; /* Is this an openvswitch internal device? */
355 bool is_tap; /* Is this a tuntap device? */
356 uint32_t kbits_rate; /* Policing data. */
357 uint32_t kbits_burst;
358 bool have_vport_stats;
362 struct tap_state tap;
366 struct netdev_linux {
367 struct netdev netdev;
371 /* An AF_INET socket (used for ioctl operations). */
372 static int af_inet_sock = -1;
374 /* A Netlink routing socket that is not subscribed to any multicast groups. */
375 static struct nl_sock *rtnl_sock;
377 struct netdev_linux_notifier {
378 struct netdev_notifier notifier;
382 static struct shash netdev_linux_notifiers =
383 SHASH_INITIALIZER(&netdev_linux_notifiers);
384 static struct rtnetlink_notifier netdev_linux_poll_notifier;
386 /* This is set pretty low because we probably won't learn anything from the
387 * additional log messages. */
388 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
390 static int netdev_linux_init(void);
392 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
393 int cmd, const char *cmd_name);
394 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
395 const char *cmd_name);
396 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
397 int cmd, const char *cmd_name);
398 static int get_flags(const struct netdev *, int *flagsp);
399 static int set_flags(struct netdev *, int flags);
400 static int do_get_ifindex(const char *netdev_name);
401 static int get_ifindex(const struct netdev *, int *ifindexp);
402 static int do_set_addr(struct netdev *netdev,
403 int ioctl_nr, const char *ioctl_name,
404 struct in_addr addr);
405 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
406 static int set_etheraddr(const char *netdev_name, int hwaddr_family,
407 const uint8_t[ETH_ADDR_LEN]);
408 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
409 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
412 is_netdev_linux_class(const struct netdev_class *netdev_class)
414 return netdev_class->init == netdev_linux_init;
417 static struct netdev_dev_linux *
418 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
420 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
421 assert(is_netdev_linux_class(netdev_class));
423 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
426 static struct netdev_linux *
427 netdev_linux_cast(const struct netdev *netdev)
429 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
430 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
431 assert(is_netdev_linux_class(netdev_class));
433 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
437 netdev_linux_init(void)
439 static int status = -1;
441 /* Create AF_INET socket. */
442 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
443 status = af_inet_sock >= 0 ? 0 : errno;
445 VLOG_ERR("failed to create inet socket: %s", strerror(status));
448 /* Create rtnetlink socket. */
450 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
452 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
461 netdev_linux_run(void)
463 rtnetlink_link_notifier_run();
467 netdev_linux_wait(void)
469 rtnetlink_link_notifier_wait();
473 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
474 void *aux OVS_UNUSED)
476 struct netdev_dev_linux *dev;
478 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
480 const struct netdev_class *netdev_class =
481 netdev_dev_get_class(base_dev);
483 if (is_netdev_linux_class(netdev_class)) {
484 dev = netdev_dev_linux_cast(base_dev);
485 dev->cache_valid = 0;
489 struct shash device_shash;
490 struct shash_node *node;
492 shash_init(&device_shash);
493 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
494 SHASH_FOR_EACH (node, &device_shash) {
496 dev->cache_valid = 0;
498 shash_destroy(&device_shash);
502 /* Creates system and internal devices. */
504 netdev_linux_create(const struct netdev_class *class,
505 const char *name, const struct shash *args,
506 struct netdev_dev **netdev_devp)
508 struct netdev_dev_linux *netdev_dev;
511 if (!shash_is_empty(args)) {
512 VLOG_WARN("%s: arguments for %s devices should be empty",
516 if (!cache_notifier_refcount) {
517 error = rtnetlink_link_notifier_register(&netdev_linux_cache_notifier,
518 netdev_linux_cache_cb, NULL);
523 cache_notifier_refcount++;
525 netdev_dev = xzalloc(sizeof *netdev_dev);
526 netdev_dev_init(&netdev_dev->netdev_dev, name, args, class);
528 *netdev_devp = &netdev_dev->netdev_dev;
532 /* For most types of netdevs we open the device for each call of
533 * netdev_open(). However, this is not the case with tap devices,
534 * since it is only possible to open the device once. In this
535 * situation we share a single file descriptor, and consequently
536 * buffers, across all readers. Therefore once data is read it will
537 * be unavailable to other reads for tap devices. */
539 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
540 const char *name, const struct shash *args,
541 struct netdev_dev **netdev_devp)
543 struct netdev_dev_linux *netdev_dev;
544 struct tap_state *state;
545 static const char tap_dev[] = "/dev/net/tun";
549 if (!shash_is_empty(args)) {
550 VLOG_WARN("%s: arguments for TAP devices should be empty", name);
553 netdev_dev = xzalloc(sizeof *netdev_dev);
554 state = &netdev_dev->state.tap;
556 /* Open tap device. */
557 state->fd = open(tap_dev, O_RDWR);
560 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
564 /* Create tap device. */
565 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
566 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
567 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
568 VLOG_WARN("%s: creating tap device failed: %s", name,
574 /* Make non-blocking. */
575 error = set_nonblocking(state->fd);
580 netdev_dev_init(&netdev_dev->netdev_dev, name, args, &netdev_tap_class);
581 *netdev_devp = &netdev_dev->netdev_dev;
590 destroy_tap(struct netdev_dev_linux *netdev_dev)
592 struct tap_state *state = &netdev_dev->state.tap;
594 if (state->fd >= 0) {
599 /* Destroys the netdev device 'netdev_dev_'. */
601 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
603 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
604 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
606 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
607 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
610 if (class == &netdev_linux_class || class == &netdev_internal_class) {
611 cache_notifier_refcount--;
613 if (!cache_notifier_refcount) {
614 rtnetlink_link_notifier_unregister(&netdev_linux_cache_notifier);
616 } else if (class == &netdev_tap_class) {
617 destroy_tap(netdev_dev);
626 netdev_linux_open(struct netdev_dev *netdev_dev_, int ethertype,
627 struct netdev **netdevp)
629 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
630 struct netdev_linux *netdev;
631 enum netdev_flags flags;
634 /* Allocate network device. */
635 netdev = xzalloc(sizeof *netdev);
637 netdev_init(&netdev->netdev, netdev_dev_);
639 /* Verify that the device really exists, by attempting to read its flags.
640 * (The flags might be cached, in which case this won't actually do an
643 * Don't do this for "internal" netdevs, though, because those have to be
644 * created as netdev objects before they exist in the kernel, because
645 * creating them in the kernel happens by passing a netdev object to
646 * dpif_port_add(). */
647 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
648 error = netdev_get_flags(&netdev->netdev, &flags);
649 if (error == ENODEV) {
654 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
655 !netdev_dev->state.tap.opened) {
657 /* We assume that the first user of the tap device is the primary user
658 * and give them the tap FD. Subsequent users probably just expect
659 * this to be a system device so open it normally to avoid send/receive
660 * directions appearing to be reversed. */
661 netdev->fd = netdev_dev->state.tap.fd;
662 netdev_dev->state.tap.opened = true;
663 } else if (ethertype != NETDEV_ETH_TYPE_NONE) {
664 struct sockaddr_ll sll;
668 /* Create file descriptor. */
669 protocol = (ethertype == NETDEV_ETH_TYPE_ANY ? ETH_P_ALL
670 : ethertype == NETDEV_ETH_TYPE_802_2 ? ETH_P_802_2
672 netdev->fd = socket(PF_PACKET, SOCK_RAW, htons(protocol));
673 if (netdev->fd < 0) {
678 /* Set non-blocking mode. */
679 error = set_nonblocking(netdev->fd);
684 /* Get ethernet device index. */
685 error = get_ifindex(&netdev->netdev, &ifindex);
690 /* Bind to specific ethernet device. */
691 memset(&sll, 0, sizeof sll);
692 sll.sll_family = AF_PACKET;
693 sll.sll_ifindex = ifindex;
695 (struct sockaddr *) &sll, sizeof sll) < 0) {
697 VLOG_ERR("bind to %s failed: %s", netdev_dev_get_name(netdev_dev_),
702 /* Between the socket() and bind() calls above, the socket receives all
703 * packets of the requested type on all system interfaces. We do not
704 * want to receive that data, but there is no way to avoid it. So we
705 * must now drain out the receive queue. */
706 error = drain_rcvbuf(netdev->fd);
712 *netdevp = &netdev->netdev;
716 netdev_uninit(&netdev->netdev, true);
720 /* Closes and destroys 'netdev'. */
722 netdev_linux_close(struct netdev *netdev_)
724 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* Descriptor 0 is valid; "no descriptor" is a negative fd, as in the other
 * fd checks in this file.  Devices of type "tap" are excluded here. */
726 if (netdev->fd >= 0 && strcmp(netdev_get_type(netdev_), "tap")) {
732 /* Initializes 'svec' with a list of the names of all known network devices. */
734 netdev_linux_enumerate(struct svec *svec)
736 struct if_nameindex *names;
738 names = if_nameindex();
742 for (i = 0; names[i].if_name != NULL; i++) {
743 svec_add(svec, names[i].if_name);
745 if_freenameindex(names);
748 VLOG_WARN("could not obtain list of network device names: %s",
755 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
757 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
759 if (netdev->fd < 0) {
760 /* Device was opened with NETDEV_ETH_TYPE_NONE. */
765 ssize_t retval = read(netdev->fd, data, size);
768 } else if (errno != EINTR) {
769 if (errno != EAGAIN) {
/* Argument order follows the format string: device name first, then the
 * error text (same order as the warning in netdev_linux_send()). */
770 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
771 netdev_get_name(netdev_), strerror(errno));
778 /* Registers with the poll loop to wake up from the next call to poll_block()
779 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
781 netdev_linux_recv_wait(struct netdev *netdev_)
783 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
784 if (netdev->fd >= 0) {
785 poll_fd_wait(netdev->fd, POLLIN);
789 /* Discards all packets waiting to be received from 'netdev'. */
791 netdev_linux_drain(struct netdev *netdev_)
793 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
794 if (netdev->fd < 0) {
796 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
798 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
799 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
803 drain_fd(netdev->fd, ifr.ifr_qlen);
806 return drain_rcvbuf(netdev->fd);
810 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
811 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
812 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
813 * the packet is too big or too small to transmit on the device.
815 * The caller retains ownership of 'buffer' in all cases.
817 * The kernel maintains a packet transmission queue, so the caller is not
818 * expected to do additional queuing of packets. */
820 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
822 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
824 /* XXX should support sending even if 'ethertype' was NETDEV_ETH_TYPE_NONE.
826 if (netdev->fd < 0) {
831 ssize_t retval = write(netdev->fd, data, size);
833 /* The Linux AF_PACKET implementation never blocks waiting for room
834 * for packets, instead returning ENOBUFS. Translate this into
835 * EAGAIN for the caller. */
836 if (errno == ENOBUFS) {
838 } else if (errno == EINTR) {
840 } else if (errno != EAGAIN) {
841 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
842 netdev_get_name(netdev_), strerror(errno));
845 } else if (retval != size) {
846 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
847 "%zu) on %s", retval, size, netdev_get_name(netdev_));
855 /* Registers with the poll loop to wake up from the next call to poll_block()
856 * when the packet transmission queue has sufficient room to transmit a packet
857 * with netdev_send().
859 * The kernel maintains a packet transmission queue, so the client is not
860 * expected to do additional queuing of packets. Thus, this function is
861 * unlikely to ever be used. It is included for completeness. */
863 netdev_linux_send_wait(struct netdev *netdev_)
865 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
866 if (netdev->fd < 0) {
868 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
869 poll_fd_wait(netdev->fd, POLLOUT);
871 /* TAP device always accepts packets.*/
872 poll_immediate_wake();
876 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
877 * otherwise a positive errno value. */
879 netdev_linux_set_etheraddr(struct netdev *netdev_,
880 const uint8_t mac[ETH_ADDR_LEN])
882 struct netdev_dev_linux *netdev_dev =
883 netdev_dev_linux_cast(netdev_get_dev(netdev_));
886 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
887 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
888 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
890 netdev_dev->cache_valid |= VALID_ETHERADDR;
891 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
899 /* Copies 'netdev''s MAC address into 'mac', a caller-provided buffer of
900 * ETH_ADDR_LEN bytes. */
902 netdev_linux_get_etheraddr(const struct netdev *netdev_,
903 uint8_t mac[ETH_ADDR_LEN])
905 struct netdev_dev_linux *netdev_dev =
906 netdev_dev_linux_cast(netdev_get_dev(netdev_));
907 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
908 int error = get_etheraddr(netdev_get_name(netdev_),
909 netdev_dev->etheraddr);
913 netdev_dev->cache_valid |= VALID_ETHERADDR;
915 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
919 /* Stores in '*mtup' the maximum size of transmitted (and received) packets on
920 * 'netdev', in bytes, not including the hardware header; thus, this is
921 * typically 1500 bytes for Ethernet devices. */
923 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
925 struct netdev_dev_linux *netdev_dev =
926 netdev_dev_linux_cast(netdev_get_dev(netdev_));
927 if (!(netdev_dev->cache_valid & VALID_MTU)) {
931 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
932 SIOCGIFMTU, "SIOCGIFMTU");
936 netdev_dev->mtu = ifr.ifr_mtu;
937 netdev_dev->cache_valid |= VALID_MTU;
939 *mtup = netdev_dev->mtu;
943 /* Returns the ifindex of 'netdev', if successful, as a positive number.
944 * On failure, returns a negative errno value. */
946 netdev_linux_get_ifindex(const struct netdev *netdev)
950 error = get_ifindex(netdev, &ifindex);
951 return error ? -error : ifindex;
955 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
957 struct netdev_dev_linux *netdev_dev =
958 netdev_dev_linux_cast(netdev_get_dev(netdev_));
963 if (!(netdev_dev->cache_valid & VALID_CARRIER)) {
967 fn = xasprintf("/sys/class/net/%s/carrier",
968 netdev_get_name(netdev_));
969 fd = open(fn, O_RDONLY);
972 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
976 retval = read(fd, line, sizeof line);
979 if (error == EINVAL) {
980 /* This is the normal return value when we try to check carrier
981 * if the network device is not up. */
983 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
986 } else if (retval == 0) {
988 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
992 if (line[0] != '0' && line[0] != '1') {
994 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)",
998 netdev_dev->carrier = line[0] != '0';
999 netdev_dev->cache_valid |= VALID_CARRIER;
1001 *carrier = netdev_dev->carrier;
1013 netdev_linux_do_miimon(const struct netdev *netdev, int cmd,
1014 const char *cmd_name, struct mii_ioctl_data *data)
1019 memset(&ifr, 0, sizeof ifr);
1020 memcpy(&ifr.ifr_data, data, sizeof *data);
1021 error = netdev_linux_do_ioctl(netdev_get_name(netdev),
1022 &ifr, cmd, cmd_name);
1023 memcpy(data, &ifr.ifr_data, sizeof *data);
1029 netdev_linux_get_miimon(const struct netdev *netdev, bool *miimon)
1031 const char *name = netdev_get_name(netdev);
1032 struct mii_ioctl_data data;
1037 memset(&data, 0, sizeof data);
1038 error = netdev_linux_do_miimon(netdev, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1040 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1041 data.reg_num = MII_BMSR;
1042 error = netdev_linux_do_miimon(netdev, SIOCGMIIREG, "SIOCGMIIREG",
1046 *miimon = !!(data.val_out & BMSR_LSTATUS);
1048 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1051 struct ethtool_cmd ecmd;
1053 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1056 memset(&ecmd, 0, sizeof ecmd);
1057 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1060 struct ethtool_value eval;
1062 memcpy(&eval, &ecmd, sizeof eval);
1063 *miimon = !!eval.data;
1065 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1072 /* Check whether we can use RTM_GETLINK to get network device statistics.
1073 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1076 check_for_working_netlink_stats(void)
1078 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1079 * preferable, so if that works, we'll use it. */
1080 int ifindex = do_get_ifindex("lo");
1082 VLOG_WARN("failed to get ifindex for lo, "
1083 "obtaining netdev stats from proc");
1086 struct netdev_stats stats;
1087 int error = get_stats_via_netlink(ifindex, &stats);
1089 VLOG_DBG("obtaining netdev stats via rtnetlink");
1092 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1093 "via proc (you are probably running a pre-2.6.19 "
1094 "kernel)", strerror(error));
1100 /* Brings the 'is_internal' and 'is_tap' members of 'netdev_dev' up-to-date. */
1102 netdev_linux_update_is_pseudo(struct netdev_dev_linux *netdev_dev)
1104 if (!(netdev_dev->cache_valid & VALID_IS_PSEUDO)) {
1105 const char *name = netdev_dev_get_name(&netdev_dev->netdev_dev);
1106 const char *type = netdev_dev_get_type(&netdev_dev->netdev_dev);
1108 netdev_dev->is_tap = !strcmp(type, "tap");
1109 netdev_dev->is_internal = (!netdev_dev->is_tap
1110 && dpif_linux_is_internal_device(name));
1111 netdev_dev->cache_valid |= VALID_IS_PSEUDO;
1116 swap_uint64(uint64_t *a, uint64_t *b)
1123 /* Retrieves current device stats for 'netdev'. */
/* The vport layer is asked first.  Whether it works for this device is
 * remembered in 'have_vport_stats' (valid once VALID_HAVE_VPORT_STATS is
 * set).  When it does not, the stats come from netlink (looked up by
 * ifindex) if check_for_working_netlink_stats() reported success, and
 * apparently from get_stats_via_proc() otherwise.
 *
 * For internal and tap devices whose stats did not come from the vport
 * layer, the rx/tx packet, byte, error and drop counters are exchanged and
 * the detailed rx and tx error counters are zeroed. */
1125 netdev_linux_get_stats(const struct netdev *netdev_,
1126 struct netdev_stats *stats)
1128 struct netdev_dev_linux *netdev_dev =
1129 netdev_dev_linux_cast(netdev_get_dev(netdev_));
/* Shared by all devices: negative until check_for_working_netlink_stats()
 * has been consulted once. */
1130 static int use_netlink_stats = -1;
/* Ask the vport layer if it worked before or has never been tried. */
1133 if (netdev_dev->have_vport_stats ||
1134 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1136 error = netdev_vport_get_stats(netdev_, stats);
1137 netdev_dev->have_vport_stats = !error;
1138 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
1141 if (!netdev_dev->have_vport_stats) {
1142 if (use_netlink_stats < 0) {
1143 use_netlink_stats = check_for_working_netlink_stats();
1145 if (use_netlink_stats) {
1148 error = get_ifindex(netdev_, &ifindex);
1150 error = get_stats_via_netlink(ifindex, stats);
1153 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1157 /* If this port is an internal port then the transmit and receive stats
1158 * will appear to be swapped relative to the other ports since we are the
1159 * one sending the data, not a remote computer. For consistency, we swap
1160 * them back here. This does not apply if we are getting stats from the
1161 * vport layer because it always tracks stats from the perspective of the
1163 netdev_linux_update_is_pseudo(netdev_dev);
1164 if (!error && !netdev_dev->have_vport_stats &&
1165 (netdev_dev->is_internal || netdev_dev->is_tap)) {
1166 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1167 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1168 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1169 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1170 stats->rx_length_errors = 0;
1171 stats->rx_over_errors = 0;
1172 stats->rx_crc_errors = 0;
1173 stats->rx_frame_errors = 0;
1174 stats->rx_fifo_errors = 0;
1175 stats->rx_missed_errors = 0;
1176 stats->tx_aborted_errors = 0;
1177 stats->tx_carrier_errors = 0;
1178 stats->tx_fifo_errors = 0;
1179 stats->tx_heartbeat_errors = 0;
1180 stats->tx_window_errors = 0;
1186 /* Stores the features supported by 'netdev' into each of '*current',
1187 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1188 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
1189 * successful, otherwise a positive errno value. */
/* The link settings are read with a single ETHTOOL_GSET request and each
 * SUPPORTED_* / ADVERTISED_* bit is translated to its OFPPF_* counterpart.
 *
 * NOTE(review): the assignments below dereference 'supported', 'advertised',
 * 'current' and 'peer' unconditionally; presumably the caller always passes
 * non-null pointers -- confirm. */
1191 netdev_linux_get_features(const struct netdev *netdev,
1192 uint32_t *current, uint32_t *advertised,
1193 uint32_t *supported, uint32_t *peer)
1195 struct ethtool_cmd ecmd;
1198 memset(&ecmd, 0, sizeof ecmd);
1199 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1200 ETHTOOL_GSET, "ETHTOOL_GSET");
1205 /* Supported features. */
1207 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1208 *supported |= OFPPF_10MB_HD;
1210 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1211 *supported |= OFPPF_10MB_FD;
1213 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1214 *supported |= OFPPF_100MB_HD;
1216 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1217 *supported |= OFPPF_100MB_FD;
1219 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1220 *supported |= OFPPF_1GB_HD;
1222 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1223 *supported |= OFPPF_1GB_FD;
1225 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1226 *supported |= OFPPF_10GB_FD;
1228 if (ecmd.supported & SUPPORTED_TP) {
1229 *supported |= OFPPF_COPPER;
1231 if (ecmd.supported & SUPPORTED_FIBRE) {
1232 *supported |= OFPPF_FIBER;
1234 if (ecmd.supported & SUPPORTED_Autoneg) {
1235 *supported |= OFPPF_AUTONEG;
1237 if (ecmd.supported & SUPPORTED_Pause) {
1238 *supported |= OFPPF_PAUSE;
1240 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1241 *supported |= OFPPF_PAUSE_ASYM;
1244 /* Advertised features. */
1246 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1247 *advertised |= OFPPF_10MB_HD;
1249 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1250 *advertised |= OFPPF_10MB_FD;
1252 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1253 *advertised |= OFPPF_100MB_HD;
1255 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1256 *advertised |= OFPPF_100MB_FD;
1258 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1259 *advertised |= OFPPF_1GB_HD;
1261 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1262 *advertised |= OFPPF_1GB_FD;
1264 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1265 *advertised |= OFPPF_10GB_FD;
1267 if (ecmd.advertising & ADVERTISED_TP) {
1268 *advertised |= OFPPF_COPPER;
1270 if (ecmd.advertising & ADVERTISED_FIBRE) {
1271 *advertised |= OFPPF_FIBER;
1273 if (ecmd.advertising & ADVERTISED_Autoneg) {
1274 *advertised |= OFPPF_AUTONEG;
1276 if (ecmd.advertising & ADVERTISED_Pause) {
1277 *advertised |= OFPPF_PAUSE;
1279 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1280 *advertised |= OFPPF_PAUSE_ASYM;
1283 /* Current settings. */
/* Only 10, 100, 1000 and 10000 Mb/s are mapped; 'ecmd.duplex' selects the
 * full- or half-duplex bit, except that 10 Gb/s is always reported as full
 * duplex. */
1284 if (ecmd.speed == SPEED_10) {
1285 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1286 } else if (ecmd.speed == SPEED_100) {
1287 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1288 } else if (ecmd.speed == SPEED_1000) {
1289 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1290 } else if (ecmd.speed == SPEED_10000) {
1291 *current = OFPPF_10GB_FD;
1296 if (ecmd.port == PORT_TP) {
1297 *current |= OFPPF_COPPER;
1298 } else if (ecmd.port == PORT_FIBRE) {
1299 *current |= OFPPF_FIBER;
1303 *current |= OFPPF_AUTONEG;
1306 /* Peer advertisements. */
1307 *peer = 0; /* XXX */
1312 /* Set the features advertised by 'netdev' to 'advertise'. */
/* 'advertise' is a bitmap of OFPPF_* bits.  The current settings are read
 * with ETHTOOL_GSET, the 'advertising' mask is rebuilt from scratch out of
 * the ADVERTISED_* equivalents of the bits set in 'advertise', and the
 * result is written back with ETHTOOL_SSET, whose status is returned. */
1314 netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1316 struct ethtool_cmd ecmd;
1319 memset(&ecmd, 0, sizeof ecmd);
1320 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1321 ETHTOOL_GSET, "ETHTOOL_GSET");
1326 ecmd.advertising = 0;
1327 if (advertise & OFPPF_10MB_HD) {
1328 ecmd.advertising |= ADVERTISED_10baseT_Half;
1330 if (advertise & OFPPF_10MB_FD) {
1331 ecmd.advertising |= ADVERTISED_10baseT_Full;
1333 if (advertise & OFPPF_100MB_HD) {
1334 ecmd.advertising |= ADVERTISED_100baseT_Half;
1336 if (advertise & OFPPF_100MB_FD) {
1337 ecmd.advertising |= ADVERTISED_100baseT_Full;
1339 if (advertise & OFPPF_1GB_HD) {
1340 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1342 if (advertise & OFPPF_1GB_FD) {
1343 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1345 if (advertise & OFPPF_10GB_FD) {
1346 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1348 if (advertise & OFPPF_COPPER) {
1349 ecmd.advertising |= ADVERTISED_TP;
1351 if (advertise & OFPPF_FIBER) {
1352 ecmd.advertising |= ADVERTISED_FIBRE;
1354 if (advertise & OFPPF_AUTONEG) {
1355 ecmd.advertising |= ADVERTISED_Autoneg;
1357 if (advertise & OFPPF_PAUSE) {
1358 ecmd.advertising |= ADVERTISED_Pause;
1360 if (advertise & OFPPF_PAUSE_ASYM) {
1361 ecmd.advertising |= ADVERTISED_Asym_Pause;
1363 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1364 ETHTOOL_SSET, "ETHTOOL_SSET");
1367 /* If 'netdev_name' is the name of a VLAN network device (e.g. one created with
1368 * vconfig(8)), sets '*vlan_vid' to the VLAN VID associated with that device
1369 * and returns 0. Otherwise returns an errno value (specifically ENOENT if
1370 * 'netdev_name' is the name of a network device that is not a VLAN device) and
1371 * sets '*vlan_vid' to -1. */
/* The VID is parsed from the first line of /proc/net/vlan/<name>. */
1373 netdev_linux_get_vlan_vid(const struct netdev *netdev, int *vlan_vid)
1375 const char *netdev_name = netdev_get_name(netdev);
1376 struct ds line = DS_EMPTY_INITIALIZER;
1377 FILE *stream = NULL;
1381 COVERAGE_INC(netdev_get_vlan_vid);
1382 fn = xasprintf("/proc/net/vlan/%s", netdev_name);
1383 stream = fopen(fn, "r");
1389 if (ds_get_line(&line, stream)) {
1390 if (ferror(stream)) {
1392 VLOG_ERR_RL(&rl, "error reading \"%s\": %s", fn, strerror(errno));
1395 VLOG_ERR_RL(&rl, "unexpected end of file reading \"%s\"", fn);
/* sscanf() returns EOF (a negative value), not 0, when the input ends
 * before the first conversion, e.g. for an empty line.  Compare against the
 * number of expected assignments (the suppressed "%*s" is not counted) so
 * that case is reported as a parse error instead of leaving '*vlan_vid'
 * unset. */
1400 if (sscanf(ds_cstr(&line), "%*s VID: %d", vlan_vid) != 1) {
1402 VLOG_ERR_RL(&rl, "parse error reading \"%s\" line 1: \"%s\"",
1403 fn, ds_cstr(&line));
/* Shell command templates for ingress policing.
 *
 * POLICE_ADD_CMD takes the device name (%s) and adds an ingress qdisc with
 * handle ffff:.
 *
 * POLICE_CONFIG_CMD takes the device name (%s) followed by the rate and the
 * burst, both uint32_t values in kbit.  They are formatted with PRIu32
 * rather than %d so that values above INT_MAX are not printed as negative
 * numbers. */
#define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress"
#define POLICE_CONFIG_CMD "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 u32 match ip src 0.0.0.0/0 police rate %"PRIu32"kbit burst %"PRIu32"k mtu 65535 drop flowid :1"
/* Implementation notes for netdev_linux_remove_policing(): the request is an
 * RTM_DELQDISC for handle ffff:0 with parent TC_H_INGRESS and kind
 * "ingress".  ENOENT and EINVAL results are not logged -- presumably because
 * they mean that no ingress qdisc was installed (TODO confirm).  The cached
 * rate and burst are reset to 0 and VALID_POLICING is set. */
1424 /* Remove ingress policing from 'netdev'. Returns 0 if successful, otherwise a
1425 * positive errno value.
1427 * This function is equivalent to running
1428 * /sbin/tc qdisc del dev %s handle ffff: ingress
1429 * but it is much, much faster.
1432 netdev_linux_remove_policing(struct netdev *netdev)
1434 struct netdev_dev_linux *netdev_dev =
1435 netdev_dev_linux_cast(netdev_get_dev(netdev));
1436 const char *netdev_name = netdev_get_name(netdev);
1438 struct ofpbuf request;
1439 struct tcmsg *tcmsg;
1442 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
1446 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
1447 tcmsg->tcm_parent = TC_H_INGRESS;
1448 nl_msg_put_string(&request, TCA_KIND, "ingress");
1449 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
1451 error = tc_transact(&request, NULL);
1452 if (error && error != ENOENT && error != EINVAL) {
1453 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1454 netdev_name, strerror(error));
1458 netdev_dev->kbits_rate = 0;
1459 netdev_dev->kbits_burst = 0;
1460 netdev_dev->cache_valid |= VALID_POLICING;
1464 /* Attempts to set input rate limiting (policing) policy. */
/* 'kbits_rate' and 'kbits_burst' are in kbit/s and kbit.  A rate of 0 forces
 * the burst to 0; a nonzero rate with a burst of 0 gets a burst of 1000.
 *
 * If VALID_POLICING is set and the cached rate and burst equal the request,
 * the function assumes nothing needs to change.  Otherwise it removes any
 * existing policing and runs POLICE_ADD_CMD and POLICE_CONFIG_CMD through
 * system(), then caches the new values.
 *
 * NOTE(review): 'netdev_name' is interpolated into a shell command line;
 * confirm that device names reaching this point cannot contain shell
 * metacharacters. */
1466 netdev_linux_set_policing(struct netdev *netdev,
1467 uint32_t kbits_rate, uint32_t kbits_burst)
1469 struct netdev_dev_linux *netdev_dev =
1470 netdev_dev_linux_cast(netdev_get_dev(netdev));
1471 const char *netdev_name = netdev_get_name(netdev);
1474 COVERAGE_INC(netdev_set_policing);
1476 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1477 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1478 : kbits_burst); /* Stick with user-specified value. */
1480 if (netdev_dev->cache_valid & VALID_POLICING
1481 && netdev_dev->kbits_rate == kbits_rate
1482 && netdev_dev->kbits_burst == kbits_burst) {
1483 /* Assume that settings haven't changed since we last set them. */
1487 netdev_linux_remove_policing(netdev);
1489 snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
1490 if (system(command) != 0) {
1491 VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
1495 snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
1496 kbits_rate, kbits_burst);
1497 if (system(command) != 0) {
1498 VLOG_WARN_RL(&rl, "%s: problem configuring policing",
1503 netdev_dev->kbits_rate = kbits_rate;
1504 netdev_dev->kbits_burst = kbits_burst;
1505 netdev_dev->cache_valid |= VALID_POLICING;
/* Adds to 'types' the OVS name of every entry in the NULL-terminated 'tcs'
 * array that has a 'tc_install' callback and a nonempty 'ovs_name'.
 * 'netdev' is not used. */
1512 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1515 const struct tc_ops **opsp;
1517 for (opsp = tcs; *opsp != NULL; opsp++) {
1518 const struct tc_ops *ops = *opsp;
1519 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1520 svec_add(types, ops->ovs_name);
/* Scans the NULL-terminated 'tcs' array for an entry whose 'ovs_name' equals
 * 'name'.  Presumably returns that entry, or NULL when there is none --
 * callers such as netdev_linux_set_qos() test the result for null. */
1526 static const struct tc_ops *
1527 tc_lookup_ovs_name(const char *name)
1529 const struct tc_ops **opsp;
1531 for (opsp = tcs; *opsp != NULL; opsp++) {
1532 const struct tc_ops *ops = *opsp;
1533 if (!strcmp(name, ops->ovs_name)) {
/* Scans the NULL-terminated 'tcs' array for an entry whose 'linux_name'
 * equals 'name'.  Entries with a null 'linux_name' are skipped.  Presumably
 * returns the match, or NULL when there is none (TODO confirm). */
1540 static const struct tc_ops *
1541 tc_lookup_linux_name(const char *name)
1543 const struct tc_ops **opsp;
1545 for (opsp = tcs; *opsp != NULL; opsp++) {
1546 const struct tc_ops *ops = *opsp;
1547 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Searches the 'hash' bucket of the device's tc queue map for a queue whose
 * 'queue_id' equals 'queue_id'.  'hash' must be the hash under which the
 * queue was inserted; htb_update_queue__() uses hash_int(queue_id, 0). */
1554 static struct tc_queue *
1555 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1558 struct netdev_dev_linux *netdev_dev =
1559 netdev_dev_linux_cast(netdev_get_dev(netdev));
1560 struct tc_queue *queue;
1562 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1563 if (queue->queue_id == queue_id) {
/* Convenience wrapper around tc_find_queue__() that computes the hash as
 * hash_int(queue_id, 0). */
1570 static struct tc_queue *
1571 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1573 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Looks up the QoS type 'type' with tc_lookup_ovs_name() and stores that
 * entry's 'n_queues' into 'caps->n_queues'.  'netdev' is not used. */
1577 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1579 struct netdev_qos_capabilities *caps)
1581 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1585 caps->n_queues = ops->n_queues;
/* Refreshes the device's qdisc information with tc_query_qdisc(), stores the
 * OVS name of the current qdisc type in '*typep' and, if the type provides a
 * 'qdisc_get' callback, returns its result after it fills in 'details'. */
1590 netdev_linux_get_qos(const struct netdev *netdev,
1591 const char **typep, struct shash *details)
1593 struct netdev_dev_linux *netdev_dev =
1594 netdev_dev_linux_cast(netdev_get_dev(netdev));
1597 error = tc_query_qdisc(netdev);
1602 *typep = netdev_dev->tc->ops->ovs_name;
1603 return (netdev_dev->tc->ops->qdisc_get
1604 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
/* Switches 'netdev' to QoS type 'type', configured by 'details'.
 *
 * 'type' must name a tc_ops entry that has a 'tc_install' callback.  If the
 * device already uses that type, only its 'qdisc_set' callback (if any) is
 * invoked.  Otherwise the existing qdisc is deleted and the new one is
 * installed with 'tc_install'. */
1609 netdev_linux_set_qos(struct netdev *netdev,
1610 const char *type, const struct shash *details)
1612 struct netdev_dev_linux *netdev_dev =
1613 netdev_dev_linux_cast(netdev_get_dev(netdev));
1614 const struct tc_ops *new_ops;
1617 new_ops = tc_lookup_ovs_name(type);
1618 if (!new_ops || !new_ops->tc_install) {
1622 error = tc_query_qdisc(netdev);
1627 if (new_ops == netdev_dev->tc->ops) {
1628 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1630 /* Delete existing qdisc. */
1631 error = tc_del_qdisc(netdev);
/* tc_del_qdisc() is expected to have cleared 'netdev_dev->tc'. */
1635 assert(netdev_dev->tc == NULL);
1637 /* Install new qdisc. */
1638 error = new_ops->tc_install(netdev, details);
/* 'tc_install' must set 'netdev_dev->tc' exactly when it succeeds. */
1639 assert((error == 0) == (netdev_dev->tc != NULL));
/* Refreshes qdisc state with tc_query_qdisc(), looks up queue 'queue_id'
 * with tc_find_queue() and retrieves its configuration into 'details'
 * through the qdisc type's 'class_get' callback. */
1646 netdev_linux_get_queue(const struct netdev *netdev,
1647 unsigned int queue_id, struct shash *details)
1649 struct netdev_dev_linux *netdev_dev =
1650 netdev_dev_linux_cast(netdev_get_dev(netdev));
1653 error = tc_query_qdisc(netdev);
1657 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1659 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
/* Configures queue 'queue_id' on 'netdev' from 'details' by delegating to
 * the qdisc type's 'class_set' callback.  The request takes the error branch
 * when 'queue_id' is not below the type's 'n_queues' or the type has no
 * 'class_set' callback. */
1665 netdev_linux_set_queue(struct netdev *netdev,
1666 unsigned int queue_id, const struct shash *details)
1668 struct netdev_dev_linux *netdev_dev =
1669 netdev_dev_linux_cast(netdev_get_dev(netdev));
1672 error = tc_query_qdisc(netdev);
1675 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1676 || !netdev_dev->tc->ops->class_set) {
1680 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
/* Deletes queue 'queue_id' from 'netdev' through the qdisc type's
 * 'class_delete' callback, after refreshing qdisc state and looking the
 * queue up with tc_find_queue().  Types without a 'class_delete' callback
 * take a separate branch. */
1684 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1686 struct netdev_dev_linux *netdev_dev =
1687 netdev_dev_linux_cast(netdev_get_dev(netdev));
1690 error = tc_query_qdisc(netdev);
1693 } else if (!netdev_dev->tc->ops->class_delete) {
1696 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1698 ? netdev_dev->tc->ops->class_delete(netdev, queue)
/* Retrieves statistics for queue 'queue_id' of 'netdev' into 'stats' through
 * the qdisc type's 'class_get_stats' callback, after refreshing qdisc state
 * and looking the queue up with tc_find_queue().  Types without a
 * 'class_get_stats' callback take a separate branch. */
1704 netdev_linux_get_queue_stats(const struct netdev *netdev,
1705 unsigned int queue_id,
1706 struct netdev_queue_stats *stats)
1708 struct netdev_dev_linux *netdev_dev =
1709 netdev_dev_linux_cast(netdev_get_dev(netdev));
1712 error = tc_query_qdisc(netdev);
1715 } else if (!netdev_dev->tc->ops->class_get_stats) {
1718 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1720 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
/* Builds an RTM_GETTCLASS request for 'netdev' with 'tcm_parent' 0 and
 * starts a Netlink dump of it on 'rtnl_sock', initializing 'dump'.  The
 * request buffer is released once the dump has been started.  Callers treat
 * a false (zero) result as failure. */
1726 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1728 struct ofpbuf request;
1729 struct tcmsg *tcmsg;
1731 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1735 tcmsg->tcm_parent = 0;
1736 nl_dump_start(dump, rtnl_sock, &request);
1737 ofpbuf_uninit(&request);
/* Iterates over every queue in the device's tc queue map, obtains each
 * queue's configuration with the qdisc type's 'class_get' callback and
 * passes the queue id, the details and 'aux' to 'cb'.
 *
 * A single 'details' shash is reused: it is cleared before each queue and
 * destroyed after the loop, so 'cb' must not keep references into it. */
1742 netdev_linux_dump_queues(const struct netdev *netdev,
1743 netdev_dump_queues_cb *cb, void *aux)
1745 struct netdev_dev_linux *netdev_dev =
1746 netdev_dev_linux_cast(netdev_get_dev(netdev));
1747 struct tc_queue *queue;
1748 struct shash details;
1752 error = tc_query_qdisc(netdev);
1755 } else if (!netdev_dev->tc->ops->class_get) {
1760 shash_init(&details);
1761 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
1762 shash_clear(&details);
1764 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
1766 (*cb)(queue->queue_id, &details, aux);
1771 shash_destroy(&details);
/* Dumps the kernel's traffic classes for 'netdev' over Netlink and hands
 * each reply message, together with 'cb' and 'aux', to the qdisc type's
 * 'class_dump_stats' callback.
 *
 * An error from finishing the dump takes precedence over 'last_error' in
 * the return value. */
1777 netdev_linux_dump_queue_stats(const struct netdev *netdev,
1778 netdev_dump_queue_stats_cb *cb, void *aux)
1780 struct netdev_dev_linux *netdev_dev =
1781 netdev_dev_linux_cast(netdev_get_dev(netdev));
1782 struct nl_dump dump;
1787 error = tc_query_qdisc(netdev);
1790 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1795 if (!start_queue_dump(netdev, &dump)) {
1798 while (nl_dump_next(&dump, &msg)) {
1799 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1805 error = nl_dump_done(&dump);
1806 return error ? error : last_error;
/* Stores the IPv4 address and netmask of 'netdev_' in '*address' and
 * '*netmask'.
 *
 * The values are fetched with SIOCGIFADDR and SIOCGIFNETMASK when VALID_IN4
 * is not set and are served from the device cache afterwards.  Returns
 * EADDRNOTAVAIL when the address is INADDR_ANY, otherwise 0 on this path. */
1810 netdev_linux_get_in4(const struct netdev *netdev_,
1811 struct in_addr *address, struct in_addr *netmask)
1813 struct netdev_dev_linux *netdev_dev =
1814 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1816 if (!(netdev_dev->cache_valid & VALID_IN4)) {
1819 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
1820 SIOCGIFADDR, "SIOCGIFADDR");
1825 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
1826 SIOCGIFNETMASK, "SIOCGIFNETMASK");
1831 netdev_dev->cache_valid |= VALID_IN4;
1833 *address = netdev_dev->address;
1834 *netmask = netdev_dev->netmask;
1835 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
/* Assigns IPv4 'address' and 'netmask' to 'netdev_'.
 *
 * The address is set first with SIOCSIFADDR.  The cached address and netmask
 * are then updated and VALID_IN4 is set.  The netmask is written with
 * SIOCSIFNETMASK only when 'address' is not INADDR_ANY. */
1839 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
1840 struct in_addr netmask)
1842 struct netdev_dev_linux *netdev_dev =
1843 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1846 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
1848 netdev_dev->cache_valid |= VALID_IN4;
1849 netdev_dev->address = address;
1850 netdev_dev->netmask = netmask;
1851 if (address.s_addr != INADDR_ANY) {
1852 error = do_set_addr(netdev_, SIOCSIFNETMASK,
1853 "SIOCSIFNETMASK", netmask);
/* Parses 'line', one line of the kind that netdev_linux_get_in6() reads from
 * /proc/net/if_inet6.
 *
 * The format below reads 16 two-digit hexadecimal bytes into 'in6->s6_addr',
 * skips four hexadecimal fields and reads an interface name of at most 16
 * characters into 'ifname', which therefore needs room for 17 bytes. */
1860 parse_if_inet6_line(const char *line,
1861 struct in6_addr *in6, char ifname[16 + 1])
1863 uint8_t *s6 = in6->s6_addr;
1864 #define X8 "%2"SCNx8
1866 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
1867 "%*x %*x %*x %*x %16s\n",
1868 &s6[0], &s6[1], &s6[2], &s6[3],
1869 &s6[4], &s6[5], &s6[6], &s6[7],
1870 &s6[8], &s6[9], &s6[10], &s6[11],
1871 &s6[12], &s6[13], &s6[14], &s6[15],
1875 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
1876 * 'in6' is non-null) and returns true. Otherwise, returns false. */
/* The address is looked up by scanning /proc/net/if_inet6 for a line whose
 * interface name equals the device name.  The result (in6addr_any when
 * nothing matched) is cached under VALID_IN6.
 *
 * NOTE(review): '*in6' is assigned without a null check in the code below,
 * which does not match the "if 'in6' is non-null" wording above -- confirm
 * which one is intended. */
1878 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
1880 struct netdev_dev_linux *netdev_dev =
1881 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1882 if (!(netdev_dev->cache_valid & VALID_IN6)) {
1886 netdev_dev->in6 = in6addr_any;
1888 file = fopen("/proc/net/if_inet6", "r");
1890 const char *name = netdev_get_name(netdev_);
1891 while (fgets(line, sizeof line, file)) {
1892 struct in6_addr in6_tmp;
1893 char ifname[16 + 1];
1894 if (parse_if_inet6_line(line, &in6_tmp, ifname)
1895 && !strcmp(name, ifname))
1897 netdev_dev->in6 = in6_tmp;
1903 netdev_dev->cache_valid |= VALID_IN6;
1905 *in6 = netdev_dev->in6;
/* Fills '*sa' with an AF_INET socket address for 'addr'.
 *
 * A zeroed struct sockaddr_in is built with family AF_INET and address
 * 'addr', then copied over the zeroed '*sa'. */
1910 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
1912 struct sockaddr_in sin;
1913 memset(&sin, 0, sizeof sin);
1914 sin.sin_family = AF_INET;
1915 sin.sin_addr = addr;
1918 memset(sa, 0, sizeof *sa);
1919 memcpy(sa, &sin, sizeof sin);
/* Issues the address-setting ioctl 'ioctl_nr' on 'netdev' with 'addr' as the
 * IPv4 address in the request; see netdev_linux_set_in4() for the ioctls
 * used.  'ioctl_name' is the ioctl's name as a string.  Returns the result
 * of netdev_linux_do_ioctl(). */
1923 do_set_addr(struct netdev *netdev,
1924 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
1927 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
1928 make_in4_sockaddr(&ifr.ifr_addr, addr);
1930 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
1934 /* Adds 'router' as a default IP gateway. */
/* The route has destination and netmask 0.0.0.0 (INADDR_ANY), gateway
 * 'router' and flags RTF_UP | RTF_GATEWAY.  It is installed with the
 * SIOCADDRT ioctl on 'af_inet_sock'; a failure is logged as a warning.
 * 'netdev' is not used. */
1936 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
1938 struct in_addr any = { INADDR_ANY };
1942 memset(&rt, 0, sizeof rt);
1943 make_in4_sockaddr(&rt.rt_dst, any);
1944 make_in4_sockaddr(&rt.rt_gateway, router);
1945 make_in4_sockaddr(&rt.rt_genmask, any);
1946 rt.rt_flags = RTF_UP | RTF_GATEWAY;
1947 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
1949 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
/* Looks up the route to 'host' in /proc/net/route.
 *
 * '*netdev_name' starts out NULL.  For a route that is up and whose masked
 * destination matches 'host', '*next_hop' receives the gateway address (or 0
 * when the host is directly reachable) and '*netdev_name' receives a copy of
 * the interface name made with xstrdup().  Lines that cannot be parsed are
 * reported with a rate-limited warning. */
1955 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
1958 static const char fn[] = "/proc/net/route";
1963 *netdev_name = NULL;
1964 stream = fopen(fn, "r");
1965 if (stream == NULL) {
1966 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
1971 while (fgets(line, sizeof line, stream)) {
1974 uint32_t dest, gateway, mask;
1975 int refcnt, metric, mtu;
1976 unsigned int flags, use, window, irtt;
1979 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
1981 iface, &dest, &gateway, &flags, &refcnt,
1982 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
1984 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
1988 if (!(flags & RTF_UP)) {
1989 /* Skip routes that aren't up. */
1993 /* The output of 'dest', 'mask', and 'gateway' were given in
1994 * network byte order, so we don't need any endian
1995 * conversions here. */
1996 if ((dest & mask) == (host->s_addr & mask)) {
1998 /* The host is directly reachable. */
1999 next_hop->s_addr = 0;
2001 /* To reach the host, we must go through a gateway. */
2002 next_hop->s_addr = gateway;
2004 *netdev_name = xstrdup(iface);
/* Adds driver information for 'netdev' to 'sh' under the keys
 * "driver_name", "driver_version" and "firmware_version".
 *
 * The values are copies, made with xstrdup(), of the strings in the
 * ethtool_drvinfo filled in by netdev_linux_do_ethtool().  That function
 * takes a struct ethtool_cmd pointer, hence the cast. */
2016 netdev_linux_get_status(const struct netdev *netdev, struct shash *sh)
2018 struct ethtool_drvinfo drvinfo;
2021 memset(&drvinfo, 0, sizeof drvinfo);
2022 error = netdev_linux_do_ethtool(netdev_get_name(netdev),
2023 (struct ethtool_cmd *)&drvinfo,
2025 "ETHTOOL_GDRVINFO");
2027 shash_add(sh, "driver_name", xstrdup(drvinfo.driver));
2028 shash_add(sh, "driver_version", xstrdup(drvinfo.version));
2029 shash_add(sh, "firmware_version", xstrdup(drvinfo.fw_version));
2035 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2036 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2037 * returns 0. Otherwise, it returns a positive errno value; in particular,
2038 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
/* 'ip' is stored into 'sin_addr.s_addr' unchanged.  The lookup is a SIOCGARP
 * ioctl on 'af_inet_sock', restricted to the device by name.  Failures other
 * than ENXIO are logged with a rate-limited warning. */
2040 netdev_linux_arp_lookup(const struct netdev *netdev,
2041 uint32_t ip, uint8_t mac[ETH_ADDR_LEN])
2044 struct sockaddr_in sin;
2047 memset(&r, 0, sizeof r);
2048 memset(&sin, 0, sizeof sin);
2049 sin.sin_family = AF_INET;
2050 sin.sin_addr.s_addr = ip;
2052 memcpy(&r.arp_pa, &sin, sizeof sin);
2053 r.arp_ha.sa_family = ARPHRD_ETHER;
2055 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2056 COVERAGE_INC(netdev_arp_lookup);
2057 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2059 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2060 } else if (retval != ENXIO) {
2061 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2062 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
/* Translates the NETDEV_UP and NETDEV_PROMISC bits of 'nd' into interface
 * flag bits -- presumably IFF_UP and IFF_PROMISC, mirroring
 * iff_to_nd_flags() (TODO confirm). */
2068 nd_to_iff_flags(enum netdev_flags nd)
2071 if (nd & NETDEV_UP) {
2074 if (nd & NETDEV_PROMISC) {
/* Translates the interface flag word 'iff' into "enum netdev_flags" bits:
 * IFF_PROMISC maps to NETDEV_PROMISC.  Other mappings, if any, are
 * presumably the reverse of nd_to_iff_flags() (TODO confirm). */
2081 iff_to_nd_flags(int iff)
2083 enum netdev_flags nd = 0;
2087 if (iff & IFF_PROMISC) {
2088 nd |= NETDEV_PROMISC;
/* Clears the flags in 'off' and sets the flags in 'on' on 'netdev'.
 *
 * The current interface flags are read with get_flags() and reported,
 * translated to netdev flags, through '*old_flagsp'.  set_flags() is called
 * only when the resulting flag word differs from the current one. */
2094 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2095 enum netdev_flags on, enum netdev_flags *old_flagsp)
2097 int old_flags, new_flags;
2100 error = get_flags(netdev, &old_flags);
2102 *old_flagsp = iff_to_nd_flags(old_flags);
2103 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2104 if (new_flags != old_flags) {
2105 error = set_flags(netdev, new_flags);
/* Walks every netdev_linux_notifier on 'list' and takes a pointer to the
 * generic netdev_notifier embedded in each one.  Presumably each notifier's
 * callback is then invoked (TODO confirm). */
2112 poll_notify(struct list *list)
2114 struct netdev_linux_notifier *notifier;
2115 LIST_FOR_EACH (notifier, node, list) {
2116 struct netdev_notifier *n = &notifier->notifier;
/* rtnetlink link-change callback, registered by netdev_linux_poll_add().
 *
 * It either looks up one notifier list in 'netdev_linux_notifiers' or walks
 * every list in that table, calling poll_notify() on each.  Presumably the
 * single lookup is used when 'change' identifies a device (TODO confirm).
 * 'aux' is not used. */
2122 netdev_linux_poll_cb(const struct rtnetlink_link_change *change,
2123 void *aux OVS_UNUSED)
2126 struct list *list = shash_find_data(&netdev_linux_notifiers,
2132 struct shash_node *node;
2133 SHASH_FOR_EACH (node, &netdev_linux_notifiers) {
2134 poll_notify(node->data);
/* Registers 'cb' (with 'aux') to be called when 'netdev' changes and stores
 * the new notifier in '*notifierp'.
 *
 * Notifiers are kept in per-device lists in the 'netdev_linux_notifiers'
 * table, keyed by device name.  While the table is empty the rtnetlink link
 * notifier is registered first, so it is registered once for all devices.
 * A device's list is allocated the first time a notifier is added for it. */
2140 netdev_linux_poll_add(struct netdev *netdev,
2141 void (*cb)(struct netdev_notifier *), void *aux,
2142 struct netdev_notifier **notifierp)
2144 const char *netdev_name = netdev_get_name(netdev);
2145 struct netdev_linux_notifier *notifier;
2148 if (shash_is_empty(&netdev_linux_notifiers)) {
2150 error = rtnetlink_link_notifier_register(&netdev_linux_poll_notifier,
2151 netdev_linux_poll_cb, NULL);
2157 list = shash_find_data(&netdev_linux_notifiers, netdev_name);
2159 list = xmalloc(sizeof *list);
2161 shash_add(&netdev_linux_notifiers, netdev_name, list);
2164 notifier = xmalloc(sizeof *notifier);
2165 netdev_notifier_init(&notifier->notifier, netdev, cb, aux);
2166 list_push_back(list, &notifier->node);
2167 *notifierp = &notifier->notifier;
/* Unregisters 'notifier_', which must have been created by
 * netdev_linux_poll_add().
 *
 * The enclosing netdev_linux_notifier is recovered with CONTAINER_OF and
 * unlinked from its per-device list.  An emptied list is removed from the
 * 'netdev_linux_notifiers' table, and once the table itself is empty the
 * rtnetlink link notifier is unregistered. */
2172 netdev_linux_poll_remove(struct netdev_notifier *notifier_)
2174 struct netdev_linux_notifier *notifier =
2175 CONTAINER_OF(notifier_, struct netdev_linux_notifier, notifier);
2178 /* Remove 'notifier' from its list. */
2179 list = list_remove(&notifier->node);
2180 if (list_is_empty(list)) {
2181 /* The list is now empty. Remove it from the hash and free it. */
2182 const char *netdev_name = netdev_get_name(notifier->notifier.netdev);
2183 shash_delete(&netdev_linux_notifiers,
2184 shash_find(&netdev_linux_notifiers, netdev_name));
2189 /* If that was the last notifier, unregister. */
2190 if (shash_is_empty(&netdev_linux_notifiers)) {
2191 rtnetlink_link_notifier_unregister(&netdev_linux_poll_notifier);
/* Expands to an initializer for a "struct netdev_class" whose callbacks are
 * the netdev_linux_*() functions listed below.  NAME, CREATE, ENUMERATE and
 * SET_STATS are the per-class parts; see netdev_linux_class,
 * netdev_tap_class and netdev_internal_class for the instantiations. */
2195 #define NETDEV_LINUX_CLASS(NAME, CREATE, ENUMERATE, SET_STATS) \
2199 netdev_linux_init, \
2201 netdev_linux_wait, \
2204 netdev_linux_destroy, \
2205 NULL, /* set_config */ \
2207 netdev_linux_open, \
2208 netdev_linux_close, \
2212 netdev_linux_recv, \
2213 netdev_linux_recv_wait, \
2214 netdev_linux_drain, \
2216 netdev_linux_send, \
2217 netdev_linux_send_wait, \
2219 netdev_linux_set_etheraddr, \
2220 netdev_linux_get_etheraddr, \
2221 netdev_linux_get_mtu, \
2222 netdev_linux_get_ifindex, \
2223 netdev_linux_get_carrier, \
2224 netdev_linux_get_miimon, \
2225 netdev_linux_get_stats, \
2228 netdev_linux_get_features, \
2229 netdev_linux_set_advertisements, \
2230 netdev_linux_get_vlan_vid, \
2232 netdev_linux_set_policing, \
2233 netdev_linux_get_qos_types, \
2234 netdev_linux_get_qos_capabilities, \
2235 netdev_linux_get_qos, \
2236 netdev_linux_set_qos, \
2237 netdev_linux_get_queue, \
2238 netdev_linux_set_queue, \
2239 netdev_linux_delete_queue, \
2240 netdev_linux_get_queue_stats, \
2241 netdev_linux_dump_queues, \
2242 netdev_linux_dump_queue_stats, \
2244 netdev_linux_get_in4, \
2245 netdev_linux_set_in4, \
2246 netdev_linux_get_in6, \
2247 netdev_linux_add_router, \
2248 netdev_linux_get_next_hop, \
2249 netdev_linux_get_status, \
2250 netdev_linux_arp_lookup, \
2252 netdev_linux_update_flags, \
2254 netdev_linux_poll_add, \
2255 netdev_linux_poll_remove \
/* Provider class that creates devices with netdev_linux_create(), supports
 * enumeration through netdev_linux_enumerate() and has no set_stats hook. */
2258 const struct netdev_class netdev_linux_class =
2261 netdev_linux_create,
2262 netdev_linux_enumerate,
2263 NULL); /* set_stats */
/* Provider class that creates devices with netdev_linux_create_tap(); it has
 * neither an enumerate nor a set_stats hook. */
2265 const struct netdev_class netdev_tap_class =
2268 netdev_linux_create_tap,
2269 NULL, /* enumerate */
2270 NULL); /* set_stats */
/* Provider class that creates devices with netdev_linux_create(), has no
 * enumerate hook and uses netdev_vport_set_stats() as its set_stats hook. */
2272 const struct netdev_class netdev_internal_class =
2275 netdev_linux_create,
2276 NULL, /* enumerate */
2277 netdev_vport_set_stats);
2279 /* HTB traffic control class. */
/* Upper bound on the HTB class minor numbers that htb_parse_tcmsg__()
 * accepts: minors 1 through HTB_N_QUEUES map to queue ids 0 through
 * HTB_N_QUEUES - 1. */
2281 #define HTB_N_QUEUES 0xf000
/* Presumably the 'max_rate' member of struct htb: htb_install__() stores its
 * 'max_rate' argument here, and htb_parse_class_details__() uses it as the
 * upper limit for class rates. */
2285 unsigned int max_rate; /* In bytes/s. */
/* Parameters of one HTB class (queue).  'tc_queue' is the embedded generic
 * queue from which htb_class_cast__() recovers the htb_class with
 * CONTAINER_OF. */
2289 struct tc_queue tc_queue;
2290 unsigned int min_rate; /* In bytes/s. */
2291 unsigned int max_rate; /* In bytes/s. */
2292 unsigned int burst; /* In bytes. */
2293 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the struct htb that contains the 'tc' pointed to by 'netdev''s
 * device.  The caller must know that the device's current qdisc is HTB,
 * since CONTAINER_OF performs no check. */
2297 htb_get__(const struct netdev *netdev)
2299 struct netdev_dev_linux *netdev_dev =
2300 netdev_dev_linux_cast(netdev_get_dev(netdev));
2301 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
/* Allocates a struct htb, initializes its 'tc' for 'tc_ops_htb', records
 * 'max_rate' and makes it the device's current traffic-control state.  This
 * only updates in-memory bookkeeping; no Netlink request is sent here.
 *
 * NOTE(review): 'max_rate' is a uint64_t stored into an unsigned int member,
 * so larger values would be truncated -- confirm callers stay in range. */
2305 htb_install__(struct netdev *netdev, uint64_t max_rate)
2307 struct netdev_dev_linux *netdev_dev =
2308 netdev_dev_linux_cast(netdev_get_dev(netdev));
2311 htb = xmalloc(sizeof *htb);
2312 tc_init(&htb->tc, &tc_ops_htb);
2313 htb->max_rate = max_rate;
2315 netdev_dev->tc = &htb->tc;
2320 /* Create an HTB qdisc.
2322 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
/* Any existing qdisc is deleted first; the result of that deletion is
 * ignored.  The new qdisc is requested with RTM_NEWQDISC (NLM_F_EXCL |
 * NLM_F_CREATE), handle 1:0, parent TC_H_ROOT and 'rate2quantum' 10.
 * Returns the result of tc_transact(). */
2324 htb_setup_qdisc__(struct netdev *netdev)
2327 struct tc_htb_glob opt;
2328 struct ofpbuf request;
2329 struct tcmsg *tcmsg;
2331 tc_del_qdisc(netdev);
2333 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2334 NLM_F_EXCL | NLM_F_CREATE, &request);
2338 tcmsg->tcm_handle = tc_make_handle(1, 0);
2339 tcmsg->tcm_parent = TC_H_ROOT;
2341 nl_msg_put_string(&request, TCA_KIND, "htb");
2343 memset(&opt, 0, sizeof opt);
2344 opt.rate2quantum = 10;
2348 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2349 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2350 nl_msg_end_nested(&request, opt_offset);
2352 return tc_transact(&request, NULL);
2355 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2356 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
/* 'handle' and 'parent' are tc handles as built by tc_make_handle().  The
 * rate, ceil, burst and priority come from 'class'.
 *
 * The device MTU is needed to fill in the rate tables and buffer sizes.  An
 * MTU of INT_MAX is treated as "no MTU" and reported with a warning.  A
 * failed transaction is logged with all class parameters. */
2358 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2359 unsigned int parent, struct htb_class *class)
2362 struct tc_htb_opt opt;
2363 struct ofpbuf request;
2364 struct tcmsg *tcmsg;
2368 netdev_get_mtu(netdev, &mtu);
2369 if (mtu == INT_MAX) {
2370 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2371 netdev_get_name(netdev));
2375 memset(&opt, 0, sizeof opt);
2376 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2377 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2378 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2379 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2380 opt.prio = class->priority;
2382 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2386 tcmsg->tcm_handle = handle;
2387 tcmsg->tcm_parent = parent;
2389 nl_msg_put_string(&request, TCA_KIND, "htb");
2390 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2391 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2392 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2393 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2394 nl_msg_end_nested(&request, opt_offset);
2396 error = tc_transact(&request, NULL);
2398 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2399 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2400 netdev_get_name(netdev),
2401 tc_get_major(handle), tc_get_minor(handle),
2402 tc_get_major(parent), tc_get_minor(parent),
2403 class->min_rate, class->max_rate,
2404 class->burst, class->priority, strerror(error));
/* NOTE(review): the comment that follows speaks of 'options' and 'details',
 * but the parameters are named 'nl_options' and 'class'.  The code requires
 * a TCA_HTB_PARMS attribute at least as large as struct tc_htb_opt and
 * stores its rate, ceil, burst (converted from ticks to bytes) and priority
 * into '*class'. */
2409 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2410 * description of them into 'details'. The description complies with the
2411 * specification given in the vswitch database documentation for linux-htb
2414 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2416 static const struct nl_policy tca_htb_policy[] = {
2417 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2418 .min_len = sizeof(struct tc_htb_opt) },
2421 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2422 const struct tc_htb_opt *htb;
2424 if (!nl_parse_nested(nl_options, tca_htb_policy,
2425 attrs, ARRAY_SIZE(tca_htb_policy))) {
2426 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2430 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2431 class->min_rate = htb->rate.rate;
2432 class->max_rate = htb->ceil.rate;
2433 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2434 class->priority = htb->prio;
/* Parses the traffic-class message 'tcmsg' with tc_parse_class(), which also
 * receives 'stats'.
 *
 * If 'queue_id' is nonnull, the class handle is translated to a queue id: a
 * handle with major 1 and a minor in the range 1...HTB_N_QUEUES maps to
 * queue id minor - 1.  If 'options' is nonnull, the class's HTB parameters
 * are parsed into it with htb_parse_tca_options__(). */
2439 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2440 struct htb_class *options,
2441 struct netdev_queue_stats *stats)
2443 struct nlattr *nl_options;
2444 unsigned int handle;
2447 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2448 if (!error && queue_id) {
2449 unsigned int major = tc_get_major(handle);
2450 unsigned int minor = tc_get_minor(handle);
2451 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2452 *queue_id = minor - 1;
2457 if (!error && options) {
2458 error = htb_parse_tca_options__(nl_options, options);
/* Fills in 'hc' with the qdisc-level HTB parameters found in 'details'.
 *
 * "max-rate" is given in bits/s and stored in bytes/s.  When it is missing
 * or zero, the maximum rate is derived from the device's current link
 * features with netdev_features_to_bps().  The minimum rate is set equal to
 * the maximum rate. */
2464 htb_parse_qdisc_details__(struct netdev *netdev,
2465 const struct shash *details, struct htb_class *hc)
2467 const char *max_rate_s;
2469 max_rate_s = shash_find_data(details, "max-rate");
2470 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2471 if (!hc->max_rate) {
2474 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2475 hc->max_rate = netdev_features_to_bps(current) / 8;
2477 hc->min_rate = hc->max_rate;
/* Fills in 'hc' with the per-class HTB parameters found in 'details' under
 * the keys "min-rate", "max-rate", "burst" and "priority".
 *
 * The rate and burst strings are divided by 8 after parsing, converting bit
 * units to byte units.  The minimum rate is raised to at least the MTU and
 * capped at the qdisc's maximum rate.  The maximum rate is raised to at
 * least the minimum rate and capped at the qdisc's maximum rate.  The burst
 * is raised to at least MTU + 64.  A missing "priority" means 0.
 *
 * An MTU of INT_MAX is treated as "no MTU" and reported with a warning. */
2483 htb_parse_class_details__(struct netdev *netdev,
2484 const struct shash *details, struct htb_class *hc)
2486 const struct htb *htb = htb_get__(netdev);
2487 const char *min_rate_s = shash_find_data(details, "min-rate");
2488 const char *max_rate_s = shash_find_data(details, "max-rate");
2489 const char *burst_s = shash_find_data(details, "burst");
2490 const char *priority_s = shash_find_data(details, "priority");
2493 netdev_get_mtu(netdev, &mtu);
2494 if (mtu == INT_MAX) {
2495 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2496 netdev_get_name(netdev));
2500 /* HTB requires at least an mtu sized min-rate to send any traffic even
2501 * on uncongested links. */
2502 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2503 hc->min_rate = MAX(hc->min_rate, mtu);
2504 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2507 hc->max_rate = (max_rate_s
2508 ? strtoull(max_rate_s, NULL, 10) / 8
2510 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2511 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2515 * According to hints in the documentation that I've read, it is important
2516 * that 'burst' be at least as big as the largest frame that might be
2517 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2518 * but having it a bit too small is a problem. Since netdev_get_mtu()
2519 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2520 * the MTU. We actually add 64, instead of 14, as a guard against
2521 * additional headers get tacked on somewhere that we're not aware of. */
2522 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2523 hc->burst = MAX(hc->burst, mtu + 64);
2526 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Asks the kernel, through tc_query_class(), about class 'handle' with parent
 * 'parent' on 'netdev', then hands the reply to htb_parse_tcmsg__() with a
 * NULL queue-ID pointer so that only '*options' and '*stats' are filled in.
 * The reply buffer is deleted afterward.
 *
 * Callers in this file pass NULL for 'stats' or for 'options' when they do
 * not need that output.  NOTE(review): the error check between the two calls
 * and the return statement are not visible in this excerpt. */
2532 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2533 unsigned int parent, struct htb_class *options,
2534 struct netdev_queue_stats *stats)
2536 struct ofpbuf *reply;
2539 error = tc_query_class(netdev, handle, parent, &reply);
2541 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2542 ofpbuf_delete(reply);
/* Installs an HTB qdisc on 'netdev' configured from 'details': sets up the
 * qdisc with htb_setup_qdisc__(), creates class 1:fffe under 1:0 from the
 * parsed qdisc details with htb_setup_class__(), and records the new qdisc,
 * with that class's max rate, through htb_install__().
 *
 * NOTE(review): the conditions that gate each step on the previous step's
 * 'error' are on lines not shown here. */
2548 htb_tc_install(struct netdev *netdev, const struct shash *details)
2552 error = htb_setup_qdisc__(netdev);
2554 struct htb_class hc;
2556 htb_parse_qdisc_details__(netdev, details, &hc);
2557 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2558 tc_make_handle(1, 0), &hc);
2560 htb_install__(netdev, hc.max_rate);
/* Converts 'queue' to the struct htb_class that contains it as its
 * 'tc_queue' member. */
2566 static struct htb_class *
2567 htb_class_cast__(const struct tc_queue *queue)
2569 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Stores the settings in '*hc' for queue 'queue_id' in the HTB state kept
 * for 'netdev'.  The queue is looked up with tc_find_queue__(); the visible
 * code either reuses the existing struct htb_class or allocates a new one and
 * inserts it into htb->tc.queues under hash_int(queue_id, 0).  Either way the
 * four fields min_rate, max_rate, burst and priority are then copied in.
 *
 * NOTE(review): the branch that chooses between reuse and allocation is on
 * lines not shown here; presumably it tests whether 'queue' is nonnull. */
2573 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2574 const struct htb_class *hc)
2576 struct htb *htb = htb_get__(netdev);
2577 size_t hash = hash_int(queue_id, 0);
2578 struct tc_queue *queue;
2579 struct htb_class *hcp;
2581 queue = tc_find_queue__(netdev, queue_id, hash);
2583 hcp = htb_class_cast__(queue);
2585 hcp = xmalloc(sizeof *hcp);
2586 queue = &hcp->tc_queue;
2587 queue->queue_id = queue_id;
2588 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2591 hcp->min_rate = hc->min_rate;
2592 hcp->max_rate = hc->max_rate;
2593 hcp->burst = hc->burst;
2594 hcp->priority = hc->priority;
/* Rebuilds the HTB state for 'netdev' from what the kernel reports; 'nlmsg'
 * is unused.  Class 1:fffe is queried for the qdisc-wide max rate, which is
 * passed to htb_install__().  The device's classes are then dumped, and each
 * message that htb_parse_tcmsg__() accepts (returns 0) is recorded with
 * htb_update_queue__().
 *
 * NOTE(review): the return value of the first query is ignored on the
 * visible line; confirm that hc.max_rate has a defined value if it fails.
 * The handling of a failed start_queue_dump() is also not shown here. */
2598 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2601 struct nl_dump dump;
2602 struct htb_class hc;
2605 /* Get qdisc options. */
2607 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2608 htb = htb_install__(netdev, hc.max_rate);
2611 if (!start_queue_dump(netdev, &dump)) {
2614 while (nl_dump_next(&dump, &msg)) {
2615 unsigned int queue_id;
2617 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2618 htb_update_queue__(netdev, queue_id, &hc);
2621 nl_dump_done(&dump);
/* Tears down the HTB state that embeds 'tc': every class is removed from
 * htb->tc.queues.  The _SAFE iterator is used because nodes are removed
 * while the map is being walked.
 *
 * NOTE(review): releasing the classes and the struct htb itself is
 * presumably done on lines not shown in this excerpt. */
2627 htb_tc_destroy(struct tc *tc)
2629 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2630 struct htb_class *hc, *next;
2632 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2633 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Reports the HTB qdisc configuration of 'netdev' by adding "max-rate" to
 * 'details' as a decimal string holding 8 times the stored max rate. */
2641 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2643 const struct htb *htb = htb_get__(netdev);
2644 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
/* Reconfigures the HTB qdisc on 'netdev' from 'details': the qdisc details
 * are parsed into a class description, class 1:fffe under 1:0 is set up
 * again with it, and the stored qdisc max rate is updated to match.
 *
 * NOTE(review): whether the stored rate is updated only when the class
 * setup succeeds depends on a condition not visible in this excerpt. */
2649 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2651 struct htb_class hc;
2654 htb_parse_qdisc_details__(netdev, details, &hc);
2655 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2656 tc_make_handle(1, 0), &hc);
2658 htb_get__(netdev)->max_rate = hc.max_rate;
/* Reports the configuration of HTB class 'queue' into 'details'.  Stored
 * rates and burst are multiplied by 8 and formatted as decimal strings.
 * "max-rate" is reported only when it differs from "min-rate".
 *
 * NOTE(review): lines are missing between the visible statements, so
 * "burst" and "priority" may be guarded by conditions not shown here. */
2664 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2665 const struct tc_queue *queue, struct shash *details)
2667 const struct htb_class *hc = htb_class_cast__(queue);
2669 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2670 if (hc->min_rate != hc->max_rate) {
2671 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2673 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2675 shash_add(details, "priority", xasprintf("%u", hc->priority));
/* Creates or reconfigures HTB queue 'queue_id' on 'netdev' from 'details'.
 * The details are parsed, the kernel class with handle 1:(queue_id + 1) and
 * parent 1:fffe is set up, and the in-memory record is updated with
 * htb_update_queue__().
 *
 * NOTE(review): the early exits taken when parsing or class setup fails are
 * on lines not shown in this excerpt. */
2681 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2682 const struct shash *details)
2684 struct htb_class hc;
2687 error = htb_parse_class_details__(netdev, details, &hc);
2692 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2693 tc_make_handle(1, 0xfffe), &hc);
2698 htb_update_queue__(netdev, queue_id, &hc);
/* Deletes HTB class 'queue' from 'netdev': asks the kernel to delete class
 * 1:(queue_id + 1) and removes the class from htb->tc.queues.
 *
 * NOTE(review): presumably the removal happens only when the kernel request
 * succeeds and the class is then released; those lines are not shown. */
2703 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2705 struct htb_class *hc = htb_class_cast__(queue);
2706 struct htb *htb = htb_get__(netdev);
2709 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2711 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Retrieves statistics for HTB class 'queue' on 'netdev' into '*stats' by
 * querying class 1:(queue_id + 1) under parent 1:fffe.  No class options are
 * requested (NULL is passed for them).  Returns whatever
 * htb_query_class__() returns. */
2718 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2719 struct netdev_queue_stats *stats)
2721 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2722 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one class message, 'nlmsg', from a class dump: its handle and
 * statistics are extracted with tc_parse_class().  If the handle has major
 * number 1 and a minor number in the range 1...HTB_N_QUEUES, 'cb' is invoked
 * with queue ID (minor - 1), the statistics, and 'aux'.  'netdev' is unused.
 *
 * NOTE(review): the handling of a tc_parse_class() failure and the return
 * value are on lines not shown in this excerpt. */
2726 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2727 const struct ofpbuf *nlmsg,
2728 netdev_dump_queue_stats_cb *cb, void *aux)
2730 struct netdev_queue_stats stats;
2731 unsigned int handle, major, minor;
2734 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2739 major = tc_get_major(handle);
2740 minor = tc_get_minor(handle);
2741 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2742 (*cb)(minor - 1, &stats, aux);
/* Operations table for the HTB qdisc: kernel qdisc name "htb", OVS QoS type
 * "linux-htb", HTB_N_QUEUES queues.  The table ends with the two statistics
 * callbacks; the entries in between are on lines not shown in this excerpt. */
2747 static const struct tc_ops tc_ops_htb = {
2748 "htb", /* linux_name */
2749 "linux-htb", /* ovs_name */
2750 HTB_N_QUEUES, /* n_queues */
2759 htb_class_get_stats,
2760 htb_class_dump_stats
2763 /* "linux-hfsc" traffic control class. */
/* Number of HFSC queues.  Class minor numbers 1...HFSC_N_QUEUES are accepted
 * by the parsing code in this file and map to queue IDs (minor - 1). */
2765 #define HFSC_N_QUEUES 0xf000
2773 struct tc_queue tc_queue;
/* Returns the struct hfsc that embeds the 'tc' currently attached to the
 * Linux device underlying 'netdev'.  No check is made here that the
 * attached 'tc' really is an HFSC one; presumably callers guarantee it. */
2778 static struct hfsc *
2779 hfsc_get__(const struct netdev *netdev)
2781 struct netdev_dev_linux *netdev_dev;
2782 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2783 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
/* Converts 'queue' to the struct hfsc_class that contains it as its
 * 'tc_queue' member. */
2786 static struct hfsc_class *
2787 hfsc_class_cast__(const struct tc_queue *queue)
2789 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates a new struct hfsc for 'netdev', initializes its embedded 'tc'
 * with tc_ops_hfsc, records 'max_rate' in it, and attaches it to the
 * underlying Linux device as netdev_dev->tc.
 *
 * NOTE(review): the return statement is not visible; given the return type
 * it presumably returns the new struct hfsc. */
2792 static struct hfsc *
2793 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2795 struct netdev_dev_linux * netdev_dev;
2798 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2799 hfsc = xmalloc(sizeof *hfsc);
2800 tc_init(&hfsc->tc, &tc_ops_hfsc);
2801 hfsc->max_rate = max_rate;
2802 netdev_dev->tc = &hfsc->tc;
/* Stores the rates in '*hc' for queue 'queue_id' in the HFSC state kept for
 * 'netdev'.  The queue is looked up with tc_find_queue__(); the visible code
 * either reuses the existing struct hfsc_class or allocates a new one and
 * inserts it into hfsc->tc.queues under hash_int(queue_id, 0).  min_rate and
 * max_rate are then copied in.
 *
 * NOTE(review): the branch that chooses between reuse and allocation is on
 * lines not shown here; presumably it tests whether 'queue' is nonnull. */
2808 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2809 const struct hfsc_class *hc)
2813 struct hfsc_class *hcp;
2814 struct tc_queue *queue;
2816 hfsc = hfsc_get__(netdev);
2817 hash = hash_int(queue_id, 0);
2819 queue = tc_find_queue__(netdev, queue_id, hash);
2821 hcp = hfsc_class_cast__(queue);
2823 hcp = xmalloc(sizeof *hcp);
2824 queue = &hcp->tc_queue;
2825 queue->queue_id = queue_id;
2826 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
2829 hcp->min_rate = hc->min_rate;
2830 hcp->max_rate = hc->max_rate;
/* Extracts the min and max rate of an HFSC class from its nested options
 * attribute 'nl_options' into '*class'.
 *
 * Three service curves are read (TCA_HFSC_RSC, TCA_HFSC_FSC, TCA_HFSC_USC),
 * each required to be at least sizeof(struct tc_service_curve) long.  The
 * class is rejected with a rate-limited warning when:
 *
 *   - any curve has a nonzero 'm1' or 'd' (a non-linear curve),
 *   - the RSC and FSC 'm2' values differ, or
 *   - the RSC 'm2' exceeds the USC 'm2'.
 *
 * Otherwise min_rate is taken from the FSC 'm2' and max_rate from the USC
 * 'm2'.
 *
 * NOTE(review): the values returned on each path are on lines not shown in
 * this excerpt. */
2834 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2836 const struct tc_service_curve *rsc, *fsc, *usc;
2837 static const struct nl_policy tca_hfsc_policy[] = {
2839 .type = NL_A_UNSPEC,
2841 .min_len = sizeof(struct tc_service_curve),
2844 .type = NL_A_UNSPEC,
2846 .min_len = sizeof(struct tc_service_curve),
2849 .type = NL_A_UNSPEC,
2851 .min_len = sizeof(struct tc_service_curve),
2854 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
2856 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
2857 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
2858 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
2862 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
2863 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
2864 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
2866 if (rsc->m1 != 0 || rsc->d != 0 ||
2867 fsc->m1 != 0 || fsc->d != 0 ||
2868 usc->m1 != 0 || usc->d != 0) {
2869 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2870 "Non-linear service curves are not supported.");
2874 if (rsc->m2 != fsc->m2) {
2875 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2876 "Real-time service curves are not supported ");
2880 if (rsc->m2 > usc->m2) {
2881 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2882 "Min-rate service curve is greater than "
2883 "the max-rate service curve.");
2887 class->min_rate = fsc->m2;
2888 class->max_rate = usc->m2;
/* Parses class message 'tcmsg' with tc_parse_class(), obtaining the class
 * handle, its options attribute, and '*stats'.  A handle with major number 1
 * and minor number in 1...HFSC_N_QUEUES yields '*queue_id' = minor - 1.  The
 * options attribute is decoded into '*options' with
 * hfsc_parse_tca_options__().
 *
 * NOTE(review): hfsc_query_class__() passes NULL for 'queue_id', so the
 * queue-ID and options steps are presumably guarded by null checks on lines
 * not shown here; the handling of out-of-range handles is also not shown. */
2893 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2894 struct hfsc_class *options,
2895 struct netdev_queue_stats *stats)
2898 unsigned int handle;
2899 struct nlattr *nl_options;
2901 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2907 unsigned int major, minor;
2909 major = tc_get_major(handle);
2910 minor = tc_get_minor(handle);
2911 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
2912 *queue_id = minor - 1;
2919 error = hfsc_parse_tca_options__(nl_options, options);
/* Asks the kernel, through tc_query_class(), about class 'handle' with parent
 * 'parent' on 'netdev', then decodes the reply with hfsc_parse_tcmsg__()
 * (no queue ID requested) into '*options' and '*stats'.  The reply buffer is
 * deleted afterward.
 *
 * NOTE(review): the early exit taken when the query fails and the return
 * statement are on lines not shown in this excerpt. */
2926 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
2927 unsigned int parent, struct hfsc_class *options,
2928 struct netdev_queue_stats *stats)
2931 struct ofpbuf *reply;
2933 error = tc_query_class(netdev, handle, parent, &reply);
2938 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
2939 ofpbuf_delete(reply);
2944 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
2945 struct hfsc_class *class)
2948 const char *max_rate_s;
2950 max_rate_s = shash_find_data(details, "max-rate");
2951 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2956 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2957 max_rate = netdev_features_to_bps(current) / 8;
2960 class->min_rate = max_rate;
2961 class->max_rate = max_rate;
/* Parses the per-queue HFSC settings in 'details' ("min-rate" and
 * "max-rate") into '*class' for a class on 'netdev'.  Both values are
 * divided by 8 as they are read.  min_rate is clamped to the range from 1 to
 * the qdisc's max_rate; max_rate is clamped to the range from min_rate to
 * the qdisc's max_rate.
 *
 * NOTE(review): the value used when "max-rate" is absent and the return
 * statement are on lines not shown in this excerpt. */
2965 hfsc_parse_class_details__(struct netdev *netdev,
2966 const struct shash *details,
2967 struct hfsc_class * class)
2969 const struct hfsc *hfsc;
2970 uint32_t min_rate, max_rate;
2971 const char *min_rate_s, *max_rate_s;
2973 hfsc = hfsc_get__(netdev);
2974 min_rate_s = shash_find_data(details, "min-rate");
2975 max_rate_s = shash_find_data(details, "max-rate");
2977 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2978 min_rate = MAX(min_rate, 1);
2979 min_rate = MIN(min_rate, hfsc->max_rate);
2981 max_rate = (max_rate_s
2982 ? strtoull(max_rate_s, NULL, 10) / 8
2984 max_rate = MAX(max_rate, min_rate);
2985 max_rate = MIN(max_rate, hfsc->max_rate);
2987 class->min_rate = min_rate;
2988 class->max_rate = max_rate;
2993 /* Create an HFSC qdisc.
2995 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
/* Implementation notes: any existing qdisc is first removed with
 * tc_del_qdisc().  The request is an RTM_NEWQDISC with NLM_F_EXCL and
 * NLM_F_CREATE, handle 1:0 and parent TC_H_ROOT, kind "hfsc", and a struct
 * tc_hfsc_qopt that is zeroed before use.  Returns the result of
 * tc_transact().
 *
 * NOTE(review): any field of 'opt' set after the memset, and the handling
 * of a failed tc_make_request(), are on lines not shown in this excerpt. */
2997 hfsc_setup_qdisc__(struct netdev * netdev)
2999 struct tcmsg *tcmsg;
3000 struct ofpbuf request;
3001 struct tc_hfsc_qopt opt;
3003 tc_del_qdisc(netdev);
3005 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3006 NLM_F_EXCL | NLM_F_CREATE, &request);
3012 tcmsg->tcm_handle = tc_make_handle(1, 0);
3013 tcmsg->tcm_parent = TC_H_ROOT;
3015 memset(&opt, 0, sizeof opt);
3018 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3019 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3021 return tc_transact(&request, NULL);
3024 /* Create an HFSC class.
3026 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3027 * sc rate <min_rate> ul rate <max_rate>" */
/* Implementation notes: the request is an RTM_NEWTCLASS with NLM_F_CREATE
 * for class 'handle' under 'parent'.  Two service curves are built: 'min'
 * with m2 = class->min_rate, sent as both TCA_HFSC_RSC and TCA_HFSC_FSC, and
 * 'max' with m2 = class->max_rate, sent as TCA_HFSC_USC.  All three are
 * nested inside TCA_OPTIONS.  A rate-limited warning naming the device,
 * class, parent and rates is logged if the transaction fails.
 *
 * NOTE(review): the initialization of the other service-curve fields and
 * the return statement are on lines not shown in this excerpt. */
3029 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3030 unsigned int parent, struct hfsc_class *class)
3034 struct tcmsg *tcmsg;
3035 struct ofpbuf request;
3036 struct tc_service_curve min, max;
3038 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3044 tcmsg->tcm_handle = handle;
3045 tcmsg->tcm_parent = parent;
3049 min.m2 = class->min_rate;
3053 max.m2 = class->max_rate;
3055 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3056 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3057 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3058 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3059 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3060 nl_msg_end_nested(&request, opt_offset);
3062 error = tc_transact(&request, NULL);
3064 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3065 "min-rate %ubps, max-rate %ubps (%s)",
3066 netdev_get_name(netdev),
3067 tc_get_major(handle), tc_get_minor(handle),
3068 tc_get_major(parent), tc_get_minor(parent),
3069 class->min_rate, class->max_rate, strerror(error));
/* Installs an HFSC qdisc on 'netdev' configured from 'details': sets up the
 * qdisc with hfsc_setup_qdisc__(), creates class 1:fffe under 1:0 from the
 * parsed qdisc details with hfsc_setup_class__(), and records the new qdisc,
 * with that class's max rate, through hfsc_install__().
 *
 * NOTE(review): the early exits taken when a step fails are on lines not
 * shown in this excerpt. */
3076 hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3079 struct hfsc_class class;
3081 error = hfsc_setup_qdisc__(netdev);
3087 hfsc_parse_qdisc_details__(netdev, details, &class);
3088 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3089 tc_make_handle(1, 0), &class);
3095 hfsc_install__(netdev, class.max_rate);
/* Rebuilds the HFSC state for 'netdev' from what the kernel reports; 'nlmsg'
 * is unused.  Class 1:fffe is queried for the qdisc-wide max rate, which is
 * passed to hfsc_install__().  The device's classes are then dumped, and
 * each message that hfsc_parse_tcmsg__() accepts (returns 0) is recorded
 * with hfsc_update_queue__().
 *
 * NOTE(review): the return value of the first query is ignored on the
 * visible line; confirm that hc.max_rate has a defined value if it fails.
 * The handling of a failed start_queue_dump() is also not shown here. */
3100 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3104 struct nl_dump dump;
3105 struct hfsc_class hc;
3108 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3109 hfsc = hfsc_install__(netdev, hc.max_rate);
3111 if (!start_queue_dump(netdev, &dump)) {
3115 while (nl_dump_next(&dump, &msg)) {
3116 unsigned int queue_id;
3118 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3119 hfsc_update_queue__(netdev, queue_id, &hc);
3123 nl_dump_done(&dump);
/* Tears down the HFSC state that embeds 'tc': every class is removed from
 * hfsc->tc.queues.  The _SAFE iterator is used because nodes are removed
 * while the map is being walked.
 *
 * NOTE(review): releasing the classes and the struct hfsc itself is
 * presumably done on lines not shown in this excerpt. */
3128 hfsc_tc_destroy(struct tc *tc)
3131 struct hfsc_class *hc, *next;
3133 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3135 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3136 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Reports the HFSC qdisc configuration of 'netdev' by adding "max-rate" to
 * 'details' as a decimal string holding 8 times the stored max rate. */
3145 hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3147 const struct hfsc *hfsc;
3148 hfsc = hfsc_get__(netdev);
3149 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
/* Reconfigures the HFSC qdisc on 'netdev' from 'details': the qdisc details
 * are parsed into a class description, class 1:fffe under 1:0 is set up
 * again with it, and the stored qdisc max rate is updated to match.
 *
 * NOTE(review): whether the stored rate is updated only when the class
 * setup succeeds depends on a condition not visible in this excerpt. */
3154 hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3157 struct hfsc_class class;
3159 hfsc_parse_qdisc_details__(netdev, details, &class);
3160 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3161 tc_make_handle(1, 0), &class);
3164 hfsc_get__(netdev)->max_rate = class.max_rate;
/* Reports the configuration of HFSC class 'queue' into 'details'.  Stored
 * rates are multiplied by 8 and formatted as decimal strings.  "max-rate"
 * is reported only when it differs from "min-rate".  'netdev' is unused. */
3171 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3172 const struct tc_queue *queue, struct shash *details)
3174 const struct hfsc_class *hc;
3176 hc = hfsc_class_cast__(queue);
3177 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3178 if (hc->min_rate != hc->max_rate) {
3179 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
/* Creates or reconfigures HFSC queue 'queue_id' on 'netdev' from 'details'.
 * The details are parsed, the kernel class with handle 1:(queue_id + 1) and
 * parent 1:fffe is set up, and the in-memory record is updated with
 * hfsc_update_queue__().
 *
 * NOTE(review): the early exits taken when parsing or class setup fails are
 * on lines not shown in this excerpt. */
3185 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3186 const struct shash *details)
3189 struct hfsc_class class;
3191 error = hfsc_parse_class_details__(netdev, details, &class);
3196 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3197 tc_make_handle(1, 0xfffe), &class);
3202 hfsc_update_queue__(netdev, queue_id, &class);
/* Deletes HFSC class 'queue' from 'netdev': asks the kernel to delete class
 * 1:(queue_id + 1) and removes the class from hfsc->tc.queues.
 *
 * NOTE(review): presumably the removal happens only when the kernel request
 * succeeds and the class is then released; those lines are not shown. */
3207 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3211 struct hfsc_class *hc;
3213 hc = hfsc_class_cast__(queue);
3214 hfsc = hfsc_get__(netdev);
3216 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3218 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Retrieves statistics for HFSC class 'queue' on 'netdev' into '*stats' by
 * querying class 1:(queue_id + 1) under parent 1:fffe.  No class options are
 * requested (NULL is passed for them).  Returns whatever
 * hfsc_query_class__() returns. */
3225 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3226 struct netdev_queue_stats *stats)
3228 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3229 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one class message, 'nlmsg', from a class dump: its handle and
 * statistics are extracted with tc_parse_class().  If the handle has major
 * number 1 and a minor number in the range 1...HFSC_N_QUEUES, 'cb' is
 * invoked with queue ID (minor - 1), the statistics, and 'aux'.  'netdev'
 * is unused.
 *
 * NOTE(review): the handling of a tc_parse_class() failure and the return
 * value are on lines not shown in this excerpt. */
3233 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3234 const struct ofpbuf *nlmsg,
3235 netdev_dump_queue_stats_cb *cb, void *aux)
3237 struct netdev_queue_stats stats;
3238 unsigned int handle, major, minor;
3241 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3246 major = tc_get_major(handle);
3247 minor = tc_get_minor(handle);
3248 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3249 (*cb)(minor - 1, &stats, aux);
/* Operations table for the HFSC qdisc: kernel qdisc name "hfsc", OVS QoS
 * type "linux-hfsc", HFSC_N_QUEUES queues, and one hfsc_* callback for every
 * operation slot. */
3254 static const struct tc_ops tc_ops_hfsc = {
3255 "hfsc", /* linux_name */
3256 "linux-hfsc", /* ovs_name */
3257 HFSC_N_QUEUES, /* n_queues */
3258 hfsc_tc_install, /* tc_install */
3259 hfsc_tc_load, /* tc_load */
3260 hfsc_tc_destroy, /* tc_destroy */
3261 hfsc_qdisc_get, /* qdisc_get */
3262 hfsc_qdisc_set, /* qdisc_set */
3263 hfsc_class_get, /* class_get */
3264 hfsc_class_set, /* class_set */
3265 hfsc_class_delete, /* class_delete */
3266 hfsc_class_get_stats, /* class_get_stats */
3267 hfsc_class_dump_stats /* class_dump_stats */
3270 /* "linux-default" traffic control class.
3272 * This class represents the default, unnamed Linux qdisc. It corresponds to
3273 * the "" (empty string) QoS type in the OVS database. */
/* Attaches the "linux-default" traffic-control object to the Linux device
 * underlying 'netdev' by pointing netdev_dev->tc at a struct tc initialized
 * with tc_ops_default.
 *
 * 'tc' has static storage, so the object outlives the call.
 * NOTE(review): presumably it is allocated only once and shared by every
 * device; the guard around the allocation is on a line not shown here. */
3276 default_install__(struct netdev *netdev)
3278 struct netdev_dev_linux *netdev_dev =
3279 netdev_dev_linux_cast(netdev_get_dev(netdev));
3280 static struct tc *tc;
3283 tc = xmalloc(sizeof *tc);
3284 tc_init(tc, &tc_ops_default);
3286 netdev_dev->tc = tc;
/* Install callback for the default qdisc type: simply attaches the default
 * traffic-control object to 'netdev'.  'details' is unused. */
3290 default_tc_install(struct netdev *netdev,
3291 const struct shash *details OVS_UNUSED)
3293 default_install__(netdev);
/* Load callback for the default qdisc type: simply attaches the default
 * traffic-control object to 'netdev'.  'nlmsg' is unused. */
3298 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3300 default_install__(netdev);
/* Operations table for the default qdisc type.  It has no kernel qdisc name
 * and leaves tc_destroy and every qdisc and class callback NULL.  The
 * remaining entries are on lines not shown in this excerpt. */
3304 static const struct tc_ops tc_ops_default = {
3305 NULL, /* linux_name */
3310 NULL, /* tc_destroy */
3311 NULL, /* qdisc_get */
3312 NULL, /* qdisc_set */
3313 NULL, /* class_get */
3314 NULL, /* class_set */
3315 NULL, /* class_delete */
3316 NULL, /* class_get_stats */
3317 NULL /* class_dump_stats */
3320 /* "linux-other" traffic control class: used for a qdisc that this file cannot identify as one of its own types (see tc_query_qdisc()). */
/* Load callback for the "linux-other" type: points netdev_dev->tc of the
 * Linux device underlying 'netdev' at a struct tc initialized with
 * tc_ops_other.  'nlmsg' is unused.
 *
 * 'tc' has static storage, so the object outlives the call.
 * NOTE(review): presumably it is allocated only once and shared by every
 * device; the guard around the allocation is on a line not shown here. */
3325 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3327 struct netdev_dev_linux *netdev_dev =
3328 netdev_dev_linux_cast(netdev_get_dev(netdev));
3329 static struct tc *tc;
3332 tc = xmalloc(sizeof *tc);
3333 tc_init(tc, &tc_ops_other);
3335 netdev_dev->tc = tc;
/* Operations table for the "linux-other" type.  It has no kernel qdisc name
 * and no install callback, and leaves tc_destroy and every qdisc and class
 * callback NULL.  The remaining entries are on lines not shown in this
 * excerpt. */
3339 static const struct tc_ops tc_ops_other = {
3340 NULL, /* linux_name */
3341 "linux-other", /* ovs_name */
3343 NULL, /* tc_install */
3345 NULL, /* tc_destroy */
3346 NULL, /* qdisc_get */
3347 NULL, /* qdisc_set */
3348 NULL, /* class_get */
3349 NULL, /* class_set */
3350 NULL, /* class_delete */
3351 NULL, /* class_get_stats */
3352 NULL /* class_dump_stats */
3355 /* Traffic control. */
3357 /* Number of kernel "tc" ticks per second. */
/* Computed from the first three psched parameters as a * c / b, and used by
 * tc_ticks_to_bytes() and tc_bytes_to_ticks() to convert between bytes and
 * ticks. */
3358 static double ticks_per_s;
3360 /* Number of kernel "jiffies" per second. This is used for the purpose of
3361 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3362 * one jiffy's worth of data.
3364 * There are two possibilities here:
3366 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3367 * approximate range of 100 to 1024. That means that we really need to
3368 * make sure that the qdisc can buffer that much data.
3370 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3371 * has finely granular timers and there's no need to fudge additional room
3372 * for buffers. (There's no extra effort needed to implement that: the
3373 * large 'buffer_hz' is used as a divisor, so practically any number will
3374 * come out as 0 in the division. Small integer results in the case of
3375 * really high dividends won't have any real effect anyhow.) */
/* Divisor used by tc_buffer_per_jiffy().  Its value is logged together with
 * 'ticks_per_s' once the psched parameters have been read. */
3377 static unsigned int buffer_hz;
3379 /* Returns tc handle 'major':'minor'. */
/* Combines 'major', shifted into the upper 16 bits, with 'minor' using the
 * kernel's TC_H_MAKE macro. */
3381 tc_make_handle(unsigned int major, unsigned int minor)
3383 return TC_H_MAKE(major << 16, minor);
3386 /* Returns the major number from 'handle'. */
/* Isolates the major part of 'handle' with TC_H_MAJ and shifts it down by 16
 * bits, the inverse of the shift applied in tc_make_handle(). */
3388 tc_get_major(unsigned int handle)
3390 return TC_H_MAJ(handle) >> 16;
3393 /* Returns the minor number from 'handle'. */
/* Isolates the minor part of 'handle' with the kernel's TC_H_MIN macro. */
3395 tc_get_minor(unsigned int handle)
3397 return TC_H_MIN(handle);
/* Begins a traffic-control Netlink request for 'netdev' in 'request'.
 *
 * 'request' is initialized with 512 bytes of initial space and given a
 * Netlink header of the specified 'type' with flags NLM_F_REQUEST | 'flags',
 * followed by a zeroed struct tcmsg whose family is AF_UNSPEC and whose
 * interface index is that of 'netdev'.  The caller must still fill in
 * tcm_handle and tcm_parent.
 *
 * NOTE(review): the handling of a get_ifindex() failure and the return
 * statement are on lines not shown here; callers use the result as a
 * pointer to the struct tcmsg inside 'request'. */
3400 static struct tcmsg *
3401 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3402 struct ofpbuf *request)
3404 struct tcmsg *tcmsg;
3408 error = get_ifindex(netdev, &ifindex);
3413 ofpbuf_init(request, 512);
3414 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3415 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3416 tcmsg->tcm_family = AF_UNSPEC;
3417 tcmsg->tcm_ifindex = ifindex;
3418 /* Caller should fill in tcmsg->tcm_handle. */
3419 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on 'rtnl_sock' with nl_sock_transact(), passing 'replyp'
 * through for the reply, and then uninitializes 'request' whatever the
 * outcome, so the caller must not use 'request' afterward.
 *
 * NOTE(review): the return statement is not visible; callers treat the
 * result as the transaction's error code. */
3425 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3427 int error = nl_sock_transact(rtnl_sock, request, replyp);
3428 ofpbuf_uninit(request);
3435 /* The values in psched are not individually very meaningful, but they are
3436 * important. The tables below show some values seen in the wild.
3440 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3441 * (Before that, there are hints that it was 1000000000.)
3443 * - "d" can be unrealistically large; see the comment on 'buffer_hz' above.
3447 * -----------------------------------
3448 * [1] 000c8000 000f4240 000f4240 00000064
3449 * [2] 000003e8 00000400 000f4240 3b9aca00
3450 * [3] 000003e8 00000400 000f4240 3b9aca00
3451 * [4] 000003e8 00000400 000f4240 00000064
3452 * [5] 000003e8 00000040 000f4240 3b9aca00
3453 * [6] 000003e8 00000040 000f4240 000000f9
3455 * a b c d ticks_per_s buffer_hz
3456 * ------- --------- ---------- ------------- ----------- -------------
3457 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3458 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3459 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3460 * [4] 1,000 1,024 1,000,000 100 976,562 100
3461 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3462 * [6] 1,000 64 1,000,000 249 15,625,000 249
3464 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3465 * [2] 2.6.26-1-686-bigmem from Debian lenny
3466 * [3] 2.6.26-2-sparc64 from Debian lenny
3467 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3468 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3469 * [6] 2.6.34 from kernel.org on KVM */
/* Body of the psched reader.  The file named by 'fn' is opened and four
 * hexadecimal values a, b, c and d are read from it.  A warning is logged if
 * the file cannot be opened, if fewer than four values are read, or if the
 * values are judged invalid or unexpected.  'ticks_per_s' is computed as
 * a * c / b in floating point, and the resulting 'ticks_per_s' and
 * 'buffer_hz' are logged at debug level.
 *
 * NOTE(review): the validity tests, the assignment to 'buffer_hz', and the
 * closing of 'stream' are on lines not shown in this excerpt. */
3471 static const char fn[] = "/proc/net/psched";
3472 unsigned int a, b, c, d;
3478 stream = fopen(fn, "r");
3480 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3484 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3485 VLOG_WARN("%s: read failed", fn);
3489 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3493 VLOG_WARN("%s: invalid scheduler parameters", fn);
3497 ticks_per_s = (double) a * c / b;
3501 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3504 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3507 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3508 * rate of 'rate' bytes per second. */
/* NOTE(review): 'rate * ticks' is evaluated in unsigned int arithmetic
 * before the division by the double 'ticks_per_s', so a product that does
 * not fit in unsigned int wraps around first.  Converting one operand to
 * double before multiplying would avoid that.  Any setup done before the
 * return statement is on lines not shown in this excerpt. */
3510 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3515 return (rate * ticks) / ticks_per_s;
3518 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3519 * rate of 'rate' bytes per second. */
/* A 'rate' of 0 yields 0 rather than a division by zero.  'ticks_per_s' is
 * converted to unsigned long long int (dropping any fraction) before it is
 * multiplied by 'size', so the product is computed in integer arithmetic at
 * that width. */
3521 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3526 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3529 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3530 * a transmission rate of 'rate' bytes per second. */
/* The result is the integer quotient 'rate' / 'buffer_hz'.
 * NOTE(review): 'buffer_hz' must be nonzero by the time of the division;
 * whatever ensures that is on lines not shown in this excerpt. */
3532 tc_buffer_per_jiffy(unsigned int rate)
3537 return rate / buffer_hz;
3540 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3541 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3542 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3543 * stores NULL into it if it is absent.
3545 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns 'msg'.
3548 * Returns 0 if successful, otherwise a positive errno value. */
/* Implementation notes: attributes are parsed starting just past the Netlink
 * header and the struct tcmsg.  The policy requires a string TCA_KIND and
 * accepts an optional nested TCA_OPTIONS.  A rate-limited warning is logged
 * when the message does not satisfy the policy.
 *
 * NOTE(review): the null checks on 'kind' and 'options' and the values
 * stored and returned on failure are on lines not shown in this excerpt. */
3550 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3551 struct nlattr **options)
3553 static const struct nl_policy tca_policy[] = {
3554 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3555 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3557 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3559 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3560 tca_policy, ta, ARRAY_SIZE(ta))) {
3561 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3566 *kind = nl_attr_get_string(ta[TCA_KIND]);
3570 *options = ta[TCA_OPTIONS];
3585 /* Given Netlink 'msg' that describes a class, extracts the class's handle (from
3586 * whose minor number callers derive the queue ID) into '*handlep', its TCA_OPTIONS attribute
3587 * into '*options', and its queue statistics into '*stats'. Any of the output
3588 * arguments may be null.
3590 * Returns 0 if successful, otherwise a positive errno value. */
/* Implementation notes: attributes are parsed starting just past the Netlink
 * header and the struct tcmsg; both TCA_OPTIONS and TCA_STATS2 are required
 * nested attributes.  The handle is read from the struct tcmsg located at
 * offset NLMSG_HDRLEN in 'msg'.
 *
 * For statistics, TCA_STATS2 must contain TCA_STATS_BASIC and
 * TCA_STATS_QUEUE.  The basic statistics are copied into a zeroed local
 * struct (at most sizeof gsb bytes) rather than read in place, for the
 * alignment reasons given in the comment in the body.  tx_bytes and
 * tx_packets come from the basic statistics; tx_errors is set to the queue
 * statistics' 'drops' count.
 *
 * NOTE(review): the null checks on the output arguments, and the path on
 * which '*stats' is zeroed, are on lines not shown in this excerpt. */
3592 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3593 struct nlattr **options, struct netdev_queue_stats *stats)
3595 static const struct nl_policy tca_policy[] = {
3596 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3597 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3599 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3601 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3602 tca_policy, ta, ARRAY_SIZE(ta))) {
3603 VLOG_WARN_RL(&rl, "failed to parse class message");
3608 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3609 *handlep = tc->tcm_handle;
3613 *options = ta[TCA_OPTIONS];
3617 const struct gnet_stats_queue *gsq;
3618 struct gnet_stats_basic gsb;
3620 static const struct nl_policy stats_policy[] = {
3621 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3622 .min_len = sizeof gsb },
3623 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3624 .min_len = sizeof *gsq },
3626 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3628 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3629 sa, ARRAY_SIZE(sa))) {
3630 VLOG_WARN_RL(&rl, "failed to parse class stats");
3634 /* Alignment issues screw up the length of struct gnet_stats_basic on
3635 * some arch/bitsize combinations. Newer versions of Linux have a
3636 * struct gnet_stats_basic_packed, but we can't depend on that. The
3637 * easiest thing to do is just to make a copy. */
3638 memset(&gsb, 0, sizeof gsb);
3639 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3640 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3641 stats->tx_bytes = gsb.bytes;
3642 stats->tx_packets = gsb.packets;
3644 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3645 stats->tx_errors = gsq->drops;
3655 memset(stats, 0, sizeof *stats);
3660 /* Queries the kernel for class with identifier 'handle' and parent 'parent' on 'netdev'. The kernel's reply, if any, is passed back through '*replyp'. */
/* Implementation notes: the request is an RTM_GETTCLASS with NLM_F_ECHO
 * carrying 'handle' and 'parent'.  It is sent with tc_transact(), which
 * receives 'replyp' for the reply.  A rate-limited warning naming the
 * device, class and parent is logged when the transaction fails.
 *
 * NOTE(review): the handling of a failed tc_make_request() and the return
 * statement are on lines not shown in this excerpt. */
3663 tc_query_class(const struct netdev *netdev,
3664 unsigned int handle, unsigned int parent,
3665 struct ofpbuf **replyp)
3667 struct ofpbuf request;
3668 struct tcmsg *tcmsg;
3671 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3675 tcmsg->tcm_handle = handle;
3676 tcmsg->tcm_parent = parent;
3678 error = tc_transact(&request, replyp);
3680 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3681 netdev_get_name(netdev),
3682 tc_get_major(handle), tc_get_minor(handle),
3683 tc_get_major(parent), tc_get_minor(parent),
3689 /* Equivalent to "tc class del dev <name> handle <handle>". */
/* Implementation notes: the request is an RTM_DELTCLASS with no extra flags,
 * carrying 'handle' and a parent of 0.  No reply is requested.  A
 * rate-limited warning naming the device and class is logged when the
 * transaction fails.
 *
 * NOTE(review): the handling of a failed tc_make_request() and the return
 * statement are on lines not shown in this excerpt. */
3691 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3693 struct ofpbuf request;
3694 struct tcmsg *tcmsg;
3697 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3701 tcmsg->tcm_handle = handle;
3702 tcmsg->tcm_parent = 0;
3704 error = tc_transact(&request, NULL);
3706 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3707 netdev_get_name(netdev),
3708 tc_get_major(handle), tc_get_minor(handle),
3714 /* Equivalent to "tc qdisc del dev <name> root". */
/* Implementation notes: the request is an RTM_DELQDISC for handle 1:0 with
 * parent TC_H_ROOT.  An EINVAL result is given special treatment, for the
 * reason stated in the comment in the body.  When no error remains and the
 * device has a traffic-control object attached, that object's tc_destroy
 * callback is invoked if it has one, and netdev_dev->tc is cleared.
 *
 * NOTE(review): the statement executed for EINVAL (presumably clearing
 * 'error') and the return statement are on lines not shown here. */
3716 tc_del_qdisc(struct netdev *netdev)
3718 struct netdev_dev_linux *netdev_dev =
3719 netdev_dev_linux_cast(netdev_get_dev(netdev));
3720 struct ofpbuf request;
3721 struct tcmsg *tcmsg;
3724 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
3728 tcmsg->tcm_handle = tc_make_handle(1, 0);
3729 tcmsg->tcm_parent = TC_H_ROOT;
3731 error = tc_transact(&request, NULL);
3732 if (error == EINVAL) {
3733 /* EINVAL probably means that the default qdisc was in use, in which
3734 * case we've accomplished our purpose. */
3737 if (!error && netdev_dev->tc) {
3738 if (netdev_dev->tc->ops->tc_destroy) {
3739 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3741 netdev_dev->tc = NULL;
3746 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3747 * kernel to determine what they are. Returns 0 if successful, otherwise a
3748 * positive errno value. */
/* Implementation notes: the function first tests whether the device already
 * has a traffic-control object.  Otherwise it sends an RTM_GETQDISC (with
 * NLM_F_ECHO) for handle 1:0 with parent 0 and chooses an operations table
 * from the outcome:
 *
 *   - Reply received: the qdisc kind is parsed and looked up with
 *     tc_lookup_linux_name().  tc_ops_other is chosen when the reply cannot
 *     be parsed or, after an informational log message, when the kind is
 *     unknown.
 *
 *   - ENOENT: tc_ops_default.
 *
 *   - Any other error: a warning is logged and tc_ops_other is chosen.
 *
 * The chosen table's tc_load callback is then invoked with the reply.  The
 * assertion checks that the load succeeded exactly when netdev_dev->tc was
 * set.  The reply is deleted, and the transaction error, if any, takes
 * precedence over the load error in the return value.
 *
 * NOTE(review): several branch lines are missing from this excerpt, so the
 * exact nesting of the cases above should be confirmed against the full
 * file. */
3750 tc_query_qdisc(const struct netdev *netdev)
3752 struct netdev_dev_linux *netdev_dev =
3753 netdev_dev_linux_cast(netdev_get_dev(netdev));
3754 struct ofpbuf request, *qdisc;
3755 const struct tc_ops *ops;
3756 struct tcmsg *tcmsg;
3760 if (netdev_dev->tc) {
3764 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3765 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3766 * 2.6.35 without that fix backported to it.
3768 * To avoid the OOPS, we must not make a request that would attempt to dump
3769 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3770 * few others. There are a few ways that I can see to do this, but most of
3771 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3772 * technique chosen here is to assume that any non-default qdisc that we
3773 * create will have a class with handle 1:0. The built-in qdiscs only have
3774 * a class with handle 0:0.
3776 * We could check for Linux 2.6.35+ and use a more straightforward method
3778 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
3782 tcmsg->tcm_handle = tc_make_handle(1, 0);
3783 tcmsg->tcm_parent = 0;
3785 /* Figure out what tc class to instantiate. */
3786 error = tc_transact(&request, &qdisc);
3790 error = tc_parse_qdisc(qdisc, &kind, NULL);
3792 ops = &tc_ops_other;
3794 ops = tc_lookup_linux_name(kind);
3796 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3797 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3799 ops = &tc_ops_other;
3802 } else if (error == ENOENT) {
3803 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3804 * other entity that doesn't have a handle 1:0. We will assume
3805 * that it's the system default qdisc. */
3806 ops = &tc_ops_default;
3809 /* Who knows? Maybe the device got deleted. */
3810 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3811 netdev_get_name(netdev), strerror(error));
3812 ops = &tc_ops_other;
3815 /* Instantiate it. */
3816 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
3817 assert((load_error == 0) == (netdev_dev->tc != NULL));
3818 ops = ops;
3818 ofpbuf_delete(qdisc);
3820 return error ? error : load_error;
3823 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3824 approximate the time to transmit packets of various lengths. For an MTU of
3825 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3826 represents two possible packet lengths; for a MTU of 513 through 1024, four
3827 possible lengths; and so on.
3829 Returns, for the specified 'mtu', the number of bits that packet lengths
3830 need to be shifted right to fit within such a 256-entry table. */
/* Implementation notes: 'mtu' may be replaced by ETH_PAYLOAD_MAX, and the
 * Ethernet and VLAN header lengths are then added to it.  The result is the
 * number of loop iterations needed before the value drops below 256.
 *
 * NOTE(review): the condition that selects ETH_PAYLOAD_MAX, the loop body
 * (presumably a right shift of 'mtu' by one bit), and the return statement
 * are on lines not shown in this excerpt. */
3832 tc_calc_cell_log(unsigned int mtu)
3837 mtu = ETH_PAYLOAD_MAX;
3839 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
3841 for (cell_log = 0; mtu >= 256; cell_log++) {
3848 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU of 'mtu'. */
/* Implementation notes: '*rate' is zeroed first, so the 'overhead' and
 * 'cell_align' members mentioned in the commented-out lines end up 0 without
 * being named.  'cell_log' comes from tc_calc_cell_log(mtu) and 'mpu' is
 * set to ETH_TOTAL_MIN.
 *
 * NOTE(review): the statement that stores 'Bps' into '*rate' is presumably
 * on a line not shown in this excerpt. */
3851 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
3853 memset(rate, 0, sizeof *rate);
3854 rate->cell_log = tc_calc_cell_log(mtu);
3855 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3856 /* rate->cell_align = 0; */ /* distro headers. */
3857 rate->mpu = ETH_TOTAL_MIN;
3861 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
3862 * attribute of the specified "type".
3864 * See tc_calc_cell_log() above for a description of "rtab"s. */
/* Implementation notes: an attribute of TC_RTAB_SIZE bytes is appended to
 * 'msg' uninitialized and then filled in entry by entry.  Entry 'i' holds
 * the number of ticks needed to send a packet of (i + 1) << cell_log bytes
 * at rate->rate, with the packet size raised to rate->mpu when it would be
 * smaller. */
3866 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
3871 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
3872 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
3873 unsigned packet_size = (i + 1) << rate->cell_log;
3874 if (packet_size < rate->mpu) {
3875 packet_size = rate->mpu;
3877 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
3881 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
3882 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
3883 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of 0 is fine.) */
/* Implementation notes: the smallest burst allowed is one jiffy's worth of
 * buffering at 'Bps' plus 'mtu'.  The larger of that and 'burst_bytes' is
 * converted to ticks at rate 'Bps'.
 *
 * NOTE(review): 'burst_bytes' is 64 bits wide while tc_bytes_to_ticks()
 * takes an unsigned int size, so very large requests are narrowed. */
3886 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
3888 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
3889 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
3893 /* Utility functions. */
/* Requests link information for 'ifindex' with an RTM_GETLINK message on
 * 'rtnl_sock' and copies the counters from the reply's IFLA_STATS attribute
 * into 'stats'.
 *
 * NOTE(review): the return type, the error checks and the return statements
 * are not visible in this extract, so the exact return values are not
 * documented here. */
3896 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
3898 /* Policy for RTNLGRP_LINK messages.
3900 * There are *many* more fields in these messages, but currently we only
3901 * care about these fields. */
3902 static const struct nl_policy rtnlgrp_link_policy[] = {
/* The interface name attribute is mandatory. */
3903 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
/* Statistics may be absent; when present they must be at least as large as
 * struct rtnl_link_stats. */
3904 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
3905 .min_len = sizeof(struct rtnl_link_stats) },
3908 struct ofpbuf request;
3909 struct ofpbuf *reply;
3910 struct ifinfomsg *ifi;
3911 const struct rtnl_link_stats *rtnl_stats;
/* One slot per policy entry, filled in by nl_policy_parse() below. */
3912 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
/* Builds the request: a Netlink header followed by a zeroed struct ifinfomsg
 * that selects the interface by index. */
3915 ofpbuf_init(&request, 0);
3916 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
3917 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
3918 ifi->ifi_family = PF_UNSPEC;
3919 ifi->ifi_index = ifindex;
/* Sends the request and collects the reply; the request buffer is released
 * immediately afterward.  The check of 'error' is not visible here. */
3920 error = nl_sock_transact(rtnl_sock, &request, &reply);
3921 ofpbuf_uninit(&request);
/* Parses the attributes that follow the Netlink header and the struct
 * ifinfomsg.  On a parse failure the reply is freed. */
3926 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
3927 rtnlgrp_link_policy,
3928 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
3929 ofpbuf_delete(reply);
/* IFLA_STATS is optional in the policy, so its presence is checked
 * explicitly; without it the reply is freed after logging. */
3933 if (!attrs[IFLA_STATS]) {
3934 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
3935 ofpbuf_delete(reply);
/* Copies the counters one by one.  'rtnl_stats' presumably points into
 * 'reply', which is why 'reply' is only freed after the last copy -- TODO
 * confirm. */
3939 rtnl_stats = nl_attr_get(attrs[IFLA_STATS]);
3940 stats->rx_packets = rtnl_stats->rx_packets;
3941 stats->tx_packets = rtnl_stats->tx_packets;
3942 stats->rx_bytes = rtnl_stats->rx_bytes;
3943 stats->tx_bytes = rtnl_stats->tx_bytes;
3944 stats->rx_errors = rtnl_stats->rx_errors;
3945 stats->tx_errors = rtnl_stats->tx_errors;
3946 stats->rx_dropped = rtnl_stats->rx_dropped;
3947 stats->tx_dropped = rtnl_stats->tx_dropped;
3948 stats->multicast = rtnl_stats->multicast;
3949 stats->collisions = rtnl_stats->collisions;
3950 stats->rx_length_errors = rtnl_stats->rx_length_errors;
3951 stats->rx_over_errors = rtnl_stats->rx_over_errors;
3952 stats->rx_crc_errors = rtnl_stats->rx_crc_errors;
3953 stats->rx_frame_errors = rtnl_stats->rx_frame_errors;
3954 stats->rx_fifo_errors = rtnl_stats->rx_fifo_errors;
3955 stats->rx_missed_errors = rtnl_stats->rx_missed_errors;
3956 stats->tx_aborted_errors = rtnl_stats->tx_aborted_errors;
3957 stats->tx_carrier_errors = rtnl_stats->tx_carrier_errors;
3958 stats->tx_fifo_errors = rtnl_stats->tx_fifo_errors;
3959 stats->tx_heartbeat_errors = rtnl_stats->tx_heartbeat_errors;
3960 stats->tx_window_errors = rtnl_stats->tx_window_errors;
3962 ofpbuf_delete(reply);
/* Reads "/proc/net/dev" line by line looking for the entry whose device name
 * equals 'netdev_name', and fills 'stats' from it.
 *
 * NOTE(review): the return type, local declarations, the start of the parsing
 * call, several of its arguments, the stream cleanup and the return
 * statements are not visible in this extract. */
3968 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
3970 static const char fn[] = "/proc/net/dev";
3975 stream = fopen(fn, "r");
/* Logged when the file cannot be opened (the failure test itself is not
 * visible here). */
3977 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
3982 while (fgets(line, sizeof line, stream)) {
/* X64 is the scanf conversion for one uint64_t counter. */
3985 #define X64 "%"SCNu64
/* Each of the two format rows below reads seven counters and then skips one
 * unsigned field with "%*u". */
3988 X64 X64 X64 X64 X64 X64 X64 "%*u"
3989 X64 X64 X64 X64 X64 X64 X64 "%*u",
3995 &stats->rx_fifo_errors,
3996 &stats->rx_frame_errors,
4002 &stats->tx_fifo_errors,
/* 15 conversions are expected: the 14 counters above plus, presumably, the
 * device name stored in 'devname' -- TODO confirm against the full format
 * string. */
4004 &stats->tx_carrier_errors) != 15) {
4005 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4006 } else if (!strcmp(devname, netdev_name)) {
/* These counters are not among the parsed fields visible above; they are set
 * to UINT64_MAX, presumably as a "not available" marker -- TODO confirm. */
4007 stats->rx_length_errors = UINT64_MAX;
4008 stats->rx_over_errors = UINT64_MAX;
4009 stats->rx_crc_errors = UINT64_MAX;
4010 stats->rx_missed_errors = UINT64_MAX;
4011 stats->tx_aborted_errors = UINT64_MAX;
4012 stats->tx_heartbeat_errors = UINT64_MAX;
4013 stats->tx_window_errors = UINT64_MAX;
/* Logged when no line matched 'netdev_name'. */
4019 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Reads the interface flags of 'netdev' with the SIOCGIFFLAGS ioctl, issued
 * through netdev_linux_do_ioctl(), and stores ifr.ifr_flags in '*flags'.
 *
 * NOTE(review): the end of the ioctl call, the handling of 'error' and the
 * return statement are not visible in this extract. */
4025 get_flags(const struct netdev *netdev, int *flags)
4030 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
4032 *flags = ifr.ifr_flags;
/* Sets the interface flags of 'netdev' to 'flags' with the SIOCSIFFLAGS
 * ioctl and returns whatever netdev_linux_do_ioctl() returns.
 *
 * NOTE(review): the end of the call is not visible in this extract. */
4037 set_flags(struct netdev *netdev, int flags)
4041 ifr.ifr_flags = flags;
4042 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
/* Looks up the interface index of the device named 'netdev_name' with the
 * SIOCGIFINDEX ioctl on 'af_inet_sock' and returns ifr.ifr_ifindex.
 *
 * NOTE(review): the value returned when the ioctl fails is not visible in
 * this extract. */
4047 do_get_ifindex(const char *netdev_name)
4051 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4052 COVERAGE_INC(netdev_get_ifindex);
/* A failed ioctl is logged (rate-limited) with the errno text. */
4053 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4054 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4055 netdev_name, strerror(errno));
4058 return ifr.ifr_ifindex;
/* Stores the interface index of 'netdev_' in '*ifindexp', using the value
 * cached in the underlying netdev_dev_linux when VALID_IFINDEX is set in its
 * 'cache_valid' mask.
 *
 * NOTE(review): the handling of a failed lookup and the return statements
 * are not visible in this extract. */
4062 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4064 struct netdev_dev_linux *netdev_dev =
4065 netdev_dev_linux_cast(netdev_get_dev(netdev_));
/* Only query the kernel when no index is cached yet. */
4067 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4068 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
/* Record the looked-up index and mark it valid for later calls. */
4072 netdev_dev->cache_valid |= VALID_IFINDEX;
4073 netdev_dev->ifindex = ifindex;
4075 *ifindexp = netdev_dev->ifindex;
/* Retrieves the hardware address of the device named 'netdev_name' with the
 * SIOCGIFHWADDR ioctl and copies ETH_ADDR_LEN bytes of it into 'ea'.
 *
 * NOTE(review): the return type and return statements are not visible in
 * this extract. */
4080 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4085 memset(&ifr, 0, sizeof ifr);
4086 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4087 COVERAGE_INC(netdev_get_hwaddr);
/* A failed ioctl is logged at error level with the errno text. */
4088 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4089 VLOG_ERR("ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4090 netdev_name, strerror(errno));
/* Any address family other than AF_UNSPEC or ARPHRD_ETHER triggers a
 * warning. */
4093 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4094 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4095 VLOG_WARN("%s device has unknown hardware address family %d",
4096 netdev_name, hwaddr_family);
4098 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware address of the device named 'netdev_name' to the
 * ETH_ADDR_LEN bytes in 'mac', with address family 'hwaddr_family', using
 * the SIOCSIFHWADDR ioctl.
 *
 * NOTE(review): the return type and return statements are not visible in
 * this extract. */
4103 set_etheraddr(const char *netdev_name, int hwaddr_family,
4104 const uint8_t mac[ETH_ADDR_LEN])
/* Start from a zeroed request, then fill in name, family and address. */
4108 memset(&ifr, 0, sizeof ifr);
4109 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4110 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4111 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4112 COVERAGE_INC(netdev_set_hwaddr);
/* A failed ioctl is logged at error level with the errno text. */
4113 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4114 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4115 netdev_name, strerror(errno));
/* Issues the SIOCETHTOOL ioctl for the device named 'name', passing 'ecmd'
 * as the request data.  'cmd_name' is used only in the log message.
 *
 * NOTE(review): no use of 'cmd' is visible in this extract; presumably it is
 * stored into 'ecmd' on a line not shown -- TODO confirm.  The return
 * statements are not visible either. */
4122 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4123 int cmd, const char *cmd_name)
4127 memset(&ifr, 0, sizeof ifr);
4128 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
/* The ethtool request travels through the generic ifr_data pointer. */
4129 ifr.ifr_data = (caddr_t) ecmd;
4132 COVERAGE_INC(netdev_ethtool);
4133 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
/* Every failure except EOPNOTSUPP is logged (rate-limited). */
4136 if (errno != EOPNOTSUPP) {
4137 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4138 "failed: %s", cmd_name, name, strerror(errno));
4140 /* The device doesn't support this operation. That's pretty
4141 * common, so there's no point in logging anything. */
/* Copies 'name' into ifr->ifr_name and issues ioctl 'cmd' on 'af_inet_sock'
 * with 'ifr' as the argument.  'cmd_name' is used only in the debug log
 * message emitted when the ioctl returns -1.
 *
 * NOTE(review): the return type, the end of the log call and the return
 * statements are not visible in this extract. */
4148 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4149 const char *cmd_name)
4151 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4152 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4153 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4161 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4162 int cmd, const char *cmd_name)
4167 ifr.ifr_addr.sa_family = AF_INET;
4168 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4170 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4171 *ip = sin->sin_addr;