2 * Copyright (c) 2009, 2010, 2011 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
21 #include <arpa/inet.h>
23 #include <linux/gen_stats.h>
24 #include <linux/if_tun.h>
26 #include <linux/types.h>
27 #include <linux/ethtool.h>
28 #include <linux/mii.h>
29 #include <linux/pkt_sched.h>
30 #include <linux/rtnetlink.h>
31 #include <linux/sockios.h>
32 #include <linux/version.h>
33 #include <sys/types.h>
34 #include <sys/ioctl.h>
35 #include <sys/socket.h>
36 #include <netpacket/packet.h>
37 #include <net/ethernet.h>
39 #include <linux/if_tunnel.h>
40 #include <net/if_arp.h>
41 #include <net/if_packet.h>
42 #include <net/route.h>
43 #include <netinet/in.h>
50 #include "dynamic-string.h"
51 #include "fatal-signal.h"
54 #include "netdev-provider.h"
55 #include "netdev-vport.h"
57 #include "netlink-socket.h"
59 #include "openflow/openflow.h"
61 #include "poll-loop.h"
62 #include "rtnetlink.h"
63 #include "rtnetlink-link.h"
64 #include "socket-util.h"
69 VLOG_DEFINE_THIS_MODULE(netdev_linux);
71 COVERAGE_DEFINE(netdev_get_vlan_vid);
72 COVERAGE_DEFINE(netdev_set_policing);
73 COVERAGE_DEFINE(netdev_arp_lookup);
74 COVERAGE_DEFINE(netdev_get_ifindex);
75 COVERAGE_DEFINE(netdev_get_hwaddr);
76 COVERAGE_DEFINE(netdev_set_hwaddr);
77 COVERAGE_DEFINE(netdev_ethtool);
79 /* These were introduced in Linux 2.6.14, so they might be missing if we have
81 #ifndef ADVERTISED_Pause
82 #define ADVERTISED_Pause (1 << 13)
84 #ifndef ADVERTISED_Asym_Pause
85 #define ADVERTISED_Asym_Pause (1 << 14)
88 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
91 #define TC_RTAB_SIZE 1024
94 static struct rtnetlink_notifier netdev_linux_cache_notifier;
95 static int cache_notifier_refcount;
98 VALID_IFINDEX = 1 << 0,
99 VALID_ETHERADDR = 1 << 1,
103 VALID_CARRIER = 1 << 5,
104 VALID_IS_PSEUDO = 1 << 6, /* Represents is_internal and is_tap. */
105 VALID_POLICING = 1 << 7,
106 VALID_HAVE_VPORT_STATS = 1 << 8
114 /* Traffic control. */
116 /* An instance of a traffic control class. Always associated with a particular
119 * Each TC implementation subclasses this with whatever additional data it
122 const struct tc_ops *ops;
123 struct hmap queues; /* Contains "struct tc_queue"s.
124 * Read by generic TC layer.
125 * Written only by TC implementation. */
128 /* One traffic control queue.
130 * Each TC implementation subclasses this with whatever additional data it
133 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
134 unsigned int queue_id; /* OpenFlow queue ID. */
137 /* A particular kind of traffic control. Each implementation generally maps to
138 * one particular Linux qdisc class.
140 * The functions below return 0 if successful or a positive errno value on
141 * failure, except where otherwise noted. All of them must be provided, except
142 * where otherwise noted. */
144 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
145 * This is null for tc_ops_default and tc_ops_other, for which there are no
146 * appropriate values. */
147 const char *linux_name;
149 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
150 const char *ovs_name;
152 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
153 * queues. The queues are numbered 0 through n_queues - 1. */
154 unsigned int n_queues;
156 /* Called to install this TC class on 'netdev'. The implementation should
157 * make the Netlink calls required to set up 'netdev' with the right qdisc
158 * and configure it according to 'details'. The implementation may assume
159 * that the current qdisc is the default; that is, there is no need for it
160 * to delete the current qdisc before installing itself.
162 * The contents of 'details' should be documented as valid for 'ovs_name'
163 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
164 * (which is built as ovs-vswitchd.conf.db(8)).
166 * This function must return 0 if and only if it sets 'netdev->tc' to an
167 * initialized 'struct tc'.
169 * (This function is null for tc_ops_other, which cannot be installed. For
170 * other TC classes it should always be nonnull.) */
171 int (*tc_install)(struct netdev *netdev, const struct shash *details);
173 /* Called when the netdev code determines (through a Netlink query) that
174 * this TC class's qdisc is installed on 'netdev', but we didn't install
175 * it ourselves and so don't know any of the details.
177 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
178 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
179 * implementation should parse the other attributes of 'nlmsg' as
180 * necessary to determine its configuration. If necessary it should also
181 * use Netlink queries to determine the configuration of queues on
184 * This function must return 0 if and only if it sets 'netdev->tc' to an
185 * initialized 'struct tc'. */
186 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
188 /* Destroys the data structures allocated by the implementation as part of
189 * 'tc'. (This includes destroying 'tc->queues' by calling
192 * The implementation should not need to perform any Netlink calls. If
193 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
194 * (But it may not be desirable.)
196 * This function may be null if 'tc' is trivial. */
197 void (*tc_destroy)(struct tc *tc);
199 /* Retrieves details of 'netdev->tc' configuration into 'details'.
201 * The implementation should not need to perform any Netlink calls, because
202 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
203 * cached the configuration.
205 * The contents of 'details' should be documented as valid for 'ovs_name'
206 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
207 * (which is built as ovs-vswitchd.conf.db(8)).
209 * This function may be null if 'tc' is not configurable.
211 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
213 /* Reconfigures 'netdev->tc' according to 'details', performing any
214 * required Netlink calls to complete the reconfiguration.
216 * The contents of 'details' should be documented as valid for 'ovs_name'
217 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
218 * (which is built as ovs-vswitchd.conf.db(8)).
220 * This function may be null if 'tc' is not configurable.
222 int (*qdisc_set)(struct netdev *, const struct shash *details);
224 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
225 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
227 * The contents of 'details' should be documented as valid for 'ovs_name'
228 * in the "other_config" column in the "Queue" table in
229 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
231 * The implementation should not need to perform any Netlink calls, because
232 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
233 * cached the queue configuration.
235 * This function may be null if 'tc' does not have queues ('n_queues' is
237 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
238 struct shash *details);
240 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
241 * 'details', performing any required Netlink calls to complete the
242 * reconfiguration. The caller ensures that 'queue_id' is less than
245 * The contents of 'details' should be documented as valid for 'ovs_name'
246 * in the "other_config" column in the "Queue" table in
247 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
249 * This function may be null if 'tc' does not have queues or its queues are
250 * not configurable. */
251 int (*class_set)(struct netdev *, unsigned int queue_id,
252 const struct shash *details);
254 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
255 * tc_queue's within 'netdev->tc->queues'.
257 * This function may be null if 'tc' does not have queues or its queues
258 * cannot be deleted. */
259 int (*class_delete)(struct netdev *, struct tc_queue *queue);
261 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
262 * 'struct tc_queue's within 'netdev->tc->queues'.
264 * On success, initializes '*stats'.
266 * This function may be null if 'tc' does not have queues or if it cannot
267 * report queue statistics. */
268 int (*class_get_stats)(const struct netdev *netdev,
269 const struct tc_queue *queue,
270 struct netdev_queue_stats *stats);
272 /* Extracts queue stats from 'nlmsg', which is a response to a
273 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
275 * This function may be null if 'tc' does not have queues or if it cannot
276 * report queue statistics. */
277 int (*class_dump_stats)(const struct netdev *netdev,
278 const struct ofpbuf *nlmsg,
279 netdev_dump_queue_stats_cb *cb, void *aux);
283 tc_init(struct tc *tc, const struct tc_ops *ops)
286 hmap_init(&tc->queues);
290 tc_destroy(struct tc *tc)
292 hmap_destroy(&tc->queues);
295 static const struct tc_ops tc_ops_htb;
296 static const struct tc_ops tc_ops_hfsc;
297 static const struct tc_ops tc_ops_default;
298 static const struct tc_ops tc_ops_other;
300 static const struct tc_ops *tcs[] = {
301 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
302 &tc_ops_hfsc, /* Hierarchical fair service curve. */
303 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
304 &tc_ops_other, /* Some other qdisc. */
308 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
309 static unsigned int tc_get_major(unsigned int handle);
310 static unsigned int tc_get_minor(unsigned int handle);
312 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
313 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
314 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
316 static struct tcmsg *tc_make_request(const struct netdev *, int type,
317 unsigned int flags, struct ofpbuf *);
318 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
320 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
321 struct nlattr **options);
322 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
323 struct nlattr **options,
324 struct netdev_queue_stats *);
325 static int tc_query_class(const struct netdev *,
326 unsigned int handle, unsigned int parent,
327 struct ofpbuf **replyp);
328 static int tc_delete_class(const struct netdev *, unsigned int handle);
330 static int tc_del_qdisc(struct netdev *netdev);
331 static int tc_query_qdisc(const struct netdev *netdev);
333 static int tc_calc_cell_log(unsigned int mtu);
334 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
335 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
336 const struct tc_ratespec *rate);
337 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
339 struct netdev_dev_linux {
340 struct netdev_dev netdev_dev;
342 struct shash_node *shash_node;
343 unsigned int cache_valid;
345 /* The following are figured out "on demand" only. They are only valid
346 * when the corresponding VALID_* bit in 'cache_valid' is set. */
348 uint8_t etheraddr[ETH_ADDR_LEN];
349 struct in_addr address, netmask;
353 bool is_internal; /* Is this an openvswitch internal device? */
354 bool is_tap; /* Is this a tuntap device? */
355 uint32_t kbits_rate; /* Policing data. */
356 uint32_t kbits_burst;
357 bool have_vport_stats;
361 struct tap_state tap;
365 struct netdev_linux {
366 struct netdev netdev;
370 /* An AF_INET socket (used for ioctl operations). */
371 static int af_inet_sock = -1;
373 /* A Netlink routing socket that is not subscribed to any multicast groups. */
374 static struct nl_sock *rtnl_sock;
376 struct netdev_linux_notifier {
377 struct netdev_notifier notifier;
381 static struct shash netdev_linux_notifiers =
382 SHASH_INITIALIZER(&netdev_linux_notifiers);
383 static struct rtnetlink_notifier netdev_linux_poll_notifier;
385 /* This is set pretty low because we probably won't learn anything from the
386 * additional log messages. */
387 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
389 static int netdev_linux_init(void);
391 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
392 int cmd, const char *cmd_name);
393 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
394 const char *cmd_name);
395 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
396 int cmd, const char *cmd_name);
397 static int get_flags(const struct netdev *, int *flagsp);
398 static int set_flags(struct netdev *, int flags);
399 static int do_get_ifindex(const char *netdev_name);
400 static int get_ifindex(const struct netdev *, int *ifindexp);
401 static int do_set_addr(struct netdev *netdev,
402 int ioctl_nr, const char *ioctl_name,
403 struct in_addr addr);
404 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
405 static int set_etheraddr(const char *netdev_name, int hwaddr_family,
406 const uint8_t[ETH_ADDR_LEN]);
407 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
408 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
411 is_netdev_linux_class(const struct netdev_class *netdev_class)
413 return netdev_class->init == netdev_linux_init;
416 static struct netdev_dev_linux *
417 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
419 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
420 assert(is_netdev_linux_class(netdev_class));
422 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
425 static struct netdev_linux *
426 netdev_linux_cast(const struct netdev *netdev)
428 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
429 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
430 assert(is_netdev_linux_class(netdev_class));
432 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
436 netdev_linux_init(void)
438 static int status = -1;
440 /* Create AF_INET socket. */
441 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
442 status = af_inet_sock >= 0 ? 0 : errno;
444 VLOG_ERR("failed to create inet socket: %s", strerror(status));
447 /* Create rtnetlink socket. */
449 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
451 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
460 netdev_linux_run(void)
462 rtnetlink_link_notifier_run();
466 netdev_linux_wait(void)
468 rtnetlink_link_notifier_wait();
472 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
473 void *aux OVS_UNUSED)
475 struct netdev_dev_linux *dev;
477 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
479 const struct netdev_class *netdev_class =
480 netdev_dev_get_class(base_dev);
482 if (is_netdev_linux_class(netdev_class)) {
483 dev = netdev_dev_linux_cast(base_dev);
484 dev->cache_valid = 0;
488 struct shash device_shash;
489 struct shash_node *node;
491 shash_init(&device_shash);
492 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
493 SHASH_FOR_EACH (node, &device_shash) {
495 dev->cache_valid = 0;
497 shash_destroy(&device_shash);
501 /* Creates system and internal devices. */
503 netdev_linux_create(const struct netdev_class *class,
504 const char *name, const struct shash *args,
505 struct netdev_dev **netdev_devp)
507 struct netdev_dev_linux *netdev_dev;
510 if (!shash_is_empty(args)) {
511 VLOG_WARN("%s: arguments for %s devices should be empty",
515 if (!cache_notifier_refcount) {
516 error = rtnetlink_link_notifier_register(&netdev_linux_cache_notifier,
517 netdev_linux_cache_cb, NULL);
522 cache_notifier_refcount++;
524 netdev_dev = xzalloc(sizeof *netdev_dev);
525 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
527 *netdev_devp = &netdev_dev->netdev_dev;
531 /* For most types of netdevs we open the device for each call of
532 * netdev_open(). However, this is not the case with tap devices,
533 * since it is only possible to open the device once. In this
534 * situation we share a single file descriptor, and consequently
535 * buffers, across all readers. Therefore once data is read it will
536 * be unavailable to other reads for tap devices. */
538 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
539 const char *name, const struct shash *args,
540 struct netdev_dev **netdev_devp)
542 struct netdev_dev_linux *netdev_dev;
543 struct tap_state *state;
544 static const char tap_dev[] = "/dev/net/tun";
548 if (!shash_is_empty(args)) {
549 VLOG_WARN("%s: arguments for TAP devices should be empty", name);
552 netdev_dev = xzalloc(sizeof *netdev_dev);
553 state = &netdev_dev->state.tap;
555 /* Open tap device. */
556 state->fd = open(tap_dev, O_RDWR);
559 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
563 /* Create tap device. */
564 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
565 strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
566 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
567 VLOG_WARN("%s: creating tap device failed: %s", name,
573 /* Make non-blocking. */
574 error = set_nonblocking(state->fd);
579 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
580 *netdev_devp = &netdev_dev->netdev_dev;
589 destroy_tap(struct netdev_dev_linux *netdev_dev)
591 struct tap_state *state = &netdev_dev->state.tap;
593 if (state->fd >= 0) {
598 /* Destroys the netdev device 'netdev_dev_'. */
600 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
602 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
603 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
605 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
606 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
609 if (class == &netdev_linux_class || class == &netdev_internal_class) {
610 cache_notifier_refcount--;
612 if (!cache_notifier_refcount) {
613 rtnetlink_link_notifier_unregister(&netdev_linux_cache_notifier);
615 } else if (class == &netdev_tap_class) {
616 destroy_tap(netdev_dev);
625 netdev_linux_open(struct netdev_dev *netdev_dev_, int ethertype,
626 struct netdev **netdevp)
628 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
629 struct netdev_linux *netdev;
630 enum netdev_flags flags;
633 /* Allocate network device. */
634 netdev = xzalloc(sizeof *netdev);
636 netdev_init(&netdev->netdev, netdev_dev_);
638 /* Verify that the device really exists, by attempting to read its flags.
639 * (The flags might be cached, in which case this won't actually do an
642 * Don't do this for "internal" netdevs, though, because those have to be
643 * created as netdev objects before they exist in the kernel, because
644 * creating them in the kernel happens by passing a netdev object to
645 * dpif_port_add(). */
646 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
647 error = netdev_get_flags(&netdev->netdev, &flags);
648 if (error == ENODEV) {
653 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
654 !netdev_dev->state.tap.opened) {
656 /* We assume that the first user of the tap device is the primary user
657 * and give them the tap FD. Subsequent users probably just expect
658 * this to be a system device so open it normally to avoid send/receive
659 * directions appearing to be reversed. */
660 netdev->fd = netdev_dev->state.tap.fd;
661 netdev_dev->state.tap.opened = true;
662 } else if (ethertype != NETDEV_ETH_TYPE_NONE) {
663 struct sockaddr_ll sll;
667 /* Create file descriptor. */
668 protocol = (ethertype == NETDEV_ETH_TYPE_ANY ? ETH_P_ALL
669 : ethertype == NETDEV_ETH_TYPE_802_2 ? ETH_P_802_2
671 netdev->fd = socket(PF_PACKET, SOCK_RAW, htons(protocol));
672 if (netdev->fd < 0) {
677 /* Set non-blocking mode. */
678 error = set_nonblocking(netdev->fd);
683 /* Get ethernet device index. */
684 error = get_ifindex(&netdev->netdev, &ifindex);
689 /* Bind to specific ethernet device. */
690 memset(&sll, 0, sizeof sll);
691 sll.sll_family = AF_PACKET;
692 sll.sll_ifindex = ifindex;
694 (struct sockaddr *) &sll, sizeof sll) < 0) {
696 VLOG_ERR("bind to %s failed: %s", netdev_dev_get_name(netdev_dev_),
701 /* Between the socket() and bind() calls above, the socket receives all
702 * packets of the requested type on all system interfaces. We do not
703 * want to receive that data, but there is no way to avoid it. So we
704 * must now drain out the receive queue. */
705 error = drain_rcvbuf(netdev->fd);
711 *netdevp = &netdev->netdev;
715 netdev_uninit(&netdev->netdev, true);
719 /* Closes and destroys 'netdev'. */
721 netdev_linux_close(struct netdev *netdev_)
723 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* A descriptor value of 0 is valid; the other functions in this file treat
 * only 'fd < 0' as "no descriptor", so use the same test here. The guarded
 * body is skipped for "tap" devices. */
725 if (netdev->fd >= 0 && strcmp(netdev_get_type(netdev_), "tap")) {
731 /* Initializes 'svec' with a list of the names of all known network devices. */
733 netdev_linux_enumerate(struct svec *svec)
735 struct if_nameindex *names;
737 names = if_nameindex();
741 for (i = 0; names[i].if_name != NULL; i++) {
742 svec_add(svec, names[i].if_name);
744 if_freenameindex(names);
747 VLOG_WARN("could not obtain list of network device names: %s",
754 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
756 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
758 if (netdev->fd < 0) {
759 /* Device was opened with NETDEV_ETH_TYPE_NONE. */
764 ssize_t retval = read(netdev->fd, data, size);
767 } else if (errno != EINTR) {
768 if (errno != EAGAIN) {
/* The format is "... on <device>: <error text>", so the device name must be
 * the first argument and the error string the second. */
769 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
770 netdev_get_name(netdev_), strerror(errno));
777 /* Registers with the poll loop to wake up from the next call to poll_block()
778 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
780 netdev_linux_recv_wait(struct netdev *netdev_)
782 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
783 if (netdev->fd >= 0) {
784 poll_fd_wait(netdev->fd, POLLIN);
788 /* Discards all packets waiting to be received from 'netdev'. */
790 netdev_linux_drain(struct netdev *netdev_)
792 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
793 if (netdev->fd < 0) {
795 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
797 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
798 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
802 drain_fd(netdev->fd, ifr.ifr_qlen);
805 return drain_rcvbuf(netdev->fd);
809 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
810 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
811 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
812 * the packet is too big or too small to transmit on the device.
814 * The caller retains ownership of 'buffer' in all cases.
816 * The kernel maintains a packet transmission queue, so the caller is not
817 * expected to do additional queuing of packets. */
819 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
821 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
823 /* XXX should support sending even if 'ethertype' was NETDEV_ETH_TYPE_NONE.
825 if (netdev->fd < 0) {
830 ssize_t retval = write(netdev->fd, data, size);
832 /* The Linux AF_PACKET implementation never blocks waiting for room
833 * for packets, instead returning ENOBUFS. Translate this into
834 * EAGAIN for the caller. */
835 if (errno == ENOBUFS) {
837 } else if (errno == EINTR) {
839 } else if (errno != EAGAIN) {
840 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
841 netdev_get_name(netdev_), strerror(errno));
844 } else if (retval != size) {
845 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
846 "%zu) on %s", retval, size, netdev_get_name(netdev_));
854 /* Registers with the poll loop to wake up from the next call to poll_block()
855 * when the packet transmission queue has sufficient room to transmit a packet
856 * with netdev_send().
858 * The kernel maintains a packet transmission queue, so the client is not
859 * expected to do additional queuing of packets. Thus, this function is
860 * unlikely to ever be used. It is included for completeness. */
862 netdev_linux_send_wait(struct netdev *netdev_)
864 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
865 if (netdev->fd < 0) {
867 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
868 poll_fd_wait(netdev->fd, POLLOUT);
870 /* TAP device always accepts packets.*/
871 poll_immediate_wake();
875 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
876 * otherwise a positive errno value. */
878 netdev_linux_set_etheraddr(struct netdev *netdev_,
879 const uint8_t mac[ETH_ADDR_LEN])
881 struct netdev_dev_linux *netdev_dev =
882 netdev_dev_linux_cast(netdev_get_dev(netdev_));
885 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
886 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
887 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
889 netdev_dev->cache_valid |= VALID_ETHERADDR;
890 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
898 /* Copies 'netdev''s MAC address into the caller-supplied buffer 'mac'
899 * (ETH_ADDR_LEN bytes). */
901 netdev_linux_get_etheraddr(const struct netdev *netdev_,
902 uint8_t mac[ETH_ADDR_LEN])
904 struct netdev_dev_linux *netdev_dev =
905 netdev_dev_linux_cast(netdev_get_dev(netdev_));
906 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
907 int error = get_etheraddr(netdev_get_name(netdev_),
908 netdev_dev->etheraddr);
912 netdev_dev->cache_valid |= VALID_ETHERADDR;
914 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
918 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
919 * in bytes, not including the hardware header; thus, this is typically 1500
920 * bytes for Ethernet devices. */
922 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
924 struct netdev_dev_linux *netdev_dev =
925 netdev_dev_linux_cast(netdev_get_dev(netdev_));
926 if (!(netdev_dev->cache_valid & VALID_MTU)) {
930 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
931 SIOCGIFMTU, "SIOCGIFMTU");
935 netdev_dev->mtu = ifr.ifr_mtu;
936 netdev_dev->cache_valid |= VALID_MTU;
938 *mtup = netdev_dev->mtu;
942 /* Returns the ifindex of 'netdev', if successful, as a positive number.
943 * On failure, returns a negative errno value. */
945 netdev_linux_get_ifindex(const struct netdev *netdev)
949 error = get_ifindex(netdev, &ifindex);
950 return error ? -error : ifindex;
954 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
956 struct netdev_dev_linux *netdev_dev =
957 netdev_dev_linux_cast(netdev_get_dev(netdev_));
962 if (!(netdev_dev->cache_valid & VALID_CARRIER)) {
966 fn = xasprintf("/sys/class/net/%s/carrier",
967 netdev_get_name(netdev_));
968 fd = open(fn, O_RDONLY);
971 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
975 retval = read(fd, line, sizeof line);
978 if (error == EINVAL) {
979 /* This is the normal return value when we try to check carrier
980 * if the network device is not up. */
982 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
985 } else if (retval == 0) {
987 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
991 if (line[0] != '0' && line[0] != '1') {
993 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)",
997 netdev_dev->carrier = line[0] != '0';
998 netdev_dev->cache_valid |= VALID_CARRIER;
1000 *carrier = netdev_dev->carrier;
1012 netdev_linux_do_miimon(const struct netdev *netdev, int cmd,
1013 const char *cmd_name, struct mii_ioctl_data *data)
1018 memset(&ifr, 0, sizeof ifr);
1019 memcpy(&ifr.ifr_data, data, sizeof *data);
1020 error = netdev_linux_do_ioctl(netdev_get_name(netdev),
1021 &ifr, cmd, cmd_name);
1022 memcpy(data, &ifr.ifr_data, sizeof *data);
1028 netdev_linux_get_miimon(const struct netdev *netdev, bool *miimon)
1030 const char *name = netdev_get_name(netdev);
1031 struct mii_ioctl_data data;
1036 memset(&data, 0, sizeof data);
1037 error = netdev_linux_do_miimon(netdev, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1039 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1040 data.reg_num = MII_BMSR;
1041 error = netdev_linux_do_miimon(netdev, SIOCGMIIREG, "SIOCGMIIREG",
1045 *miimon = !!(data.val_out & BMSR_LSTATUS);
1047 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1050 struct ethtool_cmd ecmd;
1052 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1055 memset(&ecmd, 0, sizeof ecmd);
1056 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1059 struct ethtool_value eval;
1061 memcpy(&eval, &ecmd, sizeof eval);
1062 *miimon = !!eval.data;
1064 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1071 /* Check whether we can use RTM_GETLINK to get network device statistics.
1072 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1075 check_for_working_netlink_stats(void)
1077 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1078 * preferable, so if that works, we'll use it. */
1079 int ifindex = do_get_ifindex("lo");
1081 VLOG_WARN("failed to get ifindex for lo, "
1082 "obtaining netdev stats from proc");
1085 struct netdev_stats stats;
1086 int error = get_stats_via_netlink(ifindex, &stats);
1088 VLOG_DBG("obtaining netdev stats via rtnetlink");
1091 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1092 "via proc (you are probably running a pre-2.6.19 "
1093 "kernel)", strerror(error));
1099 /* Brings the 'is_internal' and 'is_tap' members of 'netdev_dev' up-to-date. */
1101 netdev_linux_update_is_pseudo(struct netdev_dev_linux *netdev_dev)
1103 if (!(netdev_dev->cache_valid & VALID_IS_PSEUDO)) {
1104 const char *name = netdev_dev_get_name(&netdev_dev->netdev_dev);
1105 const char *type = netdev_dev_get_type(&netdev_dev->netdev_dev);
1107 netdev_dev->is_tap = !strcmp(type, "tap");
1108 netdev_dev->is_internal = false;
1109 if (!netdev_dev->is_tap) {
1110 struct ethtool_drvinfo drvinfo;
1113 memset(&drvinfo, 0, sizeof drvinfo);
1114 error = netdev_linux_do_ethtool(name,
1115 (struct ethtool_cmd *)&drvinfo,
1117 "ETHTOOL_GDRVINFO");
1119 if (!error && !strcmp(drvinfo.driver, "openvswitch")) {
1120 netdev_dev->is_internal = true;
1124 netdev_dev->cache_valid |= VALID_IS_PSEUDO;
1129 swap_uint64(uint64_t *a, uint64_t *b)
1136 /* Retrieves current device stats for 'netdev'. */
1138 netdev_linux_get_stats(const struct netdev *netdev_,
1139 struct netdev_stats *stats)
1141 struct netdev_dev_linux *netdev_dev =
1142 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1143 static int use_netlink_stats = -1;
1146 if (netdev_dev->have_vport_stats ||
1147 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1149 error = netdev_vport_get_stats(netdev_, stats);
1150 netdev_dev->have_vport_stats = !error;
1151 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
1154 if (!netdev_dev->have_vport_stats) {
1155 if (use_netlink_stats < 0) {
1156 use_netlink_stats = check_for_working_netlink_stats();
1158 if (use_netlink_stats) {
1161 error = get_ifindex(netdev_, &ifindex);
1163 error = get_stats_via_netlink(ifindex, stats);
1166 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1170 /* If this port is an internal port then the transmit and receive stats
1171 * will appear to be swapped relative to the other ports since we are the
1172 * one sending the data, not a remote computer. For consistency, we swap
1173 * them back here. This does not apply if we are getting stats from the
1174 * vport layer because it always tracks stats from the perspective of the
1176 netdev_linux_update_is_pseudo(netdev_dev);
1177 if (!error && !netdev_dev->have_vport_stats &&
1178 (netdev_dev->is_internal || netdev_dev->is_tap)) {
1179 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1180 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1181 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1182 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1183 stats->rx_length_errors = 0;
1184 stats->rx_over_errors = 0;
1185 stats->rx_crc_errors = 0;
1186 stats->rx_frame_errors = 0;
1187 stats->rx_fifo_errors = 0;
1188 stats->rx_missed_errors = 0;
1189 stats->tx_aborted_errors = 0;
1190 stats->tx_carrier_errors = 0;
1191 stats->tx_fifo_errors = 0;
1192 stats->tx_heartbeat_errors = 0;
1193 stats->tx_window_errors = 0;
1199 /* Stores the features supported by 'netdev' into each of '*current',
1200 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1201 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
1202 * successful, otherwise a positive errno value. */
1204 netdev_linux_get_features(const struct netdev *netdev,
1205 uint32_t *current, uint32_t *advertised,
1206 uint32_t *supported, uint32_t *peer)
1208 struct ethtool_cmd ecmd;
1211 memset(&ecmd, 0, sizeof ecmd);
1212 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1213 ETHTOOL_GSET, "ETHTOOL_GSET");
1218 /* Supported features. */
1220 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1221 *supported |= OFPPF_10MB_HD;
1223 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1224 *supported |= OFPPF_10MB_FD;
1226 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1227 *supported |= OFPPF_100MB_HD;
1229 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1230 *supported |= OFPPF_100MB_FD;
1232 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1233 *supported |= OFPPF_1GB_HD;
1235 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1236 *supported |= OFPPF_1GB_FD;
1238 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1239 *supported |= OFPPF_10GB_FD;
1241 if (ecmd.supported & SUPPORTED_TP) {
1242 *supported |= OFPPF_COPPER;
1244 if (ecmd.supported & SUPPORTED_FIBRE) {
1245 *supported |= OFPPF_FIBER;
1247 if (ecmd.supported & SUPPORTED_Autoneg) {
1248 *supported |= OFPPF_AUTONEG;
1250 if (ecmd.supported & SUPPORTED_Pause) {
1251 *supported |= OFPPF_PAUSE;
1253 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1254 *supported |= OFPPF_PAUSE_ASYM;
1257 /* Advertised features. */
1259 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1260 *advertised |= OFPPF_10MB_HD;
1262 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1263 *advertised |= OFPPF_10MB_FD;
1265 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1266 *advertised |= OFPPF_100MB_HD;
1268 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1269 *advertised |= OFPPF_100MB_FD;
1271 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1272 *advertised |= OFPPF_1GB_HD;
1274 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1275 *advertised |= OFPPF_1GB_FD;
1277 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1278 *advertised |= OFPPF_10GB_FD;
1280 if (ecmd.advertising & ADVERTISED_TP) {
1281 *advertised |= OFPPF_COPPER;
1283 if (ecmd.advertising & ADVERTISED_FIBRE) {
1284 *advertised |= OFPPF_FIBER;
1286 if (ecmd.advertising & ADVERTISED_Autoneg) {
1287 *advertised |= OFPPF_AUTONEG;
1289 if (ecmd.advertising & ADVERTISED_Pause) {
1290 *advertised |= OFPPF_PAUSE;
1292 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1293 *advertised |= OFPPF_PAUSE_ASYM;
1296 /* Current settings. */
1297 if (ecmd.speed == SPEED_10) {
1298 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1299 } else if (ecmd.speed == SPEED_100) {
1300 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1301 } else if (ecmd.speed == SPEED_1000) {
1302 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1303 } else if (ecmd.speed == SPEED_10000) {
1304 *current = OFPPF_10GB_FD;
1309 if (ecmd.port == PORT_TP) {
1310 *current |= OFPPF_COPPER;
1311 } else if (ecmd.port == PORT_FIBRE) {
1312 *current |= OFPPF_FIBER;
1316 *current |= OFPPF_AUTONEG;
1319 /* Peer advertisements. */
1320 *peer = 0; /* XXX */
1325 /* Set the features advertised by 'netdev' to 'advertise'.
 *
 * The current ethtool settings are fetched with ETHTOOL_GSET, the
 * 'advertising' mask is rebuilt from the OFPPF_* bits in 'advertise', and the
 * result is written back with ETHTOOL_SSET, whose result is returned. */
1327 netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1329 struct ethtool_cmd ecmd;
1332 memset(&ecmd, 0, sizeof ecmd);
1333 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1334 ETHTOOL_GSET, "ETHTOOL_GSET");
/* Start from an empty mask so that bits absent from 'advertise' are cleared. */
1339 ecmd.advertising = 0;
1340 if (advertise & OFPPF_10MB_HD) {
1341 ecmd.advertising |= ADVERTISED_10baseT_Half;
1343 if (advertise & OFPPF_10MB_FD) {
1344 ecmd.advertising |= ADVERTISED_10baseT_Full;
1346 if (advertise & OFPPF_100MB_HD) {
1347 ecmd.advertising |= ADVERTISED_100baseT_Half;
1349 if (advertise & OFPPF_100MB_FD) {
1350 ecmd.advertising |= ADVERTISED_100baseT_Full;
1352 if (advertise & OFPPF_1GB_HD) {
1353 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1355 if (advertise & OFPPF_1GB_FD) {
1356 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1358 if (advertise & OFPPF_10GB_FD) {
1359 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1361 if (advertise & OFPPF_COPPER) {
1362 ecmd.advertising |= ADVERTISED_TP;
1364 if (advertise & OFPPF_FIBER) {
1365 ecmd.advertising |= ADVERTISED_FIBRE;
1367 if (advertise & OFPPF_AUTONEG) {
1368 ecmd.advertising |= ADVERTISED_Autoneg;
1370 if (advertise & OFPPF_PAUSE) {
1371 ecmd.advertising |= ADVERTISED_Pause;
1373 if (advertise & OFPPF_PAUSE_ASYM) {
1374 ecmd.advertising |= ADVERTISED_Asym_Pause;
1376 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1377 ETHTOOL_SSET, "ETHTOOL_SSET");
1380 /* If 'netdev_name' is the name of a VLAN network device (e.g. one created with
1381 * vconfig(8)), sets '*vlan_vid' to the VLAN VID associated with that device
1382 * and returns 0. Otherwise returns a errno value (specifically ENOENT if
1383 * 'netdev_name' is the name of a network device that is not a VLAN device) and
1384 * sets '*vlan_vid' to -1. */
1386 netdev_linux_get_vlan_vid(const struct netdev *netdev, int *vlan_vid)
1388 const char *netdev_name = netdev_get_name(netdev);
1389 struct ds line = DS_EMPTY_INITIALIZER;
1390 FILE *stream = NULL;
1394 COVERAGE_INC(netdev_get_vlan_vid);
1395 fn = xasprintf("/proc/net/vlan/%s", netdev_name);
1396 stream = fopen(fn, "r");
1402 if (ds_get_line(&line, stream)) {
1403 if (ferror(stream)) {
1405 VLOG_ERR_RL(&rl, "error reading \"%s\": %s", fn, strerror(errno));
1408 VLOG_ERR_RL(&rl, "unexpected end of file reading \"%s\"", fn);
1413 if (!sscanf(ds_cstr(&line), "%*s VID: %d", vlan_vid)) {
1415 VLOG_ERR_RL(&rl, "parse error reading \"%s\" line 1: \"%s\"",
1416 fn, ds_cstr(&line));
/* printf-style shell command templates used by netdev_linux_set_policing().
 * POLICE_ADD_CMD takes the device name.  POLICE_CONFIG_CMD takes the device
 * name, a rate (formatted as "%dkbit") and a burst (formatted as "%dk").
 *
 * NOTE(review): the visible caller passes uint32_t values for the two "%d"
 * conversions. */
1434 #define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress"
1435 #define POLICE_CONFIG_CMD "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 u32 match ip src 0.0.0.0/0 police rate %dkbit burst %dk mtu 65535 drop flowid :1"
1437 /* Remove ingress policing from 'netdev'. Returns 0 if successful, otherwise a
1438 * positive errno value.
1440 * This function is equivalent to running
1441 * /sbin/tc qdisc del dev %s handle ffff: ingress
1442 * but it is much, much faster.
1445 netdev_linux_remove_policing(struct netdev *netdev)
1447 struct netdev_dev_linux *netdev_dev =
1448 netdev_dev_linux_cast(netdev_get_dev(netdev));
1449 const char *netdev_name = netdev_get_name(netdev);
1451 struct ofpbuf request;
1452 struct tcmsg *tcmsg;
1455 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
1459 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
1460 tcmsg->tcm_parent = TC_H_INGRESS;
1461 nl_msg_put_string(&request, TCA_KIND, "ingress");
1462 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
1464 error = tc_transact(&request, NULL);
1465 if (error && error != ENOENT && error != EINVAL) {
1466 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1467 netdev_name, strerror(error));
1471 netdev_dev->kbits_rate = 0;
1472 netdev_dev->kbits_burst = 0;
1473 netdev_dev->cache_valid |= VALID_POLICING;
1477 /* Attempts to set input rate limiting (policing) policy.
 *
 * 'kbits_burst' is normalized first: 0 when 'kbits_rate' is 0, 1000 when the
 * caller passed 0 with a nonzero rate, otherwise the caller's value.  If the
 * cached rate and burst already match, nothing is reconfigured.  Otherwise
 * existing policing is removed and the two POLICE_* commands are run through
 * system(); the cache is updated after they succeed.
 *
 * NOTE(review): 'netdev_name' is interpolated into a shell command line
 * without quoting, and the uint32_t rate and burst are formatted with "%d". */
1479 netdev_linux_set_policing(struct netdev *netdev,
1480 uint32_t kbits_rate, uint32_t kbits_burst)
1482 struct netdev_dev_linux *netdev_dev =
1483 netdev_dev_linux_cast(netdev_get_dev(netdev));
1484 const char *netdev_name = netdev_get_name(netdev);
1487 COVERAGE_INC(netdev_set_policing);
1489 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1490 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1491 : kbits_burst); /* Stick with user-specified value. */
1493 if (netdev_dev->cache_valid & VALID_POLICING
1494 && netdev_dev->kbits_rate == kbits_rate
1495 && netdev_dev->kbits_burst == kbits_burst) {
1496 /* Assume that settings haven't changed since we last set them. */
/* The result of removing the old policy is not examined here. */
1500 netdev_linux_remove_policing(netdev);
1502 snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
1503 if (system(command) != 0) {
1504 VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
1508 snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
1509 kbits_rate, kbits_burst);
1510 if (system(command) != 0) {
1511 VLOG_WARN_RL(&rl, "%s: problem configuring policing",
1516 netdev_dev->kbits_rate = kbits_rate;
1517 netdev_dev->kbits_burst = kbits_burst;
1518 netdev_dev->cache_valid |= VALID_POLICING;
/* Adds to 'types' the 'ovs_name' of every entry of the null-terminated 'tcs'
 * array that has a 'tc_install' callback and a non-empty 'ovs_name'.
 * 'netdev' is unused. */
1525 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1528 const struct tc_ops **opsp;
1530 for (opsp = tcs; *opsp != NULL; opsp++) {
1531 const struct tc_ops *ops = *opsp;
1532 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1533 svec_add(types, ops->ovs_name);
/* Searches the null-terminated 'tcs' array for the entry whose 'ovs_name'
 * equals 'name'.  Callers in this file treat a null result as "no such
 * type". */
1539 static const struct tc_ops *
1540 tc_lookup_ovs_name(const char *name)
1542 const struct tc_ops **opsp;
1544 for (opsp = tcs; *opsp != NULL; opsp++) {
1545 const struct tc_ops *ops = *opsp;
1546 if (!strcmp(name, ops->ovs_name)) {
/* Searches the null-terminated 'tcs' array for the entry whose 'linux_name'
 * equals 'name'.  Entries with a null 'linux_name' are never matched. */
1553 static const struct tc_ops *
1554 tc_lookup_linux_name(const char *name)
1556 const struct tc_ops **opsp;
1558 for (opsp = tcs; *opsp != NULL; opsp++) {
1559 const struct tc_ops *ops = *opsp;
1560 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Looks in the 'hash' bucket of the 'queues' map of 'netdev''s traffic
 * control state for a queue whose 'queue_id' equals 'queue_id'.  The caller
 * supplies 'hash'; tc_find_queue() computes it as hash_int(queue_id, 0). */
1567 static struct tc_queue *
1568 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1571 struct netdev_dev_linux *netdev_dev =
1572 netdev_dev_linux_cast(netdev_get_dev(netdev));
1573 struct tc_queue *queue;
1575 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1576 if (queue->queue_id == queue_id) {
/* Convenience wrapper around tc_find_queue__() that derives the bucket hash
 * from 'queue_id' with hash_int(queue_id, 0). */
1583 static struct tc_queue *
1584 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1586 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Looks up the traffic-control implementation named 'type' with
 * tc_lookup_ovs_name() and reports its 'n_queues' in 'caps->n_queues'.
 * 'netdev' is unused. */
1590 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1592 struct netdev_qos_capabilities *caps)
1594 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1598 caps->n_queues = ops->n_queues;
/* Refreshes the cached qdisc state with tc_query_qdisc(), stores the OVS name
 * of the active traffic-control implementation in '*typep', and lets that
 * implementation's optional 'qdisc_get' callback fill in 'details'. */
1603 netdev_linux_get_qos(const struct netdev *netdev,
1604 const char **typep, struct shash *details)
1606 struct netdev_dev_linux *netdev_dev =
1607 netdev_dev_linux_cast(netdev_get_dev(netdev));
1610 error = tc_query_qdisc(netdev);
1615 *typep = netdev_dev->tc->ops->ovs_name;
/* 'qdisc_get' is optional, hence the conditional expression. */
1616 return (netdev_dev->tc->ops->qdisc_get
1617 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
/* Configures QoS of the given 'type' on 'netdev' using 'details'.
 *
 * 'type' must name an implementation that has a 'tc_install' callback.  When
 * the requested implementation is already active its optional 'qdisc_set'
 * callback is used (0 is returned when there is none).  Otherwise the existing
 * qdisc is deleted and the new implementation's 'tc_install' is called. */
1622 netdev_linux_set_qos(struct netdev *netdev,
1623 const char *type, const struct shash *details)
1625 struct netdev_dev_linux *netdev_dev =
1626 netdev_dev_linux_cast(netdev_get_dev(netdev));
1627 const struct tc_ops *new_ops;
1630 new_ops = tc_lookup_ovs_name(type);
1631 if (!new_ops || !new_ops->tc_install) {
1635 error = tc_query_qdisc(netdev);
1640 if (new_ops == netdev_dev->tc->ops) {
1641 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1643 /* Delete existing qdisc. */
1644 error = tc_del_qdisc(netdev);
/* Deleting the qdisc is expected to leave no traffic-control state behind. */
1648 assert(netdev_dev->tc == NULL);
1650 /* Install new qdisc. */
1651 error = new_ops->tc_install(netdev, details);
/* 'tc_install' must set 'netdev_dev->tc' exactly when it succeeds. */
1652 assert((error == 0) == (netdev_dev->tc != NULL));
/* Refreshes the qdisc state, finds queue 'queue_id' with tc_find_queue(), and
 * has the active implementation's 'class_get' callback describe it in
 * 'details'. */
1659 netdev_linux_get_queue(const struct netdev *netdev,
1660 unsigned int queue_id, struct shash *details)
1662 struct netdev_dev_linux *netdev_dev =
1663 netdev_dev_linux_cast(netdev_get_dev(netdev));
1666 error = tc_query_qdisc(netdev);
1670 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1672 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
/* Refreshes the qdisc state and configures queue 'queue_id' from 'details'
 * through the active implementation's 'class_set' callback.  The request is
 * refused when 'queue_id' is not below the implementation's 'n_queues' or
 * when there is no 'class_set' callback. */
1678 netdev_linux_set_queue(struct netdev *netdev,
1679 unsigned int queue_id, const struct shash *details)
1681 struct netdev_dev_linux *netdev_dev =
1682 netdev_dev_linux_cast(netdev_get_dev(netdev));
1685 error = tc_query_qdisc(netdev);
1688 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1689 || !netdev_dev->tc->ops->class_set) {
1693 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
/* Refreshes the qdisc state and deletes queue 'queue_id' through the active
 * implementation's 'class_delete' callback.  A missing callback is handled as
 * a separate case before the queue is looked up. */
1697 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1699 struct netdev_dev_linux *netdev_dev =
1700 netdev_dev_linux_cast(netdev_get_dev(netdev));
1703 error = tc_query_qdisc(netdev);
1706 } else if (!netdev_dev->tc->ops->class_delete) {
1709 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1711 ? netdev_dev->tc->ops->class_delete(netdev, queue)
/* Refreshes the qdisc state and retrieves statistics for queue 'queue_id'
 * into '*stats' through the active implementation's 'class_get_stats'
 * callback.  A missing callback is handled as a separate case before the
 * queue is looked up. */
1717 netdev_linux_get_queue_stats(const struct netdev *netdev,
1718 unsigned int queue_id,
1719 struct netdev_queue_stats *stats)
1721 struct netdev_dev_linux *netdev_dev =
1722 netdev_dev_linux_cast(netdev_get_dev(netdev));
1725 error = tc_query_qdisc(netdev);
1728 } else if (!netdev_dev->tc->ops->class_get_stats) {
1731 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1733 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
/* Starts a Netlink dump of the traffic classes on 'netdev' into 'dump' by
 * sending an RTM_GETTCLASS request with 'tcm_parent' 0 on 'rtnl_sock'.  The
 * request buffer is released once the dump has been started.  Callers in this
 * file treat a false result as failure to start. */
1739 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1741 struct ofpbuf request;
1742 struct tcmsg *tcmsg;
1744 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1748 tcmsg->tcm_parent = 0;
1749 nl_dump_start(dump, rtnl_sock, &request);
1750 ofpbuf_uninit(&request);
/* Calls 'cb' once for every queue cached for 'netdev', passing the queue id,
 * the details produced by the active implementation's 'class_get' callback,
 * and 'aux'.  A single 'details' table is cleared and reused for each queue
 * and destroyed at the end. */
1755 netdev_linux_dump_queues(const struct netdev *netdev,
1756 netdev_dump_queues_cb *cb, void *aux)
1758 struct netdev_dev_linux *netdev_dev =
1759 netdev_dev_linux_cast(netdev_get_dev(netdev));
1760 struct tc_queue *queue;
1761 struct shash details;
1765 error = tc_query_qdisc(netdev);
1768 } else if (!netdev_dev->tc->ops->class_get) {
1773 shash_init(&details);
1774 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
1775 shash_clear(&details);
1777 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
1779 (*cb)(queue->queue_id, &details, aux);
1784 shash_destroy(&details);
/* Dumps the kernel's traffic classes for 'netdev' and hands each reply to the
 * active implementation's 'class_dump_stats' callback together with 'cb' and
 * 'aux'.  The result of nl_dump_done() takes precedence over 'last_error' in
 * the return value. */
1790 netdev_linux_dump_queue_stats(const struct netdev *netdev,
1791 netdev_dump_queue_stats_cb *cb, void *aux)
1793 struct netdev_dev_linux *netdev_dev =
1794 netdev_dev_linux_cast(netdev_get_dev(netdev));
1795 struct nl_dump dump;
1800 error = tc_query_qdisc(netdev);
1803 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1808 if (!start_queue_dump(netdev, &dump)) {
1811 while (nl_dump_next(&dump, &msg)) {
1812 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1818 error = nl_dump_done(&dump);
1819 return error ? error : last_error;
/* Copies the cached IPv4 address and netmask of 'netdev_' into '*address' and
 * '*netmask'.  When VALID_IN4 is not set in the cache, both values are first
 * queried with SIOCGIFADDR and SIOCGIFNETMASK.  The final return yields
 * EADDRNOTAVAIL when the address is INADDR_ANY and 0 otherwise. */
1823 netdev_linux_get_in4(const struct netdev *netdev_,
1824 struct in_addr *address, struct in_addr *netmask)
1826 struct netdev_dev_linux *netdev_dev =
1827 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1829 if (!(netdev_dev->cache_valid & VALID_IN4)) {
1832 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
1833 SIOCGIFADDR, "SIOCGIFADDR");
1838 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
1839 SIOCGIFNETMASK, "SIOCGIFNETMASK");
1844 netdev_dev->cache_valid |= VALID_IN4;
1846 *address = netdev_dev->address;
1847 *netmask = netdev_dev->netmask;
1848 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
/* Assigns IPv4 'address' to 'netdev_' with SIOCSIFADDR, records 'address' and
 * 'netmask' in the device cache, and, unless 'address' is INADDR_ANY, also
 * sets 'netmask' with SIOCSIFNETMASK. */
1852 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
1853 struct in_addr netmask)
1855 struct netdev_dev_linux *netdev_dev =
1856 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1859 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
1861 netdev_dev->cache_valid |= VALID_IN4;
1862 netdev_dev->address = address;
1863 netdev_dev->netmask = netmask;
1864 if (address.s_addr != INADDR_ANY) {
1865 error = do_set_addr(netdev_, SIOCSIFNETMASK,
1866 "SIOCSIFNETMASK", netmask);
/* Parses 'line', one line of /proc/net/if_inet6, into the 16 address bytes of
 * '*in6' and the interface name 'ifname' (at most 16 characters plus the
 * terminator).  The four hexadecimal fields between the address and the name
 * are skipped.  Returns true only if all 17 fields were converted. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *s6 = in6->s6_addr;
    int n_fields;

    /* Each address byte is exactly two hex digits. */
#define X8 "%2"SCNx8
    n_fields = sscanf(line,
                      " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                      "%*x %*x %*x %*x %16s\n",
                      &s6[0], &s6[1], &s6[2], &s6[3],
                      &s6[4], &s6[5], &s6[6], &s6[7],
                      &s6[8], &s6[9], &s6[10], &s6[11],
                      &s6[12], &s6[13], &s6[14], &s6[15],
                      ifname);
#undef X8

    return n_fields == 17;
}
1888 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
 * 'in6' is non-null) and returns true.  Otherwise, returns false.
 *
 * The address is cached: /proc/net/if_inet6 is scanned only while VALID_IN6
 * is clear, and the cached value starts out as in6addr_any.
 *
 * NOTE(review): the visible code stores through 'in6' unconditionally, and
 * the return statement is not visible here; confirm the contract stated
 * above against the actual return value and callers. */
1891 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
1893 struct netdev_dev_linux *netdev_dev =
1894 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1895 if (!(netdev_dev->cache_valid & VALID_IN6)) {
1899 netdev_dev->in6 = in6addr_any;
1901 file = fopen("/proc/net/if_inet6", "r");
1903 const char *name = netdev_get_name(netdev_);
1904 while (fgets(line, sizeof line, file)) {
1905 struct in6_addr in6_tmp;
1906 char ifname[16 + 1];
/* Keep the address from a line whose interface name is this device's. */
1907 if (parse_if_inet6_line(line, &in6_tmp, ifname)
1908 && !strcmp(name, ifname))
1910 netdev_dev->in6 = in6_tmp;
1916 netdev_dev->cache_valid |= VALID_IN6;
1918 *in6 = netdev_dev->in6;
/* Fills '*sa' with an AF_INET socket address for 'addr' with port 0.
 *
 * '*sa' is zeroed first and then overwritten with a 'struct sockaddr_in', so
 * the caller's buffer must be at least as large as a 'struct sockaddr_in'. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in sin;

    memset(&sin, 0, sizeof sin);
    sin.sin_family = AF_INET;
    sin.sin_addr = addr;
    sin.sin_port = 0;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &sin, sizeof sin);
}
1936 do_set_addr(struct netdev *netdev,
1937 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
1940 strncpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
1941 make_in4_sockaddr(&ifr.ifr_addr, addr);
1943 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
1947 /* Adds 'router' as a default IP gateway.
 *
 * The route has destination and genmask INADDR_ANY and flags
 * RTF_UP | RTF_GATEWAY, and is installed with ioctl(SIOCADDRT) on
 * 'af_inet_sock'.  'netdev' is unused. */
1949 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
1951 struct in_addr any = { INADDR_ANY };
1955 memset(&rt, 0, sizeof rt);
1956 make_in4_sockaddr(&rt.rt_dst, any);
1957 make_in4_sockaddr(&rt.rt_gateway, router);
1958 make_in4_sockaddr(&rt.rt_genmask, any);
1959 rt.rt_flags = RTF_UP | RTF_GATEWAY;
/* Capture errno right at the failing call. */
1960 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
1962 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
/* Scans /proc/net/route for a route that is up and whose masked destination
 * matches '*host'.  For a match, '*next_hop' receives either 0 (host directly
 * reachable) or the route's gateway, and '*netdev_name' receives an
 * xstrdup()'d copy of the interface name.  '*netdev_name' is set to NULL
 * before the file is opened.
 *
 * NOTE(review): the copy stored in '*netdev_name' is presumably owned by the
 * caller; confirm against callers. */
1968 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
1971 static const char fn[] = "/proc/net/route";
1976 *netdev_name = NULL;
1977 stream = fopen(fn, "r");
1978 if (stream == NULL) {
1979 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
1984 while (fgets(line, sizeof line, stream)) {
1987 uint32_t dest, gateway, mask;
1988 int refcnt, metric, mtu;
1989 unsigned int flags, use, window, irtt;
/* A line is usable only if all 11 fields convert. */
1992 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
1994 iface, &dest, &gateway, &flags, &refcnt,
1995 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
1997 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2001 if (!(flags & RTF_UP)) {
2002 /* Skip routes that aren't up. */
2006 /* The values of 'dest', 'mask', and 'gateway' were given in
 * network byte order, so we don't need any endian
 * conversions here. */
2009 if ((dest & mask) == (host->s_addr & mask)) {
2011 /* The host is directly reachable. */
2012 next_hop->s_addr = 0;
2014 /* To reach the host, we must go through a gateway. */
2015 next_hop->s_addr = gateway;
2017 *netdev_name = xstrdup(iface);
/* Queries the driver information of 'netdev' with ETHTOOL_GDRVINFO and adds
 * it to 'sh' under the keys "driver_name", "driver_version" and
 * "firmware_version".  Each value is an xstrdup()'d copy. */
2029 netdev_linux_get_status(const struct netdev *netdev, struct shash *sh)
2031 struct ethtool_drvinfo drvinfo;
2034 memset(&drvinfo, 0, sizeof drvinfo);
2035 error = netdev_linux_do_ethtool(netdev_get_name(netdev),
2036 (struct ethtool_cmd *)&drvinfo,
2038 "ETHTOOL_GDRVINFO");
2040 shash_add(sh, "driver_name", xstrdup(drvinfo.driver));
2041 shash_add(sh, "driver_version", xstrdup(drvinfo.version));
2042 shash_add(sh, "firmware_version", xstrdup(drvinfo.fw_version));
2048 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2049 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2050 * returns 0. Otherwise, it returns a positive errno value; in particular,
2051 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2053 netdev_linux_arp_lookup(const struct netdev *netdev,
2054 uint32_t ip, uint8_t mac[ETH_ADDR_LEN])
2057 struct sockaddr_in sin;
2060 memset(&r, 0, sizeof r);
2061 sin.sin_family = AF_INET;
2062 sin.sin_addr.s_addr = ip;
2064 memcpy(&r.arp_pa, &sin, sizeof sin);
2065 r.arp_ha.sa_family = ARPHRD_ETHER;
2067 strncpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2068 COVERAGE_INC(netdev_arp_lookup);
2069 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2071 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2072 } else if (retval != ENXIO) {
2073 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2074 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
/* Translates the NETDEV_UP and NETDEV_PROMISC bits of 'nd'.
 *
 * NOTE(review): going by the name and by the use in
 * netdev_linux_update_flags(), the result is presumably the matching IFF_*
 * bit mask; the assignments themselves are not visible here. */
2080 nd_to_iff_flags(enum netdev_flags nd)
2083 if (nd & NETDEV_UP) {
2086 if (nd & NETDEV_PROMISC) {
/* Translates the IFF_* bits in 'iff' into "enum netdev_flags" bits, starting
 * from an empty set.  IFF_PROMISC maps to NETDEV_PROMISC. */
2093 iff_to_nd_flags(int iff)
2095 enum netdev_flags nd = 0;
2099 if (iff & IFF_PROMISC) {
2100 nd |= NETDEV_PROMISC;
/* Reads the interface flags of 'netdev', reports them (as netdev flags) in
 * '*old_flagsp', then clears the bits in 'off' and sets the bits in 'on'.
 * The new flags are written to the device only when they differ from the old
 * ones. */
2106 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2107 enum netdev_flags on, enum netdev_flags *old_flagsp)
2109 int old_flags, new_flags;
2112 error = get_flags(netdev, &old_flags);
2114 *old_flagsp = iff_to_nd_flags(old_flags);
2115 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2116 if (new_flags != old_flags) {
2117 error = set_flags(netdev, new_flags);
2124 poll_notify(struct list *list)
2126 struct netdev_linux_notifier *notifier;
2127 LIST_FOR_EACH (notifier, node, list) {
2128 struct netdev_notifier *n = ¬ifier->notifier;
/* rtnetlink link-change callback registered by netdev_linux_poll_add().
 *
 * One path looks up a single notifier list in 'netdev_linux_notifiers'; the
 * other walks every list in that table and calls poll_notify() on each.
 * 'aux' is unused.
 *
 * NOTE(review): the condition selecting between the two paths is not visible
 * here; presumably it is whether 'change' is non-null. */
2134 netdev_linux_poll_cb(const struct rtnetlink_link_change *change,
2135 void *aux OVS_UNUSED)
2138 struct list *list = shash_find_data(&netdev_linux_notifiers,
2144 struct shash_node *node;
2145 SHASH_FOR_EACH (node, &netdev_linux_notifiers) {
2146 poll_notify(node->data);
2152 netdev_linux_poll_add(struct netdev *netdev,
2153 void (*cb)(struct netdev_notifier *), void *aux,
2154 struct netdev_notifier **notifierp)
2156 const char *netdev_name = netdev_get_name(netdev);
2157 struct netdev_linux_notifier *notifier;
2160 if (shash_is_empty(&netdev_linux_notifiers)) {
2162 error = rtnetlink_link_notifier_register(&netdev_linux_poll_notifier,
2163 netdev_linux_poll_cb, NULL);
2169 list = shash_find_data(&netdev_linux_notifiers, netdev_name);
2171 list = xmalloc(sizeof *list);
2173 shash_add(&netdev_linux_notifiers, netdev_name, list);
2176 notifier = xmalloc(sizeof *notifier);
2177 netdev_notifier_init(¬ifier->notifier, netdev, cb, aux);
2178 list_push_back(list, ¬ifier->node);
2179 *notifierp = ¬ifier->notifier;
2184 netdev_linux_poll_remove(struct netdev_notifier *notifier_)
2186 struct netdev_linux_notifier *notifier =
2187 CONTAINER_OF(notifier_, struct netdev_linux_notifier, notifier);
2190 /* Remove 'notifier' from its list. */
2191 list = list_remove(¬ifier->node);
2192 if (list_is_empty(list)) {
2193 /* The list is now empty. Remove it from the hash and free it. */
2194 const char *netdev_name = netdev_get_name(notifier->notifier.netdev);
2195 shash_delete(&netdev_linux_notifiers,
2196 shash_find(&netdev_linux_notifiers, netdev_name));
2201 /* If that was the last notifier, unregister. */
2202 if (shash_is_empty(&netdev_linux_notifiers)) {
2203 rtnetlink_link_notifier_unregister(&netdev_linux_poll_notifier);
/* Expands to the body of a 'struct netdev_class' initializer that wires in
 * the netdev_linux_* functions of this file.  NAME, CREATE, ENUMERATE and
 * SET_STATS are the parts that differ between the classes instantiated
 * below.  The initializer is positional, so the order of the entries has to
 * follow the member order of the struct being initialized. */
2207 #define NETDEV_LINUX_CLASS(NAME, CREATE, ENUMERATE, SET_STATS) \
2211 netdev_linux_init, \
2213 netdev_linux_wait, \
2216 netdev_linux_destroy, \
2217 NULL, /* reconfigure */ \
2219 netdev_linux_open, \
2220 netdev_linux_close, \
2224 netdev_linux_recv, \
2225 netdev_linux_recv_wait, \
2226 netdev_linux_drain, \
2228 netdev_linux_send, \
2229 netdev_linux_send_wait, \
2231 netdev_linux_set_etheraddr, \
2232 netdev_linux_get_etheraddr, \
2233 netdev_linux_get_mtu, \
2234 netdev_linux_get_ifindex, \
2235 netdev_linux_get_carrier, \
2236 netdev_linux_get_miimon, \
2237 netdev_linux_get_stats, \
2240 netdev_linux_get_features, \
2241 netdev_linux_set_advertisements, \
2242 netdev_linux_get_vlan_vid, \
2244 netdev_linux_set_policing, \
2245 netdev_linux_get_qos_types, \
2246 netdev_linux_get_qos_capabilities, \
2247 netdev_linux_get_qos, \
2248 netdev_linux_set_qos, \
2249 netdev_linux_get_queue, \
2250 netdev_linux_set_queue, \
2251 netdev_linux_delete_queue, \
2252 netdev_linux_get_queue_stats, \
2253 netdev_linux_dump_queues, \
2254 netdev_linux_dump_queue_stats, \
2256 netdev_linux_get_in4, \
2257 netdev_linux_set_in4, \
2258 netdev_linux_get_in6, \
2259 netdev_linux_add_router, \
2260 netdev_linux_get_next_hop, \
2261 netdev_linux_get_status, \
2262 netdev_linux_arp_lookup, \
2264 netdev_linux_update_flags, \
2266 netdev_linux_poll_add, \
2267 netdev_linux_poll_remove \
/* The three netdev classes provided by this file.  They differ in the create
 * function, in whether an enumerate function is provided, and in the
 * set_stats function. */

/* Uses netdev_linux_create and is the only class with an enumerate
 * function. */
2270 const struct netdev_class netdev_linux_class =
2273 netdev_linux_create,
2274 netdev_linux_enumerate,
2275 NULL); /* set_stats */
/* Created with netdev_linux_create_tap. */
2277 const struct netdev_class netdev_tap_class =
2280 netdev_linux_create_tap,
2281 NULL, /* enumerate */
2282 NULL); /* set_stats */
/* The only class that provides set_stats (netdev_vport_set_stats). */
2284 const struct netdev_class netdev_internal_class =
2287 netdev_linux_create,
2288 NULL, /* enumerate */
2289 netdev_vport_set_stats);
2291 /* HTB traffic control class. */
/* Upper bound on HTB class minor numbers that htb_parse_tcmsg__() accepts as
 * queues; queue id N corresponds to minor number N + 1 there. */
2293 #define HTB_N_QUEUES 0xf000
/* Member of the per-device HTB state ('struct htb'). */
2297 unsigned int max_rate; /* In bytes/s. */
/* Members of the per-queue HTB state ('struct htb_class').  htb_class_cast__()
 * recovers the enclosing structure from 'tc_queue'. */
2301 struct tc_queue tc_queue;
2302 unsigned int min_rate; /* In bytes/s. */
2303 unsigned int max_rate; /* In bytes/s. */
2304 unsigned int burst; /* In bytes. */
2305 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the 'struct htb' that embeds the traffic-control state of 'netdev'.
 * Valid only while that state was installed by htb_install__(), since the
 * conversion relies on 'tc' being the 'tc' member of a 'struct htb'. */
2309 htb_get__(const struct netdev *netdev)
2311 struct netdev_dev_linux *netdev_dev =
2312 netdev_dev_linux_cast(netdev_get_dev(netdev));
2313 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
/* Allocates a new 'struct htb' with the given 'max_rate', initializes its
 * traffic-control state for 'tc_ops_htb', and makes it the traffic-control
 * state of 'netdev'.
 *
 * NOTE(review): 'max_rate' arrives as uint64_t but the 'max_rate' member
 * shown in this file is an unsigned int, so larger values are narrowed by the
 * assignment. */
2317 htb_install__(struct netdev *netdev, uint64_t max_rate)
2319 struct netdev_dev_linux *netdev_dev =
2320 netdev_dev_linux_cast(netdev_get_dev(netdev));
2323 htb = xmalloc(sizeof *htb);
2324 tc_init(&htb->tc, &tc_ops_htb);
2325 htb->max_rate = max_rate;
2327 netdev_dev->tc = &htb->tc;
2332 /* Create an HTB qdisc.
 *
 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1".
 *
 * Any existing qdisc is deleted first.  The new qdisc is requested with
 * RTM_NEWQDISC (NLM_F_EXCL | NLM_F_CREATE), handle 1:0 and parent TC_H_ROOT,
 * and the result of the Netlink transaction is returned. */
2336 htb_setup_qdisc__(struct netdev *netdev)
2339 struct tc_htb_glob opt;
2340 struct ofpbuf request;
2341 struct tcmsg *tcmsg;
/* The result of the deletion is not examined. */
2343 tc_del_qdisc(netdev);
2345 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2346 NLM_F_EXCL | NLM_F_CREATE, &request);
2350 tcmsg->tcm_handle = tc_make_handle(1, 0);
2351 tcmsg->tcm_parent = TC_H_ROOT;
2353 nl_msg_put_string(&request, TCA_KIND, "htb");
2355 memset(&opt, 0, sizeof opt);
2356 opt.rate2quantum = 10;
/* The HTB options travel as TCA_HTB_INIT nested inside TCA_OPTIONS. */
2360 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2361 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2362 nl_msg_end_nested(&request, opt_offset);
2364 return tc_transact(&request, NULL);
2367 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>".
 *
 * The rate, ceiling, buffers and priority are derived from '*class' and the
 * device MTU, then sent in an RTM_NEWTCLASS request (NLM_F_CREATE).  A failed
 * transaction is logged with all of the class parameters.
 *
 * NOTE(review): the result of netdev_get_mtu() is not examined in the visible
 * code. */
2370 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2371 unsigned int parent, struct htb_class *class)
2374 struct tc_htb_opt opt;
2375 struct ofpbuf request;
2376 struct tcmsg *tcmsg;
2380 netdev_get_mtu(netdev, &mtu);
2382 memset(&opt, 0, sizeof opt);
2383 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2384 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
/* Both buffers are computed from the same 'burst', one per rate. */
2385 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2386 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2387 opt.prio = class->priority;
2389 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2393 tcmsg->tcm_handle = handle;
2394 tcmsg->tcm_parent = parent;
2396 nl_msg_put_string(&request, TCA_KIND, "htb");
2397 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2398 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2399 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2400 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2401 nl_msg_end_nested(&request, opt_offset);
2403 error = tc_transact(&request, NULL);
2405 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2406 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2407 netdev_get_name(netdev),
2408 tc_get_major(handle), tc_get_minor(handle),
2409 tc_get_major(parent), tc_get_minor(parent),
2410 class->min_rate, class->max_rate,
2411 class->burst, class->priority, strerror(error));
2416 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2417 * description of them into 'details'. The description complies with the
2418 * specification given in the vswitch database documentation for linux-htb
2421 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2423 static const struct nl_policy tca_htb_policy[] = {
2424 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2425 .min_len = sizeof(struct tc_htb_opt) },
2428 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2429 const struct tc_htb_opt *htb;
2431 if (!nl_parse_nested(nl_options, tca_htb_policy,
2432 attrs, ARRAY_SIZE(tca_htb_policy))) {
2433 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2437 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2438 class->min_rate = htb->rate.rate;
2439 class->max_rate = htb->ceil.rate;
2440 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2441 class->priority = htb->prio;
/* Parses the traffic-class message 'tcmsg' with tc_parse_class().
 *
 * If 'queue_id' is nonnull, the class handle is translated to a queue id: a
 * handle with major number 1 and a minor number in 1...HTB_N_QUEUES maps to
 * queue id minor - 1.  If 'options' is nonnull, the HTB options are decoded
 * into it with htb_parse_tca_options__().  'stats' is passed through to
 * tc_parse_class(). */
2446 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2447 struct htb_class *options,
2448 struct netdev_queue_stats *stats)
2450 struct nlattr *nl_options;
2451 unsigned int handle;
2454 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2455 if (!error && queue_id) {
2456 unsigned int major = tc_get_major(handle);
2457 unsigned int minor = tc_get_minor(handle);
2458 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2459 *queue_id = minor - 1;
2464 if (!error && options) {
2465 error = htb_parse_tca_options__(nl_options, options);
2471 htb_parse_qdisc_details__(struct netdev *netdev,
2472 const struct shash *details, struct htb_class *hc)
2474 const char *max_rate_s;
2476 max_rate_s = shash_find_data(details, "max-rate");
2477 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2478 if (!hc->max_rate) {
2481 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2482 hc->max_rate = netdev_features_to_bps(current) / 8;
2484 hc->min_rate = hc->max_rate;
/* Fills in 'hc' from the per-queue settings in 'details'.
 *
 * Rates and burst are given as decimal strings and divided by 8 before use:
 *
 *   - "min-rate" is raised to at least 1500 and then capped at the qdisc's
 *     max_rate.
 *   - "max-rate", when present, is raised to at least the min_rate computed
 *     above and then capped at the qdisc's max_rate.
 *   - "burst" counts as 0 when missing and is raised to at least the device
 *     MTU plus 64.
 *
 * "priority" is parsed as a decimal integer, 0 when missing.
 *
 * htb_class_set() treats the return value as an error code. */
2490 htb_parse_class_details__(struct netdev *netdev,
2491 const struct shash *details, struct htb_class *hc)
2493 const struct htb *htb = htb_get__(netdev);
2494 const char *min_rate_s = shash_find_data(details, "min-rate");
2495 const char *max_rate_s = shash_find_data(details, "max-rate");
2496 const char *burst_s = shash_find_data(details, "burst");
2497 const char *priority_s = shash_find_data(details, "priority");
2500 /* min-rate. Don't allow a min-rate below 1500 bytes/s. */
2502 /* min-rate is required. */
2505 hc->min_rate = strtoull(min_rate_s, NULL, 10) / 8;
2506 hc->min_rate = MAX(hc->min_rate, 1500);
2507 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2510 hc->max_rate = (max_rate_s
2511 ? strtoull(max_rate_s, NULL, 10) / 8
2513 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2514 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2518 * According to hints in the documentation that I've read, it is important
2519 * that 'burst' be at least as big as the largest frame that might be
2520 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2521 * but having it a bit too small is a problem. Since netdev_get_mtu()
2522 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2523 * the MTU. We actually add 64, instead of 14, as a guard against
2524 * additional headers get tacked on somewhere that we're not aware of. */
2525 netdev_get_mtu(netdev, &mtu);
2526 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2527 hc->burst = MAX(hc->burst, mtu + 64);
2530 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for class 'handle' (child of 'parent') on 'netdev' with
 * tc_query_class() and parses the reply with htb_parse_tcmsg__() into
 * 'options' and 'stats'.  No queue ID is requested (NULL is passed for it).
 * Callers in this file pass NULL for whichever of 'options' and 'stats' they
 * do not need.  The reply buffer is deleted after it has been parsed. */
2536 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2537 unsigned int parent, struct htb_class *options,
2538 struct netdev_queue_stats *stats)
2540 struct ofpbuf *reply;
2543 error = tc_query_class(netdev, handle, parent, &reply);
2545 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2546 ofpbuf_delete(reply);
/* Sets up HTB on 'netdev': creates the qdisc with htb_setup_qdisc__(),
 * configures class 1:0xfffe (parent 1:0) from the qdisc-wide settings in
 * 'details', and records the resulting max_rate through htb_install__(). */
2552 htb_tc_install(struct netdev *netdev, const struct shash *details)
2556 error = htb_setup_qdisc__(netdev);
2558 struct htb_class hc;
2560 htb_parse_qdisc_details__(netdev, details, &hc);
2561 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2562 tc_make_handle(1, 0), &hc);
2564 htb_install__(netdev, hc.max_rate);
/* Converts 'queue' into the struct htb_class that embeds it as its
 * 'tc_queue' member.  The conversion is a plain CONTAINER_OF with no check
 * that 'queue' really belongs to an htb_class. */
2570 static struct htb_class *
2571 htb_class_cast__(const struct tc_queue *queue)
2573 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Records the parameters in 'hc' for 'queue_id' in the in-memory HTB state of
 * 'netdev'.
 *
 * The queue is looked up with tc_find_queue__() under hash_int(queue_id, 0).
 * An entry obtained from that lookup is converted with htb_class_cast__();
 * a newly allocated entry gets 'queue_id' and is inserted into
 * 'htb->tc.queues' under the same hash.  min_rate, max_rate, burst and
 * priority are then copied from 'hc'. */
2577 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2578 const struct htb_class *hc)
2580 struct htb *htb = htb_get__(netdev);
2581 size_t hash = hash_int(queue_id, 0);
2582 struct tc_queue *queue;
2583 struct htb_class *hcp;
2585 queue = tc_find_queue__(netdev, queue_id, hash);
2587 hcp = htb_class_cast__(queue);
2589 hcp = xmalloc(sizeof *hcp);
2590 queue = &hcp->tc_queue;
2591 queue->queue_id = queue_id;
2592 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2595 hcp->min_rate = hc->min_rate;
2596 hcp->max_rate = hc->max_rate;
2597 hcp->burst = hc->burst;
2598 hcp->priority = hc->priority;
/* Rebuilds the in-memory HTB state of 'netdev' from the kernel.
 *
 * Class 1:0xfffe is queried for the qdisc-wide max_rate, which is passed to
 * htb_install__().  The classes are then dumped, and every message that
 * htb_parse_tcmsg__() parses successfully (it returns 0) is recorded with
 * htb_update_queue__().  'nlmsg' is not used. */
2602 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2605 struct nl_dump dump;
2606 struct htb_class hc;
2609 /* Get qdisc options. */
2611 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2612 htb = htb_install__(netdev, hc.max_rate);
2615 if (!start_queue_dump(netdev, &dump)) {
2618 while (nl_dump_next(&dump, &msg)) {
2619 unsigned int queue_id;
2621 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2622 htb_update_queue__(netdev, queue_id, &hc);
2625 nl_dump_done(&dump);
/* Tears down the HTB state that embeds 'tc'.  Every queue is removed from
 * 'htb->tc.queues'; the _SAFE iteration macro is used because entries are
 * removed while the map is being walked. */
2631 htb_tc_destroy(struct tc *tc)
2633 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2634 struct htb_class *hc, *next;
2636 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2637 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Adds "max-rate" to 'details' as a newly allocated decimal string holding 8
 * times the max_rate stored for 'netdev''s HTB qdisc. */
2645 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2647 const struct htb *htb = htb_get__(netdev);
2648 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
/* Applies new qdisc-wide settings from 'details': they are parsed with
 * htb_parse_qdisc_details__(), written to class 1:0xfffe (parent 1:0) with
 * htb_setup_class__(), and the new max_rate is stored in the in-memory HTB
 * state of 'netdev'. */
2653 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2655 struct htb_class hc;
2658 htb_parse_qdisc_details__(netdev, details, &hc);
2659 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2660 tc_make_handle(1, 0), &hc);
2662 htb_get__(netdev)->max_rate = hc.max_rate;
/* Reports the stored parameters of 'queue' in 'details' as newly allocated
 * decimal strings.  "min-rate" is reported as 8 times the stored value;
 * "max-rate" (also times 8) is reported when it differs from min_rate.
 * "burst" (times 8) and "priority" are reported by the statements below.
 * 'netdev' is not used. */
2668 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2669 const struct tc_queue *queue, struct shash *details)
2671 const struct htb_class *hc = htb_class_cast__(queue);
2673 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2674 if (hc->min_rate != hc->max_rate) {
2675 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2677 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2679 shash_add(details, "priority", xasprintf("%u", hc->priority));
/* Configures queue 'queue_id' on 'netdev' from 'details': the settings are
 * parsed with htb_parse_class_details__(), installed in the kernel as class
 * 1:(queue_id + 1) under parent 1:0xfffe, and then mirrored into the
 * in-memory queue map with htb_update_queue__(). */
2685 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2686 const struct shash *details)
2688 struct htb_class hc;
2691 error = htb_parse_class_details__(netdev, details, &hc);
2696 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2697 tc_make_handle(1, 0xfffe), &hc);
2702 htb_update_queue__(netdev, queue_id, &hc);
/* Deletes 'queue' from 'netdev': kernel class 1:(queue_id + 1) is deleted
 * with tc_delete_class(), and the queue is removed from the in-memory
 * 'htb->tc.queues' map. */
2707 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2709 struct htb_class *hc = htb_class_cast__(queue);
2710 struct htb *htb = htb_get__(netdev);
2713 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2715 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches the kernel's statistics for 'queue' into 'stats' by querying class
 * 1:(queue_id + 1) under parent 1:0xfffe.  Class options are not requested.
 * Returns whatever htb_query_class__() returns. */
2722 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2723 struct netdev_queue_stats *stats)
2725 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2726 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one class message from a statistics dump.  The handle and the
 * statistics are extracted with tc_parse_class(); when the handle is 1:minor
 * with 1 <= minor <= HTB_N_QUEUES, 'cb' is called with queue ID minor - 1,
 * the statistics and 'aux'.  'netdev' is not used. */
2730 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2731 const struct ofpbuf *nlmsg,
2732 netdev_dump_queue_stats_cb *cb, void *aux)
2734 struct netdev_queue_stats stats;
2735 unsigned int handle, major, minor;
2738 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2743 major = tc_get_major(handle);
2744 minor = tc_get_minor(handle);
2745 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2746 (*cb)(minor - 1, &stats, aux);
/* Operations table for the HTB qdisc: kernel qdisc name "htb", OVS QoS type
 * "linux-htb", with room for HTB_N_QUEUES queues. */
2751 static const struct tc_ops tc_ops_htb = {
2752 "htb", /* linux_name */
2753 "linux-htb", /* ovs_name */
2754 HTB_N_QUEUES, /* n_queues */
2763 htb_class_get_stats,
2764 htb_class_dump_stats
2767 /* "linux-hfsc" traffic control class. */
/* Largest class minor number that is treated as a queue: a class handle
 * 1:minor with 1 <= minor <= HFSC_N_QUEUES corresponds to queue ID
 * minor - 1. */
2769 #define HFSC_N_QUEUES 0xf000
2777 struct tc_queue tc_queue;
/* Returns the struct hfsc that embeds the 'tc' currently attached to
 * 'netdev''s device.  The conversion is an unchecked CONTAINER_OF, so it is
 * only meaningful while that 'tc' is embedded in a struct hfsc. */
2782 static struct hfsc *
2783 hfsc_get__(const struct netdev *netdev)
2785 struct netdev_dev_linux *netdev_dev;
2786 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2787 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
/* Converts 'queue' into the struct hfsc_class that embeds it as its
 * 'tc_queue' member.  The conversion is a plain CONTAINER_OF with no check
 * that 'queue' really belongs to an hfsc_class. */
2790 static struct hfsc_class *
2791 hfsc_class_cast__(const struct tc_queue *queue)
2793 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Creates the in-memory HFSC state for 'netdev': allocates a struct hfsc,
 * initializes its embedded 'tc' with tc_ops_hfsc, stores 'max_rate', and
 * attaches it to the device as 'netdev_dev->tc'.  Callers use the returned
 * pointer as the new state. */
2796 static struct hfsc *
2797 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2799 struct netdev_dev_linux * netdev_dev;
2802 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2803 hfsc = xmalloc(sizeof *hfsc);
2804 tc_init(&hfsc->tc, &tc_ops_hfsc);
2805 hfsc->max_rate = max_rate;
2806 netdev_dev->tc = &hfsc->tc;
/* Records the rates in 'hc' for 'queue_id' in the in-memory HFSC state of
 * 'netdev'.
 *
 * The queue is looked up with tc_find_queue__() under hash_int(queue_id, 0).
 * An entry obtained from that lookup is converted with hfsc_class_cast__();
 * a newly allocated entry gets 'queue_id' and is inserted into
 * 'hfsc->tc.queues' under the same hash.  min_rate and max_rate are then
 * copied from 'hc'. */
2812 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2813 const struct hfsc_class *hc)
2817 struct hfsc_class *hcp;
2818 struct tc_queue *queue;
2820 hfsc = hfsc_get__(netdev);
2821 hash = hash_int(queue_id, 0);
2823 queue = tc_find_queue__(netdev, queue_id, hash);
2825 hcp = hfsc_class_cast__(queue);
2827 hcp = xmalloc(sizeof *hcp);
2828 queue = &hcp->tc_queue;
2829 queue->queue_id = queue_id;
2830 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
2833 hcp->min_rate = hc->min_rate;
2834 hcp->max_rate = hc->max_rate;
/* Validates the nested HFSC class options in 'nl_options' and extracts the
 * rates from them into 'class'.
 *
 * The TCA_HFSC_RSC, TCA_HFSC_FSC and TCA_HFSC_USC attributes are read as
 * struct tc_service_curve; the policy requires at least that many bytes for
 * each entry.  A rate-limited warning is logged when:
 *
 *   - the nested attributes cannot be parsed,
 *   - any curve has a nonzero 'm1' or 'd' (a non-linear curve),
 *   - the RSC and FSC curves have different 'm2' values, or
 *   - the RSC 'm2' exceeds the USC 'm2'.
 *
 * Otherwise 'class->min_rate' is taken from the FSC 'm2' and
 * 'class->max_rate' from the USC 'm2'. */
2838 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2840 const struct tc_service_curve *rsc, *fsc, *usc;
2841 static const struct nl_policy tca_hfsc_policy[] = {
2843 .type = NL_A_UNSPEC,
2845 .min_len = sizeof(struct tc_service_curve),
2848 .type = NL_A_UNSPEC,
2850 .min_len = sizeof(struct tc_service_curve),
2853 .type = NL_A_UNSPEC,
2855 .min_len = sizeof(struct tc_service_curve),
2858 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
2860 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
2861 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
2862 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
2866 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
2867 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
2868 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
2870 if (rsc->m1 != 0 || rsc->d != 0 ||
2871 fsc->m1 != 0 || fsc->d != 0 ||
2872 usc->m1 != 0 || usc->d != 0) {
2873 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2874 "Non-linear service curves are not supported.");
2878 if (rsc->m2 != fsc->m2) {
2879 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2880 "Real-time service curves are not supported ");
2884 if (rsc->m2 > usc->m2) {
2885 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2886 "Min-rate service curve is greater than "
2887 "the max-rate service curve.");
2891 class->min_rate = fsc->m2;
2892 class->max_rate = usc->m2;
/* Parses the class message 'tcmsg' with tc_parse_class(), which extracts the
 * class handle, the nested options and the statistics.  A handle 1:minor
 * with 1 <= minor <= HFSC_N_QUEUES is translated into queue ID minor - 1 for
 * '*queue_id', and the nested options are decoded into 'options' with
 * hfsc_parse_tca_options__(). */
2897 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2898 struct hfsc_class *options,
2899 struct netdev_queue_stats *stats)
2902 unsigned int handle;
2903 struct nlattr *nl_options;
2905 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2911 unsigned int major, minor;
2913 major = tc_get_major(handle);
2914 minor = tc_get_minor(handle);
2915 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
2916 *queue_id = minor - 1;
2923 error = hfsc_parse_tca_options__(nl_options, options);
/* Queries the kernel for class 'handle' (child of 'parent') on 'netdev' with
 * tc_query_class() and parses the reply with hfsc_parse_tcmsg__() into
 * 'options' and 'stats'.  No queue ID is requested (NULL is passed for it).
 * Callers in this file pass NULL for whichever of 'options' and 'stats' they
 * do not need.  The reply buffer is deleted after it has been parsed. */
2930 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
2931 unsigned int parent, struct hfsc_class *options,
2932 struct netdev_queue_stats *stats)
2935 struct ofpbuf *reply;
2937 error = tc_query_class(netdev, handle, parent, &reply);
2942 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
2943 ofpbuf_delete(reply);
/* Fills in 'class' from the qdisc-wide settings in 'details'.
 *
 * "max-rate" is read as a decimal string and divided by 8; a missing key
 * counts as 0.  The rate can also be derived from the device's current
 * features (again divided by 8).  Both 'class->min_rate' and
 * 'class->max_rate' are set to the resulting value. */
2948 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
2949 struct hfsc_class *class)
2952 const char *max_rate_s;
2954 max_rate_s = shash_find_data(details, "max-rate");
2955 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
/* Only the first feature output is wanted; it is read back as 'current' on
 * the next line, so its address must be passed here. */
2960 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2961 max_rate = netdev_features_to_bps(current) / 8;
2964 class->min_rate = max_rate;
2965 class->max_rate = max_rate;
/* Fills in 'class' from the per-queue settings in 'details'.
 *
 * Both rates are given as decimal strings and divided by 8 before use:
 *
 *   - "min-rate" is raised to at least 1500 and then capped at the qdisc's
 *     max_rate.
 *   - "max-rate", when present, is raised to at least the min_rate computed
 *     above and then capped at the qdisc's max_rate.
 *
 * hfsc_class_set() treats the return value as an error code. */
2969 hfsc_parse_class_details__(struct netdev *netdev,
2970 const struct shash *details,
2971 struct hfsc_class * class)
2973 const struct hfsc *hfsc;
2974 uint32_t min_rate, max_rate;
2975 const char *min_rate_s, *max_rate_s;
2977 hfsc = hfsc_get__(netdev);
2978 min_rate_s = shash_find_data(details, "min-rate");
2979 max_rate_s = shash_find_data(details, "max-rate");
2985 min_rate = strtoull(min_rate_s, NULL, 10) / 8;
2986 min_rate = MAX(min_rate, 1500);
2987 min_rate = MIN(min_rate, hfsc->max_rate);
2989 max_rate = (max_rate_s
2990 ? strtoull(max_rate_s, NULL, 10) / 8
2992 max_rate = MAX(max_rate, min_rate);
2993 max_rate = MIN(max_rate, hfsc->max_rate);
2995 class->min_rate = min_rate;
2996 class->max_rate = max_rate;
3001 /* Create an HFSC qdisc.
3003 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
/* Implementation notes: any existing root qdisc is first removed with
 * tc_del_qdisc().  The request is an RTM_NEWQDISC message with
 * NLM_F_EXCL | NLM_F_CREATE for handle 1:0 at TC_H_ROOT, carrying kind "hfsc"
 * and a struct tc_hfsc_qopt (zeroed with memset() first) as TCA_OPTIONS.
 * Returns the result of tc_transact(). */
3005 hfsc_setup_qdisc__(struct netdev * netdev)
3007 struct tcmsg *tcmsg;
3008 struct ofpbuf request;
3009 struct tc_hfsc_qopt opt;
3011 tc_del_qdisc(netdev);
3013 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3014 NLM_F_EXCL | NLM_F_CREATE, &request);
3020 tcmsg->tcm_handle = tc_make_handle(1, 0);
3021 tcmsg->tcm_parent = TC_H_ROOT;
3023 memset(&opt, 0, sizeof opt);
3026 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3027 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3029 return tc_transact(&request, NULL);
3032 /* Create an HFSC class.
3034 * Equivalent to "tc class add dev <dev> parent <parent> classid <handle> hfsc
3035 * sc rate <min_rate> ul rate <max_rate>" */
/* Implementation notes: the request is an RTM_NEWTCLASS message with
 * NLM_F_CREATE for 'handle' under 'parent', carrying kind "hfsc" and a nested
 * TCA_OPTIONS attribute.  The same 'min' curve (m2 = class->min_rate) is sent
 * as both TCA_HFSC_RSC and TCA_HFSC_FSC; the 'max' curve
 * (m2 = class->max_rate) is sent as TCA_HFSC_USC.  A rate-limited warning
 * that names the class, its parent and both rates is logged on failure. */
3037 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3038 unsigned int parent, struct hfsc_class *class)
3042 struct tcmsg *tcmsg;
3043 struct ofpbuf request;
3044 struct tc_service_curve min, max;
3046 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3052 tcmsg->tcm_handle = handle;
3053 tcmsg->tcm_parent = parent;
3057 min.m2 = class->min_rate;
3061 max.m2 = class->max_rate;
3063 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3064 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3065 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3066 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3067 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3068 nl_msg_end_nested(&request, opt_offset);
3070 error = tc_transact(&request, NULL);
3072 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3073 "min-rate %ubps, max-rate %ubps (%s)",
3074 netdev_get_name(netdev),
3075 tc_get_major(handle), tc_get_minor(handle),
3076 tc_get_major(parent), tc_get_minor(parent),
3077 class->min_rate, class->max_rate, strerror(error));
/* Sets up HFSC on 'netdev': creates the qdisc with hfsc_setup_qdisc__(),
 * configures class 1:0xfffe (parent 1:0) from the qdisc-wide settings in
 * 'details', and records the resulting max_rate through hfsc_install__().
 * This is the tc_install hook of tc_ops_hfsc. */
3084 hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3087 struct hfsc_class class;
3089 error = hfsc_setup_qdisc__(netdev);
3095 hfsc_parse_qdisc_details__(netdev, details, &class);
3096 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3097 tc_make_handle(1, 0), &class);
3103 hfsc_install__(netdev, class.max_rate);
/* Rebuilds the in-memory HFSC state of 'netdev' from the kernel.
 *
 * Class 1:0xfffe is queried for the qdisc-wide max_rate, which is passed to
 * hfsc_install__().  The classes are then dumped, and every message that
 * hfsc_parse_tcmsg__() parses successfully (it returns 0) is recorded with
 * hfsc_update_queue__().  'nlmsg' is not used.  This is the tc_load hook of
 * tc_ops_hfsc. */
3108 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3112 struct nl_dump dump;
3113 struct hfsc_class hc;
3116 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3117 hfsc = hfsc_install__(netdev, hc.max_rate);
3119 if (!start_queue_dump(netdev, &dump)) {
3123 while (nl_dump_next(&dump, &msg)) {
3124 unsigned int queue_id;
3126 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3127 hfsc_update_queue__(netdev, queue_id, &hc);
3131 nl_dump_done(&dump);
/* Tears down the HFSC state that embeds 'tc'.  Every queue is removed from
 * 'hfsc->tc.queues'; the _SAFE iteration macro is used because entries are
 * removed while the map is being walked.  This is the tc_destroy hook of
 * tc_ops_hfsc. */
3136 hfsc_tc_destroy(struct tc *tc)
3139 struct hfsc_class *hc, *next;
3141 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3143 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3144 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Adds "max-rate" to 'details' as a newly allocated decimal string holding 8
 * times the max_rate stored for 'netdev''s HFSC qdisc. */
3153 hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3155 const struct hfsc *hfsc;
3156 hfsc = hfsc_get__(netdev);
3157 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
/* Applies new qdisc-wide settings from 'details': they are parsed with
 * hfsc_parse_qdisc_details__(), written to class 1:0xfffe (parent 1:0) with
 * hfsc_setup_class__(), and the new max_rate is stored in the in-memory HFSC
 * state of 'netdev'. */
3162 hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3165 struct hfsc_class class;
3167 hfsc_parse_qdisc_details__(netdev, details, &class);
3168 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3169 tc_make_handle(1, 0), &class);
3172 hfsc_get__(netdev)->max_rate = class.max_rate;
/* Reports the stored rates of 'queue' in 'details' as newly allocated decimal
 * strings: "min-rate" is 8 times the stored min_rate, and "max-rate" (8 times
 * the stored max_rate) is added only when the two rates differ.  'netdev' is
 * not used. */
3179 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3180 const struct tc_queue *queue, struct shash *details)
3182 const struct hfsc_class *hc;
3184 hc = hfsc_class_cast__(queue);
3185 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3186 if (hc->min_rate != hc->max_rate) {
3187 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
/* Configures queue 'queue_id' on 'netdev' from 'details': the settings are
 * parsed with hfsc_parse_class_details__(), installed in the kernel as class
 * 1:(queue_id + 1) under parent 1:0xfffe, and then mirrored into the
 * in-memory queue map with hfsc_update_queue__(). */
3193 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3194 const struct shash *details)
3197 struct hfsc_class class;
3199 error = hfsc_parse_class_details__(netdev, details, &class);
3204 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3205 tc_make_handle(1, 0xfffe), &class);
3210 hfsc_update_queue__(netdev, queue_id, &class);
/* Deletes 'queue' from 'netdev': kernel class 1:(queue_id + 1) is deleted
 * with tc_delete_class(), and the queue is removed from the in-memory
 * 'hfsc->tc.queues' map. */
3215 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3219 struct hfsc_class *hc;
3221 hc = hfsc_class_cast__(queue);
3222 hfsc = hfsc_get__(netdev);
3224 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3226 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches the kernel's statistics for 'queue' into 'stats' by querying class
 * 1:(queue_id + 1) under parent 1:0xfffe.  Class options are not requested.
 * Returns whatever hfsc_query_class__() returns. */
3233 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3234 struct netdev_queue_stats *stats)
3236 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3237 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one class message from a statistics dump.  The handle and the
 * statistics are extracted with tc_parse_class(); when the handle is 1:minor
 * with 1 <= minor <= HFSC_N_QUEUES, 'cb' is called with queue ID minor - 1,
 * the statistics and 'aux'.  'netdev' is not used. */
3241 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3242 const struct ofpbuf *nlmsg,
3243 netdev_dump_queue_stats_cb *cb, void *aux)
3245 struct netdev_queue_stats stats;
3246 unsigned int handle, major, minor;
3249 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3254 major = tc_get_major(handle);
3255 minor = tc_get_minor(handle);
3256 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3257 (*cb)(minor - 1, &stats, aux);
/* Operations table for the HFSC qdisc: kernel qdisc name "hfsc", OVS QoS type
 * "linux-hfsc", with room for HFSC_N_QUEUES queues.  Every hook is
 * implemented by the matching hfsc_*() function. */
3262 static const struct tc_ops tc_ops_hfsc = {
3263 "hfsc", /* linux_name */
3264 "linux-hfsc", /* ovs_name */
3265 HFSC_N_QUEUES, /* n_queues */
3266 hfsc_tc_install, /* tc_install */
3267 hfsc_tc_load, /* tc_load */
3268 hfsc_tc_destroy, /* tc_destroy */
3269 hfsc_qdisc_get, /* qdisc_get */
3270 hfsc_qdisc_set, /* qdisc_set */
3271 hfsc_class_get, /* class_get */
3272 hfsc_class_set, /* class_set */
3273 hfsc_class_delete, /* class_delete */
3274 hfsc_class_get_stats, /* class_get_stats */
3275 hfsc_class_dump_stats /* class_dump_stats */
3278 /* "linux-default" traffic control class.
3280 * This class represents the default, unnamed Linux qdisc. It corresponds to
3281 * the "" (empty string) QoS type in the OVS database. */
/* Attaches a 'tc' initialized with tc_ops_default to 'netdev''s device.  The
 * pointer is kept in a function-level static, so its value persists between
 * calls. */
3284 default_install__(struct netdev *netdev)
3286 struct netdev_dev_linux *netdev_dev =
3287 netdev_dev_linux_cast(netdev_get_dev(netdev));
3288 static struct tc *tc;
3291 tc = xmalloc(sizeof *tc);
3292 tc_init(tc, &tc_ops_default);
3294 netdev_dev->tc = tc;
/* Installs the default tc state on 'netdev' through default_install__().
 * 'details' is not used. */
3298 default_tc_install(struct netdev *netdev,
3299 const struct shash *details OVS_UNUSED)
3301 default_install__(netdev);
/* Installs the default tc state on 'netdev' through default_install__().
 * 'nlmsg' is not used. */
3306 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3308 default_install__(netdev);
/* Operations table for the default qdisc.  It has no kernel qdisc name
 * (linux_name is NULL), and every hook from tc_destroy through
 * class_dump_stats is NULL. */
3312 static const struct tc_ops tc_ops_default = {
3313 NULL, /* linux_name */
3318 NULL, /* tc_destroy */
3319 NULL, /* qdisc_get */
3320 NULL, /* qdisc_set */
3321 NULL, /* class_get */
3322 NULL, /* class_set */
3323 NULL, /* class_delete */
3324 NULL, /* class_get_stats */
3325 NULL /* class_dump_stats */
3328 /* "linux-other" traffic control class.
/* Attaches a 'tc' initialized with tc_ops_other to 'netdev''s device.  The
 * pointer is kept in a function-level static, so its value persists between
 * calls.  'nlmsg' is not used. */
3333 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3335 struct netdev_dev_linux *netdev_dev =
3336 netdev_dev_linux_cast(netdev_get_dev(netdev));
3337 static struct tc *tc;
3340 tc = xmalloc(sizeof *tc);
3341 tc_init(tc, &tc_ops_other);
3343 netdev_dev->tc = tc;
/* Operations table for qdiscs this file does not manage, OVS QoS type
 * "linux-other".  It has no kernel qdisc name and no tc_install hook, and
 * every hook from tc_destroy through class_dump_stats is NULL. */
3347 static const struct tc_ops tc_ops_other = {
3348 NULL, /* linux_name */
3349 "linux-other", /* ovs_name */
3351 NULL, /* tc_install */
3353 NULL, /* tc_destroy */
3354 NULL, /* qdisc_get */
3355 NULL, /* qdisc_set */
3356 NULL, /* class_get */
3357 NULL, /* class_set */
3358 NULL, /* class_delete */
3359 NULL, /* class_get_stats */
3360 NULL /* class_dump_stats */
3363 /* Traffic control. */
3365 /* Number of kernel "tc" ticks per second. */
/* Computed from the /proc/net/psched fields as a * c / b, and used by
 * tc_ticks_to_bytes() and tc_bytes_to_ticks() to convert between bytes and
 * ticks. */
3366 static double ticks_per_s;
3368 /* Number of kernel "jiffies" per second. This is used for the purpose of
3369 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3370 * one jiffy's worth of data.
3372 * There are two possibilities here:
3374 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3375 * approximate range of 100 to 1024. That means that we really need to
3376 * make sure that the qdisc can buffer that much data.
3378 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3379 * has finely granular timers and there's no need to fudge additional room
3380 * for buffers. (There's no extra effort needed to implement that: the
3381 * large 'buffer_hz' is used as a divisor, so practically any number will
3382 * come out as 0 in the division. Small integer results in the case of
3383 * really high dividends won't have any real effect anyhow.)
/* Used as the divisor in tc_buffer_per_jiffy(). */
3385 static unsigned int buffer_hz;
3387 /* Returns tc handle 'major':'minor'. */
/* The handle is built with TC_H_MAKE from 'major' shifted left by 16 bits
 * and 'minor'. */
3389 tc_make_handle(unsigned int major, unsigned int minor)
3391 return TC_H_MAKE(major << 16, minor);
3394 /* Returns the major number from 'handle'. */
/* The major part is isolated with TC_H_MAJ and shifted right by 16 bits, the
 * inverse of the shift applied in tc_make_handle(). */
3396 tc_get_major(unsigned int handle)
3398 return TC_H_MAJ(handle) >> 16;
3401 /* Returns the minor number from 'handle'. */
/* The minor part is isolated with TC_H_MIN; no shift is applied. */
3403 tc_get_minor(unsigned int handle)
3405 return TC_H_MIN(handle);
/* Starts a traffic-control Netlink request for 'netdev' in 'request'.
 *
 * The device's ifindex is looked up with get_ifindex().  'request' is
 * initialized with 512 bytes of initial space and filled with a Netlink
 * header of the given 'type' and flags NLM_F_REQUEST | 'flags', followed by a
 * zeroed struct tcmsg whose family is AF_UNSPEC and whose ifindex is the
 * device's.  Callers set tcm_handle and tcm_parent through the returned
 * struct tcmsg pointer. */
3408 static struct tcmsg *
3409 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3410 struct ofpbuf *request)
3412 struct tcmsg *tcmsg;
3416 error = get_ifindex(netdev, &ifindex);
3421 ofpbuf_init(request, 512);
3422 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3423 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3424 tcmsg->tcm_family = AF_UNSPEC;
3425 tcmsg->tcm_ifindex = ifindex;
3426 /* Caller should fill in tcmsg->tcm_handle. */
3427 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on 'rtnl_sock' with nl_sock_transact(), passing 'replyp'
 * through for the reply, and then uninitializes 'request' regardless of the
 * outcome.  Callers in this file pass NULL for 'replyp' when they do not
 * need the reply. */
3433 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3435 int error = nl_sock_transact(rtnl_sock, request, replyp);
3436 ofpbuf_uninit(request);
3443 /* The values in psched are not individually very meaningful, but they are
3444 * important. The tables below show some values seen in the wild.
3448 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3449 * (Before that, there are hints that it was 1000000000.)
3451 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3455 * -----------------------------------
3456 * [1] 000c8000 000f4240 000f4240 00000064
3457 * [2] 000003e8 00000400 000f4240 3b9aca00
3458 * [3] 000003e8 00000400 000f4240 3b9aca00
3459 * [4] 000003e8 00000400 000f4240 00000064
3460 * [5] 000003e8 00000040 000f4240 3b9aca00
3461 * [6] 000003e8 00000040 000f4240 000000f9
3463 * a b c d ticks_per_s buffer_hz
3464 * ------- --------- ---------- ------------- ----------- -------------
3465 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3466 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3467 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3468 * [4] 1,000 1,024 1,000,000 100 976,562 100
3469 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3470 * [6] 1,000 64 1,000,000 249 15,625,000 249
3472 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3473 * [2] 2.6.26-1-686-bigmem from Debian lenny
3474 * [3] 2.6.26-2-sparc64 from Debian lenny
3475 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3476 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3477 * [6] 2.6.34 from kernel.org on KVM
/* Reads the four hexadecimal fields 'a', 'b', 'c' and 'd' from
 * /proc/net/psched.  A failure to open or to parse the file, and parameter
 * values rejected by the checks here, are logged with VLOG_WARN.
 * 'ticks_per_s' is computed as a * c / b in double precision. */
3479 static const char fn[] = "/proc/net/psched";
3480 unsigned int a, b, c, d;
3486 stream = fopen(fn, "r");
3488 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3492 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3493 VLOG_WARN("%s: read failed", fn);
3497 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3501 VLOG_WARN("%s: invalid scheduler parameters", fn);
3505 ticks_per_s = (double) a * c / b;
3509 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3512 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3515 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3516 * rate of 'rate' bytes per second. */
/* NOTE(review): 'rate * ticks' is evaluated in unsigned int before the
 * division by the double 'ticks_per_s', so a large product wraps around
 * before it is divided -- confirm that callers stay within range. */
3518 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3523 return (rate * ticks) / ticks_per_s;
3526 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3527 * rate of 'rate' bytes per second. */
/* A 'rate' of 0 yields 0 rather than dividing by zero.  'ticks_per_s' is
 * truncated to unsigned long long int before it is multiplied by 'size'. */
3529 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3534 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3537 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3538 * a transmission rate of 'rate' bytes per second. */
/* The result is the integer quotient 'rate' / 'buffer_hz', so a 'buffer_hz'
 * larger than 'rate' yields 0. */
3540 tc_buffer_per_jiffy(unsigned int rate)
3545 return rate / buffer_hz;
3548 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3549 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3550 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3551 * stores NULL into it if it is absent.
3553 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
 * 'msg'.
 *
3556 * Returns 0 if successful, otherwise a positive errno value. */
/* Implementation notes: attributes are parsed starting just past the Netlink
 * header and the struct tcmsg.  The policy requires TCA_KIND (a string) and
 * accepts an optional nested TCA_OPTIONS.  A rate-limited warning is logged
 * when the message does not match the policy. */
3558 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3559 struct nlattr **options)
3561 static const struct nl_policy tca_policy[] = {
3562 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3563 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3565 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3567 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3568 tca_policy, ta, ARRAY_SIZE(ta))) {
3569 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3574 *kind = nl_attr_get_string(ta[TCA_KIND]);
3578 *options = ta[TCA_OPTIONS];
3593 /* Given Netlink 'msg' that describes a class, extracts its class handle
3594 * (whose minor number identifies the queue) into '*handlep', its TCA_OPTIONS
3595 * attribute into '*options', and its queue statistics into '*stats'. Any of
3596 * the output arguments may be null.
3598 * Returns 0 if successful, otherwise a positive errno value. */
/* Implementation notes: attributes are parsed starting just past the Netlink
 * header and the struct tcmsg, and the policy requires both TCA_OPTIONS and
 * TCA_STATS2 as nested attributes.  The handle is read from the tcm_handle
 * field of the struct tcmsg.
 *
 * Statistics come from two required attributes nested in TCA_STATS2:
 * TCA_STATS_BASIC supplies tx_bytes and tx_packets, and the 'drops' field of
 * TCA_STATS_QUEUE is reported as tx_errors. */
3600 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3601 struct nlattr **options, struct netdev_queue_stats *stats)
3603 static const struct nl_policy tca_policy[] = {
3604 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3605 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3607 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3609 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3610 tca_policy, ta, ARRAY_SIZE(ta))) {
3611 VLOG_WARN_RL(&rl, "failed to parse class message");
3616 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3617 *handlep = tc->tcm_handle;
3621 *options = ta[TCA_OPTIONS];
3625 const struct gnet_stats_queue *gsq;
3626 struct gnet_stats_basic gsb;
3628 static const struct nl_policy stats_policy[] = {
3629 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3630 .min_len = sizeof gsb },
3631 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3632 .min_len = sizeof *gsq },
3634 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3636 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3637 sa, ARRAY_SIZE(sa))) {
3638 VLOG_WARN_RL(&rl, "failed to parse class stats");
3642 /* Alignment issues screw up the length of struct gnet_stats_basic on
3643 * some arch/bitsize combinations. Newer versions of Linux have a
3644 * struct gnet_stats_basic_packed, but we can't depend on that. The
3645 * easiest thing to do is just to make a copy. */
3646 memset(&gsb, 0, sizeof gsb);
3647 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3648 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3649 stats->tx_bytes = gsb.bytes;
3650 stats->tx_packets = gsb.packets;
3652 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3653 stats->tx_errors = gsq->drops;
3663 memset(stats, 0, sizeof *stats);
3668 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
/* Implementation notes: the request is an RTM_GETTCLASS message with
 * NLM_F_ECHO for 'handle' and 'parent', and the reply is passed back through
 * 'replyp' by tc_transact().  A rate-limited warning that names the device,
 * the class and its parent is logged on failure. */
3671 tc_query_class(const struct netdev *netdev,
3672 unsigned int handle, unsigned int parent,
3673 struct ofpbuf **replyp)
3675 struct ofpbuf request;
3676 struct tcmsg *tcmsg;
3679 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3683 tcmsg->tcm_handle = handle;
3684 tcmsg->tcm_parent = parent;
3686 error = tc_transact(&request, replyp);
3688 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3689 netdev_get_name(netdev),
3690 tc_get_major(handle), tc_get_minor(handle),
3691 tc_get_major(parent), tc_get_minor(parent),
3697 /* Equivalent to "tc class del dev <name> handle <handle>". */
/* Implementation notes: the request is an RTM_DELTCLASS message with no extra
 * flags for 'handle', with tcm_parent set to 0.  No reply is requested.  A
 * rate-limited warning that names the device and the class is logged on
 * failure. */
3699 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3701 struct ofpbuf request;
3702 struct tcmsg *tcmsg;
3705 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3709 tcmsg->tcm_handle = handle;
3710 tcmsg->tcm_parent = 0;
3712 error = tc_transact(&request, NULL);
3714 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3715 netdev_get_name(netdev),
3716 tc_get_major(handle), tc_get_minor(handle),
3722 /* Equivalent to "tc qdisc del dev <name> root". */
/* Implementation notes: the request is an RTM_DELQDISC message for handle 1:0
 * at TC_H_ROOT.  An EINVAL result gets special treatment (see the comment
 * below).  When the deletion succeeds and the device has in-memory tc state,
 * that state's tc_destroy hook is called if it has one, and
 * 'netdev_dev->tc' is reset to NULL. */
3724 tc_del_qdisc(struct netdev *netdev)
3726 struct netdev_dev_linux *netdev_dev =
3727 netdev_dev_linux_cast(netdev_get_dev(netdev));
3728 struct ofpbuf request;
3729 struct tcmsg *tcmsg;
3732 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
3736 tcmsg->tcm_handle = tc_make_handle(1, 0);
3737 tcmsg->tcm_parent = TC_H_ROOT;
3739 error = tc_transact(&request, NULL);
3740 if (error == EINVAL) {
3741 /* EINVAL probably means that the default qdisc was in use, in which
3742 * case we've accomplished our purpose. */
3745 if (!error && netdev_dev->tc) {
3746 if (netdev_dev->tc->ops->tc_destroy) {
3747 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3749 netdev_dev->tc = NULL;
3754 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3755 * kernel to determine what they are. Returns 0 if successful, otherwise a
3756 * positive errno value. */
/* Implementation notes: 'netdev_dev->tc' is checked first, as it holds any
 * state that is already known.  Otherwise an RTM_GETQDISC request with
 * NLM_F_ECHO is sent for handle 1:0, and the tc_ops to instantiate are chosen
 * from the outcome:
 *
 *   - The kind reported in the reply is looked up with
 *     tc_lookup_linux_name().  tc_ops_other is the fallback, and an unknown
 *     kind is logged.
 *   - ENOENT selects tc_ops_default.
 *   - Any other failure is logged and selects tc_ops_other.
 *
 * The chosen ops' tc_load hook is then called.  The assertion requires that
 * 'netdev_dev->tc' is set exactly when tc_load reports success.  The reply
 * buffer is deleted afterward.  The query error takes precedence over the
 * load error in the return value. */
3758 tc_query_qdisc(const struct netdev *netdev)
3760 struct netdev_dev_linux *netdev_dev =
3761 netdev_dev_linux_cast(netdev_get_dev(netdev));
3762 struct ofpbuf request, *qdisc;
3763 const struct tc_ops *ops;
3764 struct tcmsg *tcmsg;
3768 if (netdev_dev->tc) {
3772 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3773 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3774 * 2.6.35 without that fix backported to it.
3776 * To avoid the OOPS, we must not make a request that would attempt to dump
3777 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3778 * few others. There are a few ways that I can see to do this, but most of
3779 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3780 * technique chosen here is to assume that any non-default qdisc that we
3781 * create will have a class with handle 1:0. The built-in qdiscs only have
3782 * a class with handle 0:0.
3784 * We could check for Linux 2.6.35+ and use a more straightforward method
3786 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
3790 tcmsg->tcm_handle = tc_make_handle(1, 0);
3791 tcmsg->tcm_parent = 0;
3793 /* Figure out what tc class to instantiate. */
3794 error = tc_transact(&request, &qdisc);
3798 error = tc_parse_qdisc(qdisc, &kind, NULL);
3800 ops = &tc_ops_other;
3802 ops = tc_lookup_linux_name(kind);
3804 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3805 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3807 ops = &tc_ops_other;
3810 } else if (error == ENOENT) {
3811 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3812 * other entity that doesn't have a handle 1:0. We will assume
3813 * that it's the system default qdisc. */
3814 ops = &tc_ops_default;
3817 /* Who knows? Maybe the device got deleted. */
3818 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3819 netdev_get_name(netdev), strerror(error));
3820 ops = &tc_ops_other;
3823 /* Instantiate it. */
3824 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
3825 assert((load_error == 0) == (netdev_dev->tc != NULL));
3826 ofpbuf_delete(qdisc);
3828 return error ? error : load_error;
3831 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3832 approximate the time to transmit packets of various lengths. For an MTU of
3833 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3834 represents two possible packet lengths; for an MTU of 513 through 1024, four
3835 possible lengths; and so on.
3837 Returns, for the specified 'mtu', the number of bits that packet lengths
3838 need to be shifted right to fit within such a 256-entry table. */
/* Implementation notes: the Ethernet and VLAN header lengths are added to
 * 'mtu' before counting.  'cell_log' starts at 0 and is incremented once per
 * loop iteration for as long as the adjusted value is at least 256. */
3840 tc_calc_cell_log(unsigned int mtu)
3845 mtu = ETH_PAYLOAD_MAX;
3847 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
3849 for (cell_log = 0; mtu >= 256; cell_log++) {
3856 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
/* Implementation notes: 'rate' is zeroed first, so fields that are not
 * assigned here stay 0.  'cell_log' comes from tc_calc_cell_log(mtu) and
 * 'mpu' is set to ETH_TOTAL_MIN.  The 'overhead' and 'cell_align'
 * assignments are deliberately left commented out because some distribution
 * headers lack those fields. */
3859 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
3861 memset(rate, 0, sizeof *rate);
3862 rate->cell_log = tc_calc_cell_log(mtu);
3863 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3864 /* rate->cell_align = 0; */ /* distro headers. */
3865 rate->mpu = ETH_TOTAL_MIN;
3869 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
3870 * attribute of the specified "type".
3872 * See tc_calc_cell_log() above for a description of "rtab"s. */
/* Implementation notes: TC_RTAB_SIZE bytes are reserved in 'msg' as an
 * attribute of the given 'type' and filled in place.  Entry 'i' holds the
 * number of ticks needed to send a packet of (i + 1) << rate->cell_log bytes
 * at 'rate->rate'.  Packet sizes below 'rate->mpu' are rounded up to
 * 'rate->mpu' first. */
3874 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
3879 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
3880 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
3881 unsigned packet_size = (i + 1) << rate->cell_log;
3882 if (packet_size < rate->mpu) {
3883 packet_size = rate->mpu;
3885 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
3889 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
3890 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
3891 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
/* Implementation notes: the minimum burst is one jiffy's worth of data at
 * 'Bps' (from tc_buffer_per_jiffy()) plus 'mtu'.  The larger of that minimum
 * and 'burst_bytes' is converted to ticks at 'Bps' and returned. */
3894 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
3896 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
3897 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
3901 /* Utility functions. */
/* get_stats_via_netlink(): asks the kernel about the link whose index is
 * 'ifindex' by sending an RTM_GETLINK request on 'rtnl_sock', then copies the
 * counters found in the reply's IFLA_STATS attribute (read as a struct
 * rtnl_link_stats) into '*stats', one field at a time.
 *
 * The reply buffer is released with ofpbuf_delete() on every path visible
 * here: after a failed policy parse, when IFLA_STATS is absent, and after the
 * counters have been copied.
 *
 * NOTE(review): the check of 'error' after nl_sock_transact() and all return
 * statements fall on lines that are not visible in this listing; confirm the
 * return values against the full source. */
3904 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
3906 /* Policy for RTNLGRP_LINK messages.
3908 * There are *many* more fields in these messages, but currently we only
3909 * care about these fields. */
3910 static const struct nl_policy rtnlgrp_link_policy[] = {
3911 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
3912 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
3913 .min_len = sizeof(struct rtnl_link_stats) },
3916 struct ofpbuf request;
3917 struct ofpbuf *reply;
3918 struct ifinfomsg *ifi;
3919 const struct rtnl_link_stats *rtnl_stats;
3920 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
/* Build the request: a Netlink header followed by a zero-filled ifinfomsg
 * whose family is PF_UNSPEC and whose index selects the link. */
3923 ofpbuf_init(&request, 0);
3924 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
3925 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
3926 ifi->ifi_family = PF_UNSPEC;
3927 ifi->ifi_index = ifindex;
/* Perform the transaction.  The request buffer is released right away,
 * before 'error' is examined. */
3928 error = nl_sock_transact(rtnl_sock, &request, &reply);
3929 ofpbuf_uninit(&request);
/* Attribute parsing starts past the Netlink header and the ifinfomsg. */
3934 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
3935 rtnlgrp_link_policy,
3936 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
3937 ofpbuf_delete(reply);
/* The policy marks IFLA_STATS as optional, so a successful parse does not
 * guarantee that it is present. */
3941 if (!attrs[IFLA_STATS]) {
3942 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
3943 ofpbuf_delete(reply);
/* Copy the counters one-to-one; source and destination field names match. */
3947 rtnl_stats = nl_attr_get(attrs[IFLA_STATS]);
3948 stats->rx_packets = rtnl_stats->rx_packets;
3949 stats->tx_packets = rtnl_stats->tx_packets;
3950 stats->rx_bytes = rtnl_stats->rx_bytes;
3951 stats->tx_bytes = rtnl_stats->tx_bytes;
3952 stats->rx_errors = rtnl_stats->rx_errors;
3953 stats->tx_errors = rtnl_stats->tx_errors;
3954 stats->rx_dropped = rtnl_stats->rx_dropped;
3955 stats->tx_dropped = rtnl_stats->tx_dropped;
3956 stats->multicast = rtnl_stats->multicast;
3957 stats->collisions = rtnl_stats->collisions;
3958 stats->rx_length_errors = rtnl_stats->rx_length_errors;
3959 stats->rx_over_errors = rtnl_stats->rx_over_errors;
3960 stats->rx_crc_errors = rtnl_stats->rx_crc_errors;
3961 stats->rx_frame_errors = rtnl_stats->rx_frame_errors;
3962 stats->rx_fifo_errors = rtnl_stats->rx_fifo_errors;
3963 stats->rx_missed_errors = rtnl_stats->rx_missed_errors;
3964 stats->tx_aborted_errors = rtnl_stats->tx_aborted_errors;
3965 stats->tx_carrier_errors = rtnl_stats->tx_carrier_errors;
3966 stats->tx_fifo_errors = rtnl_stats->tx_fifo_errors;
3967 stats->tx_heartbeat_errors = rtnl_stats->tx_heartbeat_errors;
3968 stats->tx_window_errors = rtnl_stats->tx_window_errors;
/* 'rtnl_stats' points into 'reply', so the reply is freed only after the
 * copy is complete. */
3970 ofpbuf_delete(reply);
/* get_stats_via_proc(): reads /proc/net/dev line by line with fgets() and
 * parses each line with a format that expects 15 successful conversions.
 * Lines that do not parse are reported with a rate-limited warning.  When the
 * parsed device name equals 'netdev_name', several counters in '*stats' are
 * set to UINT64_MAX.  A rate-limited "no stats" warning exists for the case
 * where 'netdev_name' is not found.
 *
 * NOTE(review): most of the parsing call's arguments, the local declarations,
 * the closing of 'stream', and all return statements are on lines that are
 * not visible in this listing; confirm them against the full source. */
3976 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
3978 static const char fn[] = "/proc/net/dev";
3983 stream = fopen(fn, "r");
3985 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
3990 while (fgets(line, sizeof line, stream)) {
/* X64 is the scanf conversion for one uint64_t value.  In the format below,
 * "%*u" parses an unsigned field without storing it, so each of the two rows
 * stores seven values and skips an eighth. */
3993 #define X64 "%"SCNu64
3996 X64 X64 X64 X64 X64 X64 X64 "%*u"
3997 X64 X64 X64 X64 X64 X64 X64 "%*u",
4003 &stats->rx_fifo_errors,
4004 &stats->rx_frame_errors,
4010 &stats->tx_fifo_errors,
4012 &stats->tx_carrier_errors) != 15) {
4013 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4014 } else if (!strcmp(devname, netdev_name)) {
/* These counters are not among the parse targets visible above; they are set
 * to UINT64_MAX (presumably meaning "not available" -- confirm). */
4015 stats->rx_length_errors = UINT64_MAX;
4016 stats->rx_over_errors = UINT64_MAX;
4017 stats->rx_crc_errors = UINT64_MAX;
4018 stats->rx_missed_errors = UINT64_MAX;
4019 stats->tx_aborted_errors = UINT64_MAX;
4020 stats->tx_heartbeat_errors = UINT64_MAX;
4021 stats->tx_window_errors = UINT64_MAX;
4027 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* get_flags(): issues an SIOCGIFFLAGS request for the device named by
 * netdev_get_name('netdev') through netdev_linux_do_ioctl() and stores
 * 'ifr.ifr_flags' in '*flags'.
 *
 * NOTE(review): the tail of the ioctl call and the return statement are not
 * visible in this listing.  No error check is visible between the ioctl and
 * the store to '*flags' either; confirm whether '*flags' can receive an
 * unset value when the ioctl fails. */
4033 get_flags(const struct netdev *netdev, int *flags)
4038 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
4040 *flags = ifr.ifr_flags;
/* set_flags(): places 'flags' in 'ifr.ifr_flags' and issues an SIOCSIFFLAGS
 * request for the device named by netdev_get_name('netdev'), returning
 * whatever netdev_linux_do_ioctl() returns.
 *
 * NOTE(review): the declaration of 'ifr' and the tail of the ioctl call are
 * not visible in this listing. */
4045 set_flags(struct netdev *netdev, int flags)
4049 ifr.ifr_flags = flags;
4050 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
/* do_get_ifindex(): looks up the interface index of 'netdev_name' with an
 * SIOCGIFINDEX ioctl on 'af_inet_sock'.  On success the visible code returns
 * 'ifr.ifr_ifindex'; on failure it logs a rate-limited warning that includes
 * strerror(errno).
 *
 * NOTE(review): the value returned on failure is on a line that is not
 * visible in this listing. */
4055 do_get_ifindex(const char *netdev_name)
/* NOTE(review): strncpy() leaves the destination without a terminating NUL
 * when the source is at least as long as the buffer. */
4059 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4060 COVERAGE_INC(netdev_get_ifindex);
4061 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4062 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4063 netdev_name, strerror(errno));
4066 return ifr.ifr_ifindex;
/* get_ifindex(): stores the interface index of 'netdev_' in '*ifindexp',
 * using a value cached in the underlying netdev_dev_linux.
 *
 * When the VALID_IFINDEX bit is clear in 'cache_valid', the index is fetched
 * with do_get_ifindex(), saved in 'netdev_dev->ifindex', and the bit is set;
 * otherwise the saved value is used directly.
 *
 * NOTE(review): the handling of a failed do_get_ifindex() call and the return
 * statements are on lines that are not visible in this listing. */
4070 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4072 struct netdev_dev_linux *netdev_dev =
4073 netdev_dev_linux_cast(netdev_get_dev(netdev_));
/* Query the kernel only when no cached index is marked valid. */
4075 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4076 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4080 netdev_dev->cache_valid |= VALID_IFINDEX;
4081 netdev_dev->ifindex = ifindex;
4083 *ifindexp = netdev_dev->ifindex;
/* get_etheraddr(): reads the hardware address of 'netdev_name' with an
 * SIOCGIFHWADDR ioctl on 'af_inet_sock' and copies ETH_ADDR_LEN bytes of it
 * into 'ea'.
 *
 * A failed ioctl is logged with VLOG_ERR.  An address family other than
 * AF_UNSPEC or ARPHRD_ETHER is logged with VLOG_WARN; no early exit for that
 * case is visible before the copy into 'ea'.
 *
 * NOTE(review): the local declarations and all return statements are on lines
 * that are not visible in this listing. */
4088 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4093 memset(&ifr, 0, sizeof ifr);
/* NOTE(review): strncpy() leaves the destination without a terminating NUL
 * when the source is at least as long as the buffer. */
4094 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4095 COVERAGE_INC(netdev_get_hwaddr);
4096 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4097 VLOG_ERR("ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4098 netdev_name, strerror(errno));
4101 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4102 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4103 VLOG_WARN("%s device has unknown hardware address family %d",
4104 netdev_name, hwaddr_family);
4106 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* set_etheraddr(): sets the hardware address of 'netdev_name' with an
 * SIOCSIFHWADDR ioctl on 'af_inet_sock'.  The request carries 'hwaddr_family'
 * as the address family and the ETH_ADDR_LEN bytes of 'mac' as the address.
 * A failed ioctl is logged with VLOG_ERR, including strerror(errno).
 *
 * NOTE(review): the declaration of 'ifr' and all return statements are on
 * lines that are not visible in this listing. */
4111 set_etheraddr(const char *netdev_name, int hwaddr_family,
4112 const uint8_t mac[ETH_ADDR_LEN])
4116 memset(&ifr, 0, sizeof ifr);
/* NOTE(review): strncpy() leaves the destination without a terminating NUL
 * when the source is at least as long as the buffer. */
4117 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4118 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4119 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4120 COVERAGE_INC(netdev_set_hwaddr);
4121 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4122 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4123 netdev_name, strerror(errno));
/* netdev_linux_do_ethtool(): runs an SIOCETHTOOL ioctl on 'af_inet_sock' for
 * the device 'name', with 'ecmd' attached to the request as 'ifr.ifr_data'.
 *
 * When the ioctl fails with an errno other than EOPNOTSUPP, a rate-limited
 * warning naming 'cmd_name' and the device is logged.  EOPNOTSUPP is not
 * logged, for the reason given in the comment at that branch.
 *
 * NOTE(review): the statement that uses 'cmd' (presumably storing it in
 * 'ecmd') and all return statements are on lines that are not visible in
 * this listing. */
4130 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4131 int cmd, const char *cmd_name)
4135 memset(&ifr, 0, sizeof ifr);
/* NOTE(review): strncpy() leaves the destination without a terminating NUL
 * when the source is at least as long as the buffer. */
4136 strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4137 ifr.ifr_data = (caddr_t) ecmd;
4140 COVERAGE_INC(netdev_ethtool);
4141 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4144 if (errno != EOPNOTSUPP) {
4145 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4146 "failed: %s", cmd_name, name, strerror(errno));
4148 /* The device doesn't support this operation. That's pretty
4149 * common, so there's no point in logging anything. */
/* netdev_linux_do_ioctl(): copies 'name' into 'ifr->ifr_name' and issues
 * ioctl 'cmd' on 'af_inet_sock' with 'ifr' as its argument.  When ioctl()
 * returns -1, a rate-limited debug message is logged that includes 'name' and
 * 'cmd_name', which is used only as the human-readable name of 'cmd'.
 *
 * Only 'ifr_name' is written here, so callers must fill in any other request
 * fields beforehand.
 *
 * NOTE(review): the end of the log call and all return statements are on
 * lines that are not visible in this listing. */
4156 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4157 const char *cmd_name)
/* NOTE(review): strncpy() leaves the destination without a terminating NUL
 * when the source is at least as long as the buffer. */
4159 strncpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4160 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4161 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
/* netdev_linux_get_ipv4(): issues the address-query ioctl 'cmd' (named
 * 'cmd_name' for logging) for the device behind 'netdev' through
 * netdev_linux_do_ioctl(), with the request's address family preset to
 * AF_INET.  The returned 'ifr.ifr_addr' is read as a struct sockaddr_in and
 * its 'sin_addr' is stored in '*ip'.
 *
 * NOTE(review): the declarations, the condition guarding the store to '*ip'
 * (presumably "no error"), and the return statement are on lines that are
 * not visible in this listing. */
4169 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4170 int cmd, const char *cmd_name)
4175 ifr.ifr_addr.sa_family = AF_INET;
4176 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4178 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4179 *ip = sin->sin_addr;