2 * Copyright (c) 2009, 2010, 2011 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
21 #include <arpa/inet.h>
23 #include <linux/gen_stats.h>
24 #include <linux/if_tun.h>
26 #include <linux/types.h>
27 #include <linux/ethtool.h>
28 #include <linux/mii.h>
29 #include <linux/pkt_sched.h>
30 #include <linux/rtnetlink.h>
31 #include <linux/sockios.h>
32 #include <linux/version.h>
33 #include <sys/types.h>
34 #include <sys/ioctl.h>
35 #include <sys/socket.h>
36 #include <netpacket/packet.h>
37 #include <net/ethernet.h>
39 #include <linux/if_tunnel.h>
40 #include <net/if_arp.h>
41 #include <net/if_packet.h>
42 #include <net/route.h>
43 #include <netinet/in.h>
50 #include "dynamic-string.h"
51 #include "fatal-signal.h"
54 #include "netdev-provider.h"
55 #include "netdev-vport.h"
57 #include "netlink-socket.h"
59 #include "openflow/openflow.h"
61 #include "poll-loop.h"
62 #include "rtnetlink.h"
63 #include "rtnetlink-link.h"
64 #include "socket-util.h"
69 VLOG_DEFINE_THIS_MODULE(netdev_linux);
71 COVERAGE_DEFINE(netdev_get_vlan_vid);
72 COVERAGE_DEFINE(netdev_set_policing);
73 COVERAGE_DEFINE(netdev_arp_lookup);
74 COVERAGE_DEFINE(netdev_get_ifindex);
75 COVERAGE_DEFINE(netdev_get_hwaddr);
76 COVERAGE_DEFINE(netdev_set_hwaddr);
77 COVERAGE_DEFINE(netdev_ethtool);
79 /* These were introduced in Linux 2.6.14, so they might be missing if we have
81 #ifndef ADVERTISED_Pause
82 #define ADVERTISED_Pause (1 << 13)
84 #ifndef ADVERTISED_Asym_Pause
85 #define ADVERTISED_Asym_Pause (1 << 14)
88 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
91 #define TC_RTAB_SIZE 1024
94 static struct rtnetlink_notifier netdev_linux_cache_notifier;
95 static int cache_notifier_refcount;
98 VALID_IFINDEX = 1 << 0,
99 VALID_ETHERADDR = 1 << 1,
103 VALID_CARRIER = 1 << 5,
104 VALID_IS_PSEUDO = 1 << 6, /* Represents is_internal and is_tap. */
105 VALID_POLICING = 1 << 7,
106 VALID_HAVE_VPORT_STATS = 1 << 8
114 /* Traffic control. */
116 /* An instance of a traffic control class. Always associated with a particular
119 * Each TC implementation subclasses this with whatever additional data it
122 const struct tc_ops *ops;
123 struct hmap queues; /* Contains "struct tc_queue"s.
124 * Read by generic TC layer.
125 * Written only by TC implementation. */
128 /* One traffic control queue.
130 * Each TC implementation subclasses this with whatever additional data it
133 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
134 unsigned int queue_id; /* OpenFlow queue ID. */
137 /* A particular kind of traffic control. Each implementation generally maps to
138 * one particular Linux qdisc class.
140 * The functions below return 0 if successful or a positive errno value on
141 * failure, except where otherwise noted. All of them must be provided, except
142 * where otherwise noted. */
144 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
145 * This is null for tc_ops_default and tc_ops_other, for which there are no
146 * appropriate values. */
147 const char *linux_name;
149 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
150 const char *ovs_name;
152 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
153 * queues. The queues are numbered 0 through n_queues - 1. */
154 unsigned int n_queues;
156 /* Called to install this TC class on 'netdev'. The implementation should
157 * make the Netlink calls required to set up 'netdev' with the right qdisc
158 * and configure it according to 'details'. The implementation may assume
159 * that the current qdisc is the default; that is, there is no need for it
160 * to delete the current qdisc before installing itself.
162 * The contents of 'details' should be documented as valid for 'ovs_name'
163 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
164 * (which is built as ovs-vswitchd.conf.db(8)).
166 * This function must return 0 if and only if it sets 'netdev->tc' to an
167 * initialized 'struct tc'.
169 * (This function is null for tc_ops_other, which cannot be installed. For
170 * other TC classes it should always be nonnull.) */
171 int (*tc_install)(struct netdev *netdev, const struct shash *details);
173 /* Called when the netdev code determines (through a Netlink query) that
174 * this TC class's qdisc is installed on 'netdev', but we didn't install
175 * it ourselves and so don't know any of the details.
177 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
178 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
179 * implementation should parse the other attributes of 'nlmsg' as
180 * necessary to determine its configuration. If necessary it should also
181 * use Netlink queries to determine the configuration of queues on
184 * This function must return 0 if and only if it sets 'netdev->tc' to an
185 * initialized 'struct tc'. */
186 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
188 /* Destroys the data structures allocated by the implementation as part of
189 * 'tc'. (This includes destroying 'tc->queues' by calling
192 * The implementation should not need to perform any Netlink calls. If
193 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
194 * (But it may not be desirable.)
196 * This function may be null if 'tc' is trivial. */
197 void (*tc_destroy)(struct tc *tc);
199 /* Retrieves details of 'netdev->tc' configuration into 'details'.
201 * The implementation should not need to perform any Netlink calls, because
202 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
203 * cached the configuration.
205 * The contents of 'details' should be documented as valid for 'ovs_name'
206 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
207 * (which is built as ovs-vswitchd.conf.db(8)).
209 * This function may be null if 'tc' is not configurable.
211 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
213 /* Reconfigures 'netdev->tc' according to 'details', performing any
214 * required Netlink calls to complete the reconfiguration.
216 * The contents of 'details' should be documented as valid for 'ovs_name'
217 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
218 * (which is built as ovs-vswitchd.conf.db(8)).
220 * This function may be null if 'tc' is not configurable.
222 int (*qdisc_set)(struct netdev *, const struct shash *details);
224 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
225 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
227 * The contents of 'details' should be documented as valid for 'ovs_name'
228 * in the "other_config" column in the "Queue" table in
229 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
231 * The implementation should not need to perform any Netlink calls, because
232 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
233 * cached the queue configuration.
235 * This function may be null if 'tc' does not have queues ('n_queues' is
237 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
238 struct shash *details);
240 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
241 * 'details', performing any required Netlink calls to complete the
242 * reconfiguration. The caller ensures that 'queue_id' is less than
245 * The contents of 'details' should be documented as valid for 'ovs_name'
246 * in the "other_config" column in the "Queue" table in
247 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
249 * This function may be null if 'tc' does not have queues or its queues are
250 * not configurable. */
251 int (*class_set)(struct netdev *, unsigned int queue_id,
252 const struct shash *details);
254 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
255 * tc_queue's within 'netdev->tc->queues'.
257 * This function may be null if 'tc' does not have queues or its queues
258 * cannot be deleted. */
259 int (*class_delete)(struct netdev *, struct tc_queue *queue);
261 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
262 * 'struct tc_queue's within 'netdev->tc->queues'.
264 * On success, initializes '*stats'.
266 * This function may be null if 'tc' does not have queues or if it cannot
267 * report queue statistics. */
268 int (*class_get_stats)(const struct netdev *netdev,
269 const struct tc_queue *queue,
270 struct netdev_queue_stats *stats);
272 /* Extracts queue stats from 'nlmsg', which is a response to a
273 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
275 * This function may be null if 'tc' does not have queues or if it cannot
276 * report queue statistics. */
277 int (*class_dump_stats)(const struct netdev *netdev,
278 const struct ofpbuf *nlmsg,
279 netdev_dump_queue_stats_cb *cb, void *aux);
283 tc_init(struct tc *tc, const struct tc_ops *ops)
286 hmap_init(&tc->queues);
290 tc_destroy(struct tc *tc)
292 hmap_destroy(&tc->queues);
295 static const struct tc_ops tc_ops_htb;
296 static const struct tc_ops tc_ops_hfsc;
297 static const struct tc_ops tc_ops_default;
298 static const struct tc_ops tc_ops_other;
300 static const struct tc_ops *tcs[] = {
301 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
302 &tc_ops_hfsc, /* Hierarchical fair service curve. */
303 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
304 &tc_ops_other, /* Some other qdisc. */
308 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
309 static unsigned int tc_get_major(unsigned int handle);
310 static unsigned int tc_get_minor(unsigned int handle);
312 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
313 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
314 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
316 static struct tcmsg *tc_make_request(const struct netdev *, int type,
317 unsigned int flags, struct ofpbuf *);
318 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
320 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
321 struct nlattr **options);
322 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
323 struct nlattr **options,
324 struct netdev_queue_stats *);
325 static int tc_query_class(const struct netdev *,
326 unsigned int handle, unsigned int parent,
327 struct ofpbuf **replyp);
328 static int tc_delete_class(const struct netdev *, unsigned int handle);
330 static int tc_del_qdisc(struct netdev *netdev);
331 static int tc_query_qdisc(const struct netdev *netdev);
333 static int tc_calc_cell_log(unsigned int mtu);
334 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
335 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
336 const struct tc_ratespec *rate);
337 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
339 struct netdev_dev_linux {
340 struct netdev_dev netdev_dev;
342 struct shash_node *shash_node;
343 unsigned int cache_valid;
345 /* The following are figured out "on demand" only. They are only valid
346 * when the corresponding VALID_* bit in 'cache_valid' is set. */
348 uint8_t etheraddr[ETH_ADDR_LEN];
349 struct in_addr address, netmask;
353 bool is_internal; /* Is this an openvswitch internal device? */
354 bool is_tap; /* Is this a tuntap device? */
355 uint32_t kbits_rate; /* Policing data. */
356 uint32_t kbits_burst;
357 bool have_vport_stats;
361 struct tap_state tap;
365 struct netdev_linux {
366 struct netdev netdev;
370 /* An AF_INET socket (used for ioctl operations). */
371 static int af_inet_sock = -1;
373 /* A Netlink routing socket that is not subscribed to any multicast groups. */
374 static struct nl_sock *rtnl_sock;
376 struct netdev_linux_notifier {
377 struct netdev_notifier notifier;
381 static struct shash netdev_linux_notifiers =
382 SHASH_INITIALIZER(&netdev_linux_notifiers);
383 static struct rtnetlink_notifier netdev_linux_poll_notifier;
385 /* This is set pretty low because we probably won't learn anything from the
386 * additional log messages. */
387 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
389 static int netdev_linux_init(void);
391 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
392 int cmd, const char *cmd_name);
393 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
394 const char *cmd_name);
395 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
396 int cmd, const char *cmd_name);
397 static int get_flags(const struct netdev *, int *flagsp);
398 static int set_flags(struct netdev *, int flags);
399 static int do_get_ifindex(const char *netdev_name);
400 static int get_ifindex(const struct netdev *, int *ifindexp);
401 static int do_set_addr(struct netdev *netdev,
402 int ioctl_nr, const char *ioctl_name,
403 struct in_addr addr);
404 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
405 static int set_etheraddr(const char *netdev_name, int hwaddr_family,
406 const uint8_t[ETH_ADDR_LEN]);
407 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
408 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
411 is_netdev_linux_class(const struct netdev_class *netdev_class)
413 return netdev_class->init == netdev_linux_init;
416 static struct netdev_dev_linux *
417 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
419 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
420 assert(is_netdev_linux_class(netdev_class));
422 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
425 static struct netdev_linux *
426 netdev_linux_cast(const struct netdev *netdev)
428 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
429 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
430 assert(is_netdev_linux_class(netdev_class));
432 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
436 netdev_linux_init(void)
438 static int status = -1;
440 /* Create AF_INET socket. */
441 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
442 status = af_inet_sock >= 0 ? 0 : errno;
444 VLOG_ERR("failed to create inet socket: %s", strerror(status));
447 /* Create rtnetlink socket. */
449 status = nl_sock_create(NETLINK_ROUTE, 0, 0, 0, &rtnl_sock);
451 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
460 netdev_linux_run(void)
462 rtnetlink_link_notifier_run();
466 netdev_linux_wait(void)
468 rtnetlink_link_notifier_wait();
472 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
473 void *aux OVS_UNUSED)
475 struct netdev_dev_linux *dev;
477 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
479 const struct netdev_class *netdev_class =
480 netdev_dev_get_class(base_dev);
482 if (is_netdev_linux_class(netdev_class)) {
483 dev = netdev_dev_linux_cast(base_dev);
484 dev->cache_valid = 0;
488 struct shash device_shash;
489 struct shash_node *node;
491 shash_init(&device_shash);
492 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
493 SHASH_FOR_EACH (node, &device_shash) {
495 dev->cache_valid = 0;
497 shash_destroy(&device_shash);
501 /* Creates system and internal devices. */
503 netdev_linux_create(const struct netdev_class *class,
504 const char *name, const struct shash *args,
505 struct netdev_dev **netdev_devp)
507 struct netdev_dev_linux *netdev_dev;
510 if (!shash_is_empty(args)) {
511 VLOG_WARN("%s: arguments for %s devices should be empty",
515 if (!cache_notifier_refcount) {
516 error = rtnetlink_link_notifier_register(&netdev_linux_cache_notifier,
517 netdev_linux_cache_cb, NULL);
522 cache_notifier_refcount++;
524 netdev_dev = xzalloc(sizeof *netdev_dev);
525 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
527 *netdev_devp = &netdev_dev->netdev_dev;
531 /* For most types of netdevs we open the device for each call of
532 * netdev_open(). However, this is not the case with tap devices,
533 * since it is only possible to open the device once. In this
534 * situation we share a single file descriptor, and consequently
535 * buffers, across all readers. Therefore once data is read it will
536 * be unavailable to other reads for tap devices. */
538 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
539 const char *name, const struct shash *args,
540 struct netdev_dev **netdev_devp)
542 struct netdev_dev_linux *netdev_dev;
543 struct tap_state *state;
544 static const char tap_dev[] = "/dev/net/tun";
548 if (!shash_is_empty(args)) {
549 VLOG_WARN("%s: arguments for TAP devices should be empty", name);
552 netdev_dev = xzalloc(sizeof *netdev_dev);
553 state = &netdev_dev->state.tap;
555 /* Open tap device. */
556 state->fd = open(tap_dev, O_RDWR);
559 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
563 /* Create tap device. */
564 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
565 strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
566 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
567 VLOG_WARN("%s: creating tap device failed: %s", name,
573 /* Make non-blocking. */
574 error = set_nonblocking(state->fd);
579 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
580 *netdev_devp = &netdev_dev->netdev_dev;
589 destroy_tap(struct netdev_dev_linux *netdev_dev)
591 struct tap_state *state = &netdev_dev->state.tap;
593 if (state->fd >= 0) {
598 /* Destroys the netdev device 'netdev_dev_'. */
600 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
602 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
603 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
605 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
606 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
609 if (class == &netdev_linux_class || class == &netdev_internal_class) {
610 cache_notifier_refcount--;
612 if (!cache_notifier_refcount) {
613 rtnetlink_link_notifier_unregister(&netdev_linux_cache_notifier);
615 } else if (class == &netdev_tap_class) {
616 destroy_tap(netdev_dev);
625 netdev_linux_open(struct netdev_dev *netdev_dev_, int ethertype,
626 struct netdev **netdevp)
628 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
629 struct netdev_linux *netdev;
630 enum netdev_flags flags;
633 /* Allocate network device. */
634 netdev = xzalloc(sizeof *netdev);
636 netdev_init(&netdev->netdev, netdev_dev_);
638 /* Verify that the device really exists, by attempting to read its flags.
639 * (The flags might be cached, in which case this won't actually do an
642 * Don't do this for "internal" netdevs, though, because those have to be
643 * created as netdev objects before they exist in the kernel, because
644 * creating them in the kernel happens by passing a netdev object to
645 * dpif_port_add(). */
646 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
647 error = netdev_get_flags(&netdev->netdev, &flags);
648 if (error == ENODEV) {
653 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
654 !netdev_dev->state.tap.opened) {
656 /* We assume that the first user of the tap device is the primary user
657 * and give them the tap FD. Subsequent users probably just expect
658 * this to be a system device so open it normally to avoid send/receive
659 * directions appearing to be reversed. */
660 netdev->fd = netdev_dev->state.tap.fd;
661 netdev_dev->state.tap.opened = true;
662 } else if (ethertype != NETDEV_ETH_TYPE_NONE) {
663 struct sockaddr_ll sll;
667 /* Create file descriptor. */
668 protocol = (ethertype == NETDEV_ETH_TYPE_ANY ? ETH_P_ALL
669 : ethertype == NETDEV_ETH_TYPE_802_2 ? ETH_P_802_2
671 netdev->fd = socket(PF_PACKET, SOCK_RAW, htons(protocol));
672 if (netdev->fd < 0) {
677 /* Set non-blocking mode. */
678 error = set_nonblocking(netdev->fd);
683 /* Get ethernet device index. */
684 error = get_ifindex(&netdev->netdev, &ifindex);
689 /* Bind to specific ethernet device. */
690 memset(&sll, 0, sizeof sll);
691 sll.sll_family = AF_PACKET;
692 sll.sll_ifindex = ifindex;
694 (struct sockaddr *) &sll, sizeof sll) < 0) {
696 VLOG_ERR("bind to %s failed: %s", netdev_dev_get_name(netdev_dev_),
701 /* Between the socket() and bind() calls above, the socket receives all
702 * packets of the requested type on all system interfaces. We do not
703 * want to receive that data, but there is no way to avoid it. So we
704 * must now drain out the receive queue. */
705 error = drain_rcvbuf(netdev->fd);
711 *netdevp = &netdev->netdev;
715 netdev_uninit(&netdev->netdev, true);
719 /* Closes and destroys 'netdev'. */
721 netdev_linux_close(struct netdev *netdev_)
723 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
725 if (netdev->fd >= 0 && strcmp(netdev_get_type(netdev_), "tap")) {
731 /* Initializes 'svec' with a list of the names of all known network devices. */
733 netdev_linux_enumerate(struct svec *svec)
735 struct if_nameindex *names;
737 names = if_nameindex();
741 for (i = 0; names[i].if_name != NULL; i++) {
742 svec_add(svec, names[i].if_name);
744 if_freenameindex(names);
747 VLOG_WARN("could not obtain list of network device names: %s",
754 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
756 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
758 if (netdev->fd < 0) {
759 /* Device was opened with NETDEV_ETH_TYPE_NONE. */
764 ssize_t retval = read(netdev->fd, data, size);
767 } else if (errno != EINTR) {
768 if (errno != EAGAIN) {
769 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
770 netdev_get_name(netdev_), strerror(errno));
777 /* Registers with the poll loop to wake up from the next call to poll_block()
778 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
780 netdev_linux_recv_wait(struct netdev *netdev_)
782 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
783 if (netdev->fd >= 0) {
784 poll_fd_wait(netdev->fd, POLLIN);
788 /* Discards all packets waiting to be received from 'netdev'. */
790 netdev_linux_drain(struct netdev *netdev_)
792 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
793 if (netdev->fd < 0) {
795 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
797 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
798 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
802 drain_fd(netdev->fd, ifr.ifr_qlen);
805 return drain_rcvbuf(netdev->fd);
809 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
810 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
811 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
812 * the packet is too big or too small to transmit on the device.
814 * The caller retains ownership of 'buffer' in all cases.
816 * The kernel maintains a packet transmission queue, so the caller is not
817 * expected to do additional queuing of packets. */
819 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
821 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
823 /* XXX should support sending even if 'ethertype' was NETDEV_ETH_TYPE_NONE.
825 if (netdev->fd < 0) {
830 ssize_t retval = write(netdev->fd, data, size);
832 /* The Linux AF_PACKET implementation never blocks waiting for room
833 * for packets, instead returning ENOBUFS. Translate this into
834 * EAGAIN for the caller. */
835 if (errno == ENOBUFS) {
837 } else if (errno == EINTR) {
839 } else if (errno != EAGAIN) {
840 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
841 netdev_get_name(netdev_), strerror(errno));
844 } else if (retval != size) {
845 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
846 "%zu) on %s", retval, size, netdev_get_name(netdev_));
854 /* Registers with the poll loop to wake up from the next call to poll_block()
855 * when the packet transmission queue has sufficient room to transmit a packet
856 * with netdev_send().
858 * The kernel maintains a packet transmission queue, so the client is not
859 * expected to do additional queuing of packets. Thus, this function is
860 * unlikely to ever be used. It is included for completeness. */
862 netdev_linux_send_wait(struct netdev *netdev_)
864 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
865 if (netdev->fd < 0) {
867 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
868 poll_fd_wait(netdev->fd, POLLOUT);
870 /* TAP device always accepts packets.*/
871 poll_immediate_wake();
875 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
876 * otherwise a positive errno value. */
878 netdev_linux_set_etheraddr(struct netdev *netdev_,
879 const uint8_t mac[ETH_ADDR_LEN])
881 struct netdev_dev_linux *netdev_dev =
882 netdev_dev_linux_cast(netdev_get_dev(netdev_));
885 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
886 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
887 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
889 netdev_dev->cache_valid |= VALID_ETHERADDR;
890 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
898 /* Copies 'netdev''s MAC address into 'mac'. The caller provides the
899 * ETH_ADDR_LEN-byte buffer. */
901 netdev_linux_get_etheraddr(const struct netdev *netdev_,
902 uint8_t mac[ETH_ADDR_LEN])
904 struct netdev_dev_linux *netdev_dev =
905 netdev_dev_linux_cast(netdev_get_dev(netdev_));
906 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
907 int error = get_etheraddr(netdev_get_name(netdev_),
908 netdev_dev->etheraddr);
912 netdev_dev->cache_valid |= VALID_ETHERADDR;
914 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
918 /* Stores in '*mtup' the maximum size of transmitted (and received) packets on
919 * 'netdev', in bytes, not including the hardware header; thus, this is
920 * typically 1500 bytes for Ethernet devices. */
922 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
924 struct netdev_dev_linux *netdev_dev =
925 netdev_dev_linux_cast(netdev_get_dev(netdev_));
926 if (!(netdev_dev->cache_valid & VALID_MTU)) {
930 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
931 SIOCGIFMTU, "SIOCGIFMTU");
935 netdev_dev->mtu = ifr.ifr_mtu;
936 netdev_dev->cache_valid |= VALID_MTU;
938 *mtup = netdev_dev->mtu;
942 /* Returns the ifindex of 'netdev', if successful, as a positive number.
943 * On failure, returns a negative errno value. */
945 netdev_linux_get_ifindex(const struct netdev *netdev)
949 error = get_ifindex(netdev, &ifindex);
950 return error ? -error : ifindex;
954 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
956 struct netdev_dev_linux *netdev_dev =
957 netdev_dev_linux_cast(netdev_get_dev(netdev_));
962 if (!(netdev_dev->cache_valid & VALID_CARRIER)) {
966 fn = xasprintf("/sys/class/net/%s/carrier",
967 netdev_get_name(netdev_));
968 fd = open(fn, O_RDONLY);
971 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
975 retval = read(fd, line, sizeof line);
978 if (error == EINVAL) {
979 /* This is the normal return value when we try to check carrier
980 * if the network device is not up. */
982 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
985 } else if (retval == 0) {
987 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
991 if (line[0] != '0' && line[0] != '1') {
993 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)",
997 netdev_dev->carrier = line[0] != '0';
998 netdev_dev->cache_valid |= VALID_CARRIER;
1000 *carrier = netdev_dev->carrier;
1012 netdev_linux_do_miimon(const struct netdev *netdev, int cmd,
1013 const char *cmd_name, struct mii_ioctl_data *data)
1018 memset(&ifr, 0, sizeof ifr);
1019 memcpy(&ifr.ifr_data, data, sizeof *data);
1020 error = netdev_linux_do_ioctl(netdev_get_name(netdev),
1021 &ifr, cmd, cmd_name);
1022 memcpy(data, &ifr.ifr_data, sizeof *data);
1028 netdev_linux_get_miimon(const struct netdev *netdev, bool *miimon)
1030 const char *name = netdev_get_name(netdev);
1031 struct mii_ioctl_data data;
1036 memset(&data, 0, sizeof data);
1037 error = netdev_linux_do_miimon(netdev, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1039 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1040 data.reg_num = MII_BMSR;
1041 error = netdev_linux_do_miimon(netdev, SIOCGMIIREG, "SIOCGMIIREG",
1045 *miimon = !!(data.val_out & BMSR_LSTATUS);
1047 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1050 struct ethtool_cmd ecmd;
1052 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1055 memset(&ecmd, 0, sizeof ecmd);
1056 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1059 struct ethtool_value eval;
1061 memcpy(&eval, &ecmd, sizeof eval);
1062 *miimon = !!eval.data;
1064 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1071 /* Check whether we can use RTM_GETLINK to get network device statistics.
1072 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1075 check_for_working_netlink_stats(void)
1077 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1078 * preferable, so if that works, we'll use it. */
1079 int ifindex = do_get_ifindex("lo");
1081 VLOG_WARN("failed to get ifindex for lo, "
1082 "obtaining netdev stats from proc");
1085 struct netdev_stats stats;
1086 int error = get_stats_via_netlink(ifindex, &stats);
1088 VLOG_DBG("obtaining netdev stats via rtnetlink");
1091 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1092 "via proc (you are probably running a pre-2.6.19 "
1093 "kernel)", strerror(error));
1099 /* Brings the 'is_internal' and 'is_tap' members of 'netdev_dev' up-to-date. */
1101 netdev_linux_update_is_pseudo(struct netdev_dev_linux *netdev_dev)
1103 if (!(netdev_dev->cache_valid & VALID_IS_PSEUDO)) {
1104 const char *name = netdev_dev_get_name(&netdev_dev->netdev_dev);
1105 const char *type = netdev_dev_get_type(&netdev_dev->netdev_dev);
1107 netdev_dev->is_tap = !strcmp(type, "tap");
1108 netdev_dev->is_internal = false;
1109 if (!netdev_dev->is_tap) {
1110 struct ethtool_drvinfo drvinfo;
1113 memset(&drvinfo, 0, sizeof drvinfo);
1114 error = netdev_linux_do_ethtool(name,
1115 (struct ethtool_cmd *)&drvinfo,
1117 "ETHTOOL_GDRVINFO");
1119 if (!error && !strcmp(drvinfo.driver, "openvswitch")) {
1120 netdev_dev->is_internal = true;
1124 netdev_dev->cache_valid |= VALID_IS_PSEUDO;
1129 swap_uint64(uint64_t *a, uint64_t *b)
1136 /* Retrieves current device stats for 'netdev'. */
1138 netdev_linux_get_stats(const struct netdev *netdev_,
1139 struct netdev_stats *stats)
1141 struct netdev_dev_linux *netdev_dev =
1142 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1143 static int use_netlink_stats = -1;
1146 if (netdev_dev->have_vport_stats ||
1147 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1149 error = netdev_vport_get_stats(netdev_, stats);
1150 netdev_dev->have_vport_stats = !error;
1151 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
1154 if (!netdev_dev->have_vport_stats) {
1155 if (use_netlink_stats < 0) {
1156 use_netlink_stats = check_for_working_netlink_stats();
1158 if (use_netlink_stats) {
1161 error = get_ifindex(netdev_, &ifindex);
1163 error = get_stats_via_netlink(ifindex, stats);
1166 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1170 /* If this port is an internal port then the transmit and receive stats
1171 * will appear to be swapped relative to the other ports since we are the
1172 * one sending the data, not a remote computer. For consistency, we swap
1173 * them back here. This does not apply if we are getting stats from the
1174 * vport layer because it always tracks stats from the perspective of the
1176 netdev_linux_update_is_pseudo(netdev_dev);
1177 if (!error && !netdev_dev->have_vport_stats &&
1178 (netdev_dev->is_internal || netdev_dev->is_tap)) {
1179 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1180 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1181 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1182 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1183 stats->rx_length_errors = 0;
1184 stats->rx_over_errors = 0;
1185 stats->rx_crc_errors = 0;
1186 stats->rx_frame_errors = 0;
1187 stats->rx_fifo_errors = 0;
1188 stats->rx_missed_errors = 0;
1189 stats->tx_aborted_errors = 0;
1190 stats->tx_carrier_errors = 0;
1191 stats->tx_fifo_errors = 0;
1192 stats->tx_heartbeat_errors = 0;
1193 stats->tx_window_errors = 0;
1199 /* Stores the features supported by 'netdev' into each of '*current',
1200 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1201 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
1202 * successful, otherwise a positive errno value. */
1204 netdev_linux_get_features(struct netdev *netdev,
1205 uint32_t *current, uint32_t *advertised,
1206 uint32_t *supported, uint32_t *peer)
1208 struct ethtool_cmd ecmd;
1211 memset(&ecmd, 0, sizeof ecmd);
1212 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1213 ETHTOOL_GSET, "ETHTOOL_GSET");
1218 /* Supported features. */
1220 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1221 *supported |= OFPPF_10MB_HD;
1223 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1224 *supported |= OFPPF_10MB_FD;
1226 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1227 *supported |= OFPPF_100MB_HD;
1229 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1230 *supported |= OFPPF_100MB_FD;
1232 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1233 *supported |= OFPPF_1GB_HD;
1235 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1236 *supported |= OFPPF_1GB_FD;
1238 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1239 *supported |= OFPPF_10GB_FD;
1241 if (ecmd.supported & SUPPORTED_TP) {
1242 *supported |= OFPPF_COPPER;
1244 if (ecmd.supported & SUPPORTED_FIBRE) {
1245 *supported |= OFPPF_FIBER;
1247 if (ecmd.supported & SUPPORTED_Autoneg) {
1248 *supported |= OFPPF_AUTONEG;
1250 if (ecmd.supported & SUPPORTED_Pause) {
1251 *supported |= OFPPF_PAUSE;
1253 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1254 *supported |= OFPPF_PAUSE_ASYM;
1257 /* Advertised features. */
1259 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1260 *advertised |= OFPPF_10MB_HD;
1262 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1263 *advertised |= OFPPF_10MB_FD;
1265 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1266 *advertised |= OFPPF_100MB_HD;
1268 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1269 *advertised |= OFPPF_100MB_FD;
1271 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1272 *advertised |= OFPPF_1GB_HD;
1274 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1275 *advertised |= OFPPF_1GB_FD;
1277 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1278 *advertised |= OFPPF_10GB_FD;
1280 if (ecmd.advertising & ADVERTISED_TP) {
1281 *advertised |= OFPPF_COPPER;
1283 if (ecmd.advertising & ADVERTISED_FIBRE) {
1284 *advertised |= OFPPF_FIBER;
1286 if (ecmd.advertising & ADVERTISED_Autoneg) {
1287 *advertised |= OFPPF_AUTONEG;
1289 if (ecmd.advertising & ADVERTISED_Pause) {
1290 *advertised |= OFPPF_PAUSE;
1292 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1293 *advertised |= OFPPF_PAUSE_ASYM;
1296 /* Current settings. */
1297 if (ecmd.speed == SPEED_10) {
1298 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1299 } else if (ecmd.speed == SPEED_100) {
1300 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1301 } else if (ecmd.speed == SPEED_1000) {
1302 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1303 } else if (ecmd.speed == SPEED_10000) {
1304 *current = OFPPF_10GB_FD;
1309 if (ecmd.port == PORT_TP) {
1310 *current |= OFPPF_COPPER;
1311 } else if (ecmd.port == PORT_FIBRE) {
1312 *current |= OFPPF_FIBER;
1316 *current |= OFPPF_AUTONEG;
1319 /* Peer advertisements. */
1320 *peer = 0; /* XXX */
1325 /* Set the features advertised by 'netdev' to 'advertise'. */
1327 netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1329 struct ethtool_cmd ecmd;
1332 memset(&ecmd, 0, sizeof ecmd);
1333 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1334 ETHTOOL_GSET, "ETHTOOL_GSET");
1339 ecmd.advertising = 0;
1340 if (advertise & OFPPF_10MB_HD) {
1341 ecmd.advertising |= ADVERTISED_10baseT_Half;
1343 if (advertise & OFPPF_10MB_FD) {
1344 ecmd.advertising |= ADVERTISED_10baseT_Full;
1346 if (advertise & OFPPF_100MB_HD) {
1347 ecmd.advertising |= ADVERTISED_100baseT_Half;
1349 if (advertise & OFPPF_100MB_FD) {
1350 ecmd.advertising |= ADVERTISED_100baseT_Full;
1352 if (advertise & OFPPF_1GB_HD) {
1353 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1355 if (advertise & OFPPF_1GB_FD) {
1356 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1358 if (advertise & OFPPF_10GB_FD) {
1359 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1361 if (advertise & OFPPF_COPPER) {
1362 ecmd.advertising |= ADVERTISED_TP;
1364 if (advertise & OFPPF_FIBER) {
1365 ecmd.advertising |= ADVERTISED_FIBRE;
1367 if (advertise & OFPPF_AUTONEG) {
1368 ecmd.advertising |= ADVERTISED_Autoneg;
1370 if (advertise & OFPPF_PAUSE) {
1371 ecmd.advertising |= ADVERTISED_Pause;
1373 if (advertise & OFPPF_PAUSE_ASYM) {
1374 ecmd.advertising |= ADVERTISED_Asym_Pause;
1376 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1377 ETHTOOL_SSET, "ETHTOOL_SSET");
1380 /* If 'netdev_name' is the name of a VLAN network device (e.g. one created with
1381 * vconfig(8)), sets '*vlan_vid' to the VLAN VID associated with that device
1382 * and returns 0. Otherwise returns a errno value (specifically ENOENT if
1383 * 'netdev_name' is the name of a network device that is not a VLAN device) and
1384 * sets '*vlan_vid' to -1. */
1386 netdev_linux_get_vlan_vid(const struct netdev *netdev, int *vlan_vid)
1388 const char *netdev_name = netdev_get_name(netdev);
1389 struct ds line = DS_EMPTY_INITIALIZER;
1390 FILE *stream = NULL;
1394 COVERAGE_INC(netdev_get_vlan_vid);
1395 fn = xasprintf("/proc/net/vlan/%s", netdev_name);
1396 stream = fopen(fn, "r");
1402 if (ds_get_line(&line, stream)) {
1403 if (ferror(stream)) {
1405 VLOG_ERR_RL(&rl, "error reading \"%s\": %s", fn, strerror(errno));
1408 VLOG_ERR_RL(&rl, "unexpected end of file reading \"%s\"", fn);
1413 if (!sscanf(ds_cstr(&line), "%*s VID: %d", vlan_vid)) {
1415 VLOG_ERR_RL(&rl, "parse error reading \"%s\" line 1: \"%s\"",
1416 fn, ds_cstr(&line));
1434 #define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress"
1435 #define POLICE_CONFIG_CMD "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 u32 match ip src 0.0.0.0/0 police rate %dkbit burst %dk mtu 65535 drop flowid :1"
1437 /* Remove ingress policing from 'netdev'. Returns 0 if successful, otherwise a
1438 * positive errno value.
1440 * This function is equivalent to running
1441 * /sbin/tc qdisc del dev %s handle ffff: ingress
1442 * but it is much, much faster.
1445 netdev_linux_remove_policing(struct netdev *netdev)
1447 struct netdev_dev_linux *netdev_dev =
1448 netdev_dev_linux_cast(netdev_get_dev(netdev));
1449 const char *netdev_name = netdev_get_name(netdev);
1451 struct ofpbuf request;
1452 struct tcmsg *tcmsg;
1455 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
1459 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
1460 tcmsg->tcm_parent = TC_H_INGRESS;
1461 nl_msg_put_string(&request, TCA_KIND, "ingress");
1462 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
1464 error = tc_transact(&request, NULL);
1465 if (error && error != ENOENT && error != EINVAL) {
1466 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1467 netdev_name, strerror(error));
1471 netdev_dev->kbits_rate = 0;
1472 netdev_dev->kbits_burst = 0;
1473 netdev_dev->cache_valid |= VALID_POLICING;
1477 /* Attempts to set input rate limiting (policing) policy. */
1479 netdev_linux_set_policing(struct netdev *netdev,
1480 uint32_t kbits_rate, uint32_t kbits_burst)
1482 struct netdev_dev_linux *netdev_dev =
1483 netdev_dev_linux_cast(netdev_get_dev(netdev));
1484 const char *netdev_name = netdev_get_name(netdev);
1487 COVERAGE_INC(netdev_set_policing);
1489 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1490 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1491 : kbits_burst); /* Stick with user-specified value. */
1493 if (netdev_dev->cache_valid & VALID_POLICING
1494 && netdev_dev->kbits_rate == kbits_rate
1495 && netdev_dev->kbits_burst == kbits_burst) {
1496 /* Assume that settings haven't changed since we last set them. */
1500 netdev_linux_remove_policing(netdev);
1502 snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
1503 if (system(command) != 0) {
1504 VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
1508 snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
1509 kbits_rate, kbits_burst);
1510 if (system(command) != 0) {
1511 VLOG_WARN_RL(&rl, "%s: problem configuring policing",
1516 netdev_dev->kbits_rate = kbits_rate;
1517 netdev_dev->kbits_burst = kbits_burst;
1518 netdev_dev->cache_valid |= VALID_POLICING;
1525 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1528 const struct tc_ops **opsp;
1530 for (opsp = tcs; *opsp != NULL; opsp++) {
1531 const struct tc_ops *ops = *opsp;
1532 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1533 svec_add(types, ops->ovs_name);
1539 static const struct tc_ops *
1540 tc_lookup_ovs_name(const char *name)
1542 const struct tc_ops **opsp;
1544 for (opsp = tcs; *opsp != NULL; opsp++) {
1545 const struct tc_ops *ops = *opsp;
1546 if (!strcmp(name, ops->ovs_name)) {
1553 static const struct tc_ops *
1554 tc_lookup_linux_name(const char *name)
1556 const struct tc_ops **opsp;
1558 for (opsp = tcs; *opsp != NULL; opsp++) {
1559 const struct tc_ops *ops = *opsp;
1560 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1567 static struct tc_queue *
1568 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1571 struct netdev_dev_linux *netdev_dev =
1572 netdev_dev_linux_cast(netdev_get_dev(netdev));
1573 struct tc_queue *queue;
1575 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1576 if (queue->queue_id == queue_id) {
/* Returns 'netdev''s queue numbered 'queue_id', hashing the id with
 * hash_int(queue_id, 0), or NULL if there is no such queue. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    const size_t hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, hash);
}
1590 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1592 struct netdev_qos_capabilities *caps)
1594 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1598 caps->n_queues = ops->n_queues;
1603 netdev_linux_get_qos(const struct netdev *netdev,
1604 const char **typep, struct shash *details)
1606 struct netdev_dev_linux *netdev_dev =
1607 netdev_dev_linux_cast(netdev_get_dev(netdev));
1610 error = tc_query_qdisc(netdev);
1615 *typep = netdev_dev->tc->ops->ovs_name;
1616 return (netdev_dev->tc->ops->qdisc_get
1617 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
1622 netdev_linux_set_qos(struct netdev *netdev,
1623 const char *type, const struct shash *details)
1625 struct netdev_dev_linux *netdev_dev =
1626 netdev_dev_linux_cast(netdev_get_dev(netdev));
1627 const struct tc_ops *new_ops;
1630 new_ops = tc_lookup_ovs_name(type);
1631 if (!new_ops || !new_ops->tc_install) {
1635 error = tc_query_qdisc(netdev);
1640 if (new_ops == netdev_dev->tc->ops) {
1641 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1643 /* Delete existing qdisc. */
1644 error = tc_del_qdisc(netdev);
1648 assert(netdev_dev->tc == NULL);
1650 /* Install new qdisc. */
1651 error = new_ops->tc_install(netdev, details);
1652 assert((error == 0) == (netdev_dev->tc != NULL));
1659 netdev_linux_get_queue(const struct netdev *netdev,
1660 unsigned int queue_id, struct shash *details)
1662 struct netdev_dev_linux *netdev_dev =
1663 netdev_dev_linux_cast(netdev_get_dev(netdev));
1666 error = tc_query_qdisc(netdev);
1670 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1672 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
1678 netdev_linux_set_queue(struct netdev *netdev,
1679 unsigned int queue_id, const struct shash *details)
1681 struct netdev_dev_linux *netdev_dev =
1682 netdev_dev_linux_cast(netdev_get_dev(netdev));
1685 error = tc_query_qdisc(netdev);
1688 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1689 || !netdev_dev->tc->ops->class_set) {
1693 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
1697 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1699 struct netdev_dev_linux *netdev_dev =
1700 netdev_dev_linux_cast(netdev_get_dev(netdev));
1703 error = tc_query_qdisc(netdev);
1706 } else if (!netdev_dev->tc->ops->class_delete) {
1709 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1711 ? netdev_dev->tc->ops->class_delete(netdev, queue)
1717 netdev_linux_get_queue_stats(const struct netdev *netdev,
1718 unsigned int queue_id,
1719 struct netdev_queue_stats *stats)
1721 struct netdev_dev_linux *netdev_dev =
1722 netdev_dev_linux_cast(netdev_get_dev(netdev));
1725 error = tc_query_qdisc(netdev);
1728 } else if (!netdev_dev->tc->ops->class_get_stats) {
1731 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1733 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
1739 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1741 struct ofpbuf request;
1742 struct tcmsg *tcmsg;
1744 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1748 tcmsg->tcm_parent = 0;
1749 nl_dump_start(dump, rtnl_sock, &request);
1750 ofpbuf_uninit(&request);
1755 netdev_linux_dump_queues(const struct netdev *netdev,
1756 netdev_dump_queues_cb *cb, void *aux)
1758 struct netdev_dev_linux *netdev_dev =
1759 netdev_dev_linux_cast(netdev_get_dev(netdev));
1760 struct tc_queue *queue;
1761 struct shash details;
1765 error = tc_query_qdisc(netdev);
1768 } else if (!netdev_dev->tc->ops->class_get) {
1773 shash_init(&details);
1774 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
1775 shash_clear(&details);
1777 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
1779 (*cb)(queue->queue_id, &details, aux);
1784 shash_destroy(&details);
1790 netdev_linux_dump_queue_stats(const struct netdev *netdev,
1791 netdev_dump_queue_stats_cb *cb, void *aux)
1793 struct netdev_dev_linux *netdev_dev =
1794 netdev_dev_linux_cast(netdev_get_dev(netdev));
1795 struct nl_dump dump;
1800 error = tc_query_qdisc(netdev);
1803 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1808 if (!start_queue_dump(netdev, &dump)) {
1811 while (nl_dump_next(&dump, &msg)) {
1812 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1818 error = nl_dump_done(&dump);
1819 return error ? error : last_error;
1823 netdev_linux_get_in4(const struct netdev *netdev_,
1824 struct in_addr *address, struct in_addr *netmask)
1826 struct netdev_dev_linux *netdev_dev =
1827 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1829 if (!(netdev_dev->cache_valid & VALID_IN4)) {
1832 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
1833 SIOCGIFADDR, "SIOCGIFADDR");
1838 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
1839 SIOCGIFNETMASK, "SIOCGIFNETMASK");
1844 netdev_dev->cache_valid |= VALID_IN4;
1846 *address = netdev_dev->address;
1847 *netmask = netdev_dev->netmask;
1848 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
1852 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
1853 struct in_addr netmask)
1855 struct netdev_dev_linux *netdev_dev =
1856 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1859 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
1861 netdev_dev->cache_valid |= VALID_IN4;
1862 netdev_dev->address = address;
1863 netdev_dev->netmask = netmask;
1864 if (address.s_addr != INADDR_ANY) {
1865 error = do_set_addr(netdev_, SIOCSIFNETMASK,
1866 "SIOCSIFNETMASK", netmask);
/* Parses 'line', one line read from /proc/net/if_inet6: 16 two-digit hex
 * bytes of address, four hex fields that are skipped, and an interface name of
 * at most 16 characters.  Stores the address bytes in '*in6' and the name in
 * 'ifname'.  Returns true only if all 17 stored fields were converted. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *octet = in6->s6_addr;
    int n_fields;

#define X8 "%2"SCNx8
    n_fields = sscanf(line,
                      " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                      "%*x %*x %*x %*x %16s\n",
                      octet + 0, octet + 1, octet + 2, octet + 3,
                      octet + 4, octet + 5, octet + 6, octet + 7,
                      octet + 8, octet + 9, octet + 10, octet + 11,
                      octet + 12, octet + 13, octet + 14, octet + 15,
                      ifname);
    return n_fields == 17;
}
1888 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
1889 * 'in6' is non-null) and returns true. Otherwise, returns false. */
1891 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
1893 struct netdev_dev_linux *netdev_dev =
1894 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1895 if (!(netdev_dev->cache_valid & VALID_IN6)) {
1899 netdev_dev->in6 = in6addr_any;
1901 file = fopen("/proc/net/if_inet6", "r");
1903 const char *name = netdev_get_name(netdev_);
1904 while (fgets(line, sizeof line, file)) {
1905 struct in6_addr in6_tmp;
1906 char ifname[16 + 1];
1907 if (parse_if_inet6_line(line, &in6_tmp, ifname)
1908 && !strcmp(name, ifname))
1910 netdev_dev->in6 = in6_tmp;
1916 netdev_dev->cache_valid |= VALID_IN6;
1918 *in6 = netdev_dev->in6;
/* Overwrites '*sa' with an AF_INET socket address that holds 'addr' and port
 * 0.  Every other byte of '*sa' is zeroed. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    memset(&in4, 0, sizeof in4);
    in4.sin_port = 0;
    in4.sin_addr = addr;
    in4.sin_family = AF_INET;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &in4, sizeof in4);
}
1936 do_set_addr(struct netdev *netdev,
1937 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
1940 strncpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
1941 make_in4_sockaddr(&ifr.ifr_addr, addr);
1943 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
1947 /* Adds 'router' as a default IP gateway. */
1949 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
1951 struct in_addr any = { INADDR_ANY };
1955 memset(&rt, 0, sizeof rt);
1956 make_in4_sockaddr(&rt.rt_dst, any);
1957 make_in4_sockaddr(&rt.rt_gateway, router);
1958 make_in4_sockaddr(&rt.rt_genmask, any);
1959 rt.rt_flags = RTF_UP | RTF_GATEWAY;
1960 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
1962 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
1968 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
1971 static const char fn[] = "/proc/net/route";
1976 *netdev_name = NULL;
1977 stream = fopen(fn, "r");
1978 if (stream == NULL) {
1979 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
1984 while (fgets(line, sizeof line, stream)) {
1987 uint32_t dest, gateway, mask;
1988 int refcnt, metric, mtu;
1989 unsigned int flags, use, window, irtt;
1992 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
1994 iface, &dest, &gateway, &flags, &refcnt,
1995 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
1997 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2001 if (!(flags & RTF_UP)) {
2002 /* Skip routes that aren't up. */
2006 /* The output of 'dest', 'mask', and 'gateway' were given in
2007 * network byte order, so we don't need need any endian
2008 * conversions here. */
2009 if ((dest & mask) == (host->s_addr & mask)) {
2011 /* The host is directly reachable. */
2012 next_hop->s_addr = 0;
2014 /* To reach the host, we must go through a gateway. */
2015 next_hop->s_addr = gateway;
2017 *netdev_name = xstrdup(iface);
2028 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2029 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2030 * returns 0. Otherwise, it returns a positive errno value; in particular,
2031 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2033 netdev_linux_arp_lookup(const struct netdev *netdev,
2034 uint32_t ip, uint8_t mac[ETH_ADDR_LEN])
2037 struct sockaddr_in sin;
2040 memset(&r, 0, sizeof r);
2041 sin.sin_family = AF_INET;
2042 sin.sin_addr.s_addr = ip;
2044 memcpy(&r.arp_pa, &sin, sizeof sin);
2045 r.arp_ha.sa_family = ARPHRD_ETHER;
2047 strncpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2048 COVERAGE_INC(netdev_arp_lookup);
2049 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2051 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2052 } else if (retval != ENXIO) {
2053 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2054 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
2060 nd_to_iff_flags(enum netdev_flags nd)
2063 if (nd & NETDEV_UP) {
2066 if (nd & NETDEV_PROMISC) {
2073 iff_to_nd_flags(int iff)
2075 enum netdev_flags nd = 0;
2079 if (iff & IFF_PROMISC) {
2080 nd |= NETDEV_PROMISC;
2086 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2087 enum netdev_flags on, enum netdev_flags *old_flagsp)
2089 int old_flags, new_flags;
2092 error = get_flags(netdev, &old_flags);
2094 *old_flagsp = iff_to_nd_flags(old_flags);
2095 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2096 if (new_flags != old_flags) {
2097 error = set_flags(netdev, new_flags);
2104 poll_notify(struct list *list)
2106 struct netdev_linux_notifier *notifier;
2107 LIST_FOR_EACH (notifier, node, list) {
2108 struct netdev_notifier *n = ¬ifier->notifier;
2114 netdev_linux_poll_cb(const struct rtnetlink_link_change *change,
2115 void *aux OVS_UNUSED)
2118 struct list *list = shash_find_data(&netdev_linux_notifiers,
2124 struct shash_node *node;
2125 SHASH_FOR_EACH (node, &netdev_linux_notifiers) {
2126 poll_notify(node->data);
2132 netdev_linux_poll_add(struct netdev *netdev,
2133 void (*cb)(struct netdev_notifier *), void *aux,
2134 struct netdev_notifier **notifierp)
2136 const char *netdev_name = netdev_get_name(netdev);
2137 struct netdev_linux_notifier *notifier;
2140 if (shash_is_empty(&netdev_linux_notifiers)) {
2142 error = rtnetlink_link_notifier_register(&netdev_linux_poll_notifier,
2143 netdev_linux_poll_cb, NULL);
2149 list = shash_find_data(&netdev_linux_notifiers, netdev_name);
2151 list = xmalloc(sizeof *list);
2153 shash_add(&netdev_linux_notifiers, netdev_name, list);
2156 notifier = xmalloc(sizeof *notifier);
2157 netdev_notifier_init(¬ifier->notifier, netdev, cb, aux);
2158 list_push_back(list, ¬ifier->node);
2159 *notifierp = ¬ifier->notifier;
2164 netdev_linux_poll_remove(struct netdev_notifier *notifier_)
2166 struct netdev_linux_notifier *notifier =
2167 CONTAINER_OF(notifier_, struct netdev_linux_notifier, notifier);
2170 /* Remove 'notifier' from its list. */
2171 list = list_remove(¬ifier->node);
2172 if (list_is_empty(list)) {
2173 /* The list is now empty. Remove it from the hash and free it. */
2174 const char *netdev_name = netdev_get_name(notifier->notifier.netdev);
2175 shash_delete(&netdev_linux_notifiers,
2176 shash_find(&netdev_linux_notifiers, netdev_name));
2181 /* If that was the last notifier, unregister. */
2182 if (shash_is_empty(&netdev_linux_notifiers)) {
2183 rtnetlink_link_notifier_unregister(&netdev_linux_poll_notifier);
2187 #define NETDEV_LINUX_CLASS(NAME, CREATE, ENUMERATE, SET_STATS) \
2191 netdev_linux_init, \
2193 netdev_linux_wait, \
2196 netdev_linux_destroy, \
2197 NULL, /* reconfigure */ \
2199 netdev_linux_open, \
2200 netdev_linux_close, \
2204 netdev_linux_recv, \
2205 netdev_linux_recv_wait, \
2206 netdev_linux_drain, \
2208 netdev_linux_send, \
2209 netdev_linux_send_wait, \
2211 netdev_linux_set_etheraddr, \
2212 netdev_linux_get_etheraddr, \
2213 netdev_linux_get_mtu, \
2214 netdev_linux_get_ifindex, \
2215 netdev_linux_get_carrier, \
2216 netdev_linux_get_miimon, \
2217 netdev_linux_get_stats, \
2220 netdev_linux_get_features, \
2221 netdev_linux_set_advertisements, \
2222 netdev_linux_get_vlan_vid, \
2224 netdev_linux_set_policing, \
2225 netdev_linux_get_qos_types, \
2226 netdev_linux_get_qos_capabilities, \
2227 netdev_linux_get_qos, \
2228 netdev_linux_set_qos, \
2229 netdev_linux_get_queue, \
2230 netdev_linux_set_queue, \
2231 netdev_linux_delete_queue, \
2232 netdev_linux_get_queue_stats, \
2233 netdev_linux_dump_queues, \
2234 netdev_linux_dump_queue_stats, \
2236 netdev_linux_get_in4, \
2237 netdev_linux_set_in4, \
2238 netdev_linux_get_in6, \
2239 netdev_linux_add_router, \
2240 netdev_linux_get_next_hop, \
2241 NULL, /* get_status */ \
2242 netdev_linux_arp_lookup, \
2244 netdev_linux_update_flags, \
2246 netdev_linux_poll_add, \
2247 netdev_linux_poll_remove \
2250 const struct netdev_class netdev_linux_class =
2253 netdev_linux_create,
2254 netdev_linux_enumerate,
2255 NULL); /* set_stats */
2257 const struct netdev_class netdev_tap_class =
2260 netdev_linux_create_tap,
2261 NULL, /* enumerate */
2262 NULL); /* set_stats */
2264 const struct netdev_class netdev_internal_class =
2267 netdev_linux_create,
2268 NULL, /* enumerate */
2269 netdev_vport_set_stats);
2271 /* HTB traffic control class. */
2273 #define HTB_N_QUEUES 0xf000
2277 unsigned int max_rate; /* In bytes/s. */
2281 struct tc_queue tc_queue;
2282 unsigned int min_rate; /* In bytes/s. */
2283 unsigned int max_rate; /* In bytes/s. */
2284 unsigned int burst; /* In bytes. */
2285 unsigned int priority; /* Lower values are higher priorities. */
2289 htb_get__(const struct netdev *netdev)
2291 struct netdev_dev_linux *netdev_dev =
2292 netdev_dev_linux_cast(netdev_get_dev(netdev));
2293 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
2297 htb_install__(struct netdev *netdev, uint64_t max_rate)
2299 struct netdev_dev_linux *netdev_dev =
2300 netdev_dev_linux_cast(netdev_get_dev(netdev));
2303 htb = xmalloc(sizeof *htb);
2304 tc_init(&htb->tc, &tc_ops_htb);
2305 htb->max_rate = max_rate;
2307 netdev_dev->tc = &htb->tc;
2312 /* Create an HTB qdisc.
2314 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2316 htb_setup_qdisc__(struct netdev *netdev)
2319 struct tc_htb_glob opt;
2320 struct ofpbuf request;
2321 struct tcmsg *tcmsg;
2323 tc_del_qdisc(netdev);
2325 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2326 NLM_F_EXCL | NLM_F_CREATE, &request);
2330 tcmsg->tcm_handle = tc_make_handle(1, 0);
2331 tcmsg->tcm_parent = TC_H_ROOT;
2333 nl_msg_put_string(&request, TCA_KIND, "htb");
2335 memset(&opt, 0, sizeof opt);
2336 opt.rate2quantum = 10;
2340 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2341 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2342 nl_msg_end_nested(&request, opt_offset);
2344 return tc_transact(&request, NULL);
2347 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2348 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2350 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2351 unsigned int parent, struct htb_class *class)
2354 struct tc_htb_opt opt;
2355 struct ofpbuf request;
2356 struct tcmsg *tcmsg;
2360 netdev_get_mtu(netdev, &mtu);
2362 memset(&opt, 0, sizeof opt);
2363 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2364 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2365 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2366 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2367 opt.prio = class->priority;
2369 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2373 tcmsg->tcm_handle = handle;
2374 tcmsg->tcm_parent = parent;
2376 nl_msg_put_string(&request, TCA_KIND, "htb");
2377 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2378 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2379 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2380 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2381 nl_msg_end_nested(&request, opt_offset);
2383 error = tc_transact(&request, NULL);
2385 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2386 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2387 netdev_get_name(netdev),
2388 tc_get_major(handle), tc_get_minor(handle),
2389 tc_get_major(parent), tc_get_minor(parent),
2390 class->min_rate, class->max_rate,
2391 class->burst, class->priority, strerror(error));
2396 /* Parses Netlink attributes in 'nl_options' for HTB parameters and stores a
2397 * description of them into '*class'. The description complies with the
2398 * specification given in the vswitch database documentation for linux-htb
2401 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2403 static const struct nl_policy tca_htb_policy[] = {
2404 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2405 .min_len = sizeof(struct tc_htb_opt) },
2408 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2409 const struct tc_htb_opt *htb;
2411 if (!nl_parse_nested(nl_options, tca_htb_policy,
2412 attrs, ARRAY_SIZE(tca_htb_policy))) {
2413 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2417 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2418 class->min_rate = htb->rate.rate;
2419 class->max_rate = htb->ceil.rate;
2420 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2421 class->priority = htb->prio;
2426 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2427 struct htb_class *options,
2428 struct netdev_queue_stats *stats)
2430 struct nlattr *nl_options;
2431 unsigned int handle;
2434 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2435 if (!error && queue_id) {
2436 unsigned int major = tc_get_major(handle);
2437 unsigned int minor = tc_get_minor(handle);
2438 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2439 *queue_id = minor - 1;
2444 if (!error && options) {
2445 error = htb_parse_tca_options__(nl_options, options);
2451 htb_parse_qdisc_details__(struct netdev *netdev,
2452 const struct shash *details, struct htb_class *hc)
2454 const char *max_rate_s;
2456 max_rate_s = shash_find_data(details, "max-rate");
2457 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2458 if (!hc->max_rate) {
2461 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2462 hc->max_rate = netdev_features_to_bps(current) / 8;
2464 hc->min_rate = hc->max_rate;
2470 htb_parse_class_details__(struct netdev *netdev,
2471 const struct shash *details, struct htb_class *hc)
2473 const struct htb *htb = htb_get__(netdev);
2474 const char *min_rate_s = shash_find_data(details, "min-rate");
2475 const char *max_rate_s = shash_find_data(details, "max-rate");
2476 const char *burst_s = shash_find_data(details, "burst");
2477 const char *priority_s = shash_find_data(details, "priority");
2480 /* min-rate. Don't allow a min-rate below 1500 bytes/s. */
2482 /* min-rate is required. */
2485 hc->min_rate = strtoull(min_rate_s, NULL, 10) / 8;
2486 hc->min_rate = MAX(hc->min_rate, 1500);
2487 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2490 hc->max_rate = (max_rate_s
2491 ? strtoull(max_rate_s, NULL, 10) / 8
2493 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2494 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2498 * According to hints in the documentation that I've read, it is important
2499 * that 'burst' be at least as big as the largest frame that might be
2500 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2501 * but having it a bit too small is a problem. Since netdev_get_mtu()
2502 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2503 * the MTU. We actually add 64, instead of 14, as a guard against
2504 * additional headers getting tacked on somewhere that we're not aware of. */
2505 netdev_get_mtu(netdev, &mtu);
2506 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2507 hc->burst = MAX(hc->burst, mtu + 64);
2510 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for the class 'handle' under 'parent' on 'netdev' and
 * parses the reply into 'options' and 'stats' via htb_parse_tcmsg__().
 * Returns 0 on success, otherwise the error from the query or the parse. */
static int
htb_query_class__(const struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error = tc_query_class(netdev, handle, parent, &reply);

    if (error) {
        return error;
    }

    error = htb_parse_tcmsg__(reply, NULL, options, stats);
    ofpbuf_delete(reply);
    return error;
}
2532 htb_tc_install(struct netdev *netdev, const struct shash *details)
2536 error = htb_setup_qdisc__(netdev);
2538 struct htb_class hc;
2540 htb_parse_qdisc_details__(netdev, details, &hc);
2541 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2542 tc_make_handle(1, 0), &hc);
2544 htb_install__(netdev, hc.max_rate);
2550 static struct htb_class *
2551 htb_class_cast__(const struct tc_queue *queue)
2553 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2557 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2558 const struct htb_class *hc)
2560 struct htb *htb = htb_get__(netdev);
2561 size_t hash = hash_int(queue_id, 0);
2562 struct tc_queue *queue;
2563 struct htb_class *hcp;
2565 queue = tc_find_queue__(netdev, queue_id, hash);
2567 hcp = htb_class_cast__(queue);
2569 hcp = xmalloc(sizeof *hcp);
2570 queue = &hcp->tc_queue;
2571 queue->queue_id = queue_id;
2572 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2575 hcp->min_rate = hc->min_rate;
2576 hcp->max_rate = hc->max_rate;
2577 hcp->burst = hc->burst;
2578 hcp->priority = hc->priority;
2582 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2585 struct nl_dump dump;
2586 struct htb_class hc;
2589 /* Get qdisc options. */
2591 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2592 htb = htb_install__(netdev, hc.max_rate);
2595 if (!start_queue_dump(netdev, &dump)) {
2598 while (nl_dump_next(&dump, &msg)) {
2599 unsigned int queue_id;
2601 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2602 htb_update_queue__(netdev, queue_id, &hc);
2605 nl_dump_done(&dump);
2611 htb_tc_destroy(struct tc *tc)
2613 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2614 struct htb_class *hc, *next;
2616 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2617 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2625 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2627 const struct htb *htb = htb_get__(netdev);
2628 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
2633 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2635 struct htb_class hc;
2638 htb_parse_qdisc_details__(netdev, details, &hc);
2639 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2640 tc_make_handle(1, 0), &hc);
2642 htb_get__(netdev)->max_rate = hc.max_rate;
2648 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2649 const struct tc_queue *queue, struct shash *details)
2651 const struct htb_class *hc = htb_class_cast__(queue);
2653 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2654 if (hc->min_rate != hc->max_rate) {
2655 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2657 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2659 shash_add(details, "priority", xasprintf("%u", hc->priority));
2665 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2666 const struct shash *details)
2668 struct htb_class hc;
2671 error = htb_parse_class_details__(netdev, details, &hc);
2676 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2677 tc_make_handle(1, 0xfffe), &hc);
2682 htb_update_queue__(netdev, queue_id, &hc);
2687 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2689 struct htb_class *hc = htb_class_cast__(queue);
2690 struct htb *htb = htb_get__(netdev);
2693 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2695 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2702 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2703 struct netdev_queue_stats *stats)
2705 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2706 tc_make_handle(1, 0xfffe), NULL, stats);
2710 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2711 const struct ofpbuf *nlmsg,
2712 netdev_dump_queue_stats_cb *cb, void *aux)
2714 struct netdev_queue_stats stats;
2715 unsigned int handle, major, minor;
2718 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2723 major = tc_get_major(handle);
2724 minor = tc_get_minor(handle);
2725 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2726 (*cb)(minor - 1, &stats, aux);
2731 static const struct tc_ops tc_ops_htb = {
2732 "htb", /* linux_name */
2733 "linux-htb", /* ovs_name */
2734 HTB_N_QUEUES, /* n_queues */
2743 htb_class_get_stats,
2744 htb_class_dump_stats
2747 /* "linux-hfsc" traffic control class. */
2749 #define HFSC_N_QUEUES 0xf000
2757 struct tc_queue tc_queue;
2762 static struct hfsc *
2763 hfsc_get__(const struct netdev *netdev)
2765 struct netdev_dev_linux *netdev_dev;
2766 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2767 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
2770 static struct hfsc_class *
2771 hfsc_class_cast__(const struct tc_queue *queue)
2773 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
2776 static struct hfsc *
2777 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2779 struct netdev_dev_linux * netdev_dev;
2782 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2783 hfsc = xmalloc(sizeof *hfsc);
2784 tc_init(&hfsc->tc, &tc_ops_hfsc);
2785 hfsc->max_rate = max_rate;
2786 netdev_dev->tc = &hfsc->tc;
2792 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2793 const struct hfsc_class *hc)
2797 struct hfsc_class *hcp;
2798 struct tc_queue *queue;
2800 hfsc = hfsc_get__(netdev);
2801 hash = hash_int(queue_id, 0);
2803 queue = tc_find_queue__(netdev, queue_id, hash);
2805 hcp = hfsc_class_cast__(queue);
2807 hcp = xmalloc(sizeof *hcp);
2808 queue = &hcp->tc_queue;
2809 queue->queue_id = queue_id;
2810 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
2813 hcp->min_rate = hc->min_rate;
2814 hcp->max_rate = hc->max_rate;
2818 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2820 const struct tc_service_curve *rsc, *fsc, *usc;
2821 static const struct nl_policy tca_hfsc_policy[] = {
2823 .type = NL_A_UNSPEC,
2825 .min_len = sizeof(struct tc_service_curve),
2828 .type = NL_A_UNSPEC,
2830 .min_len = sizeof(struct tc_service_curve),
2833 .type = NL_A_UNSPEC,
2835 .min_len = sizeof(struct tc_service_curve),
2838 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
2840 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
2841 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
2842 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
2846 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
2847 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
2848 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
2850 if (rsc->m1 != 0 || rsc->d != 0 ||
2851 fsc->m1 != 0 || fsc->d != 0 ||
2852 usc->m1 != 0 || usc->d != 0) {
2853 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2854 "Non-linear service curves are not supported.");
2858 if (rsc->m2 != fsc->m2) {
2859 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2860 "Real-time service curves are not supported ");
2864 if (rsc->m2 > usc->m2) {
2865 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2866 "Min-rate service curve is greater than "
2867 "the max-rate service curve.");
2871 class->min_rate = fsc->m2;
2872 class->max_rate = usc->m2;
2877 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2878 struct hfsc_class *options,
2879 struct netdev_queue_stats *stats)
2882 unsigned int handle;
2883 struct nlattr *nl_options;
2885 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2891 unsigned int major, minor;
2893 major = tc_get_major(handle);
2894 minor = tc_get_minor(handle);
2895 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
2896 *queue_id = minor - 1;
2903 error = hfsc_parse_tca_options__(nl_options, options);
/* Queries the kernel for the class 'handle' with parent 'parent' on 'netdev'
 * via tc_query_class() and hands the reply to hfsc_parse_tcmsg__() together
 * with 'options' and 'stats'.
 *
 * Returns 0 if successful, otherwise the error reported by the query or by
 * the parser. */
static int
hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int retval = tc_query_class(netdev, handle, parent, &reply);

    if (!retval) {
        retval = hfsc_parse_tcmsg__(reply, NULL, options, stats);
        ofpbuf_delete(reply);
    }
    return retval;
}
2928 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
2929 struct hfsc_class *class)
2932 const char *max_rate_s;
2934 max_rate_s = shash_find_data(details, "max-rate");
2935 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2940 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2941 max_rate = netdev_features_to_bps(current) / 8;
2944 class->min_rate = max_rate;
2945 class->max_rate = max_rate;
2949 hfsc_parse_class_details__(struct netdev *netdev,
2950 const struct shash *details,
2951 struct hfsc_class * class)
2953 const struct hfsc *hfsc;
2954 uint32_t min_rate, max_rate;
2955 const char *min_rate_s, *max_rate_s;
2957 hfsc = hfsc_get__(netdev);
2958 min_rate_s = shash_find_data(details, "min-rate");
2959 max_rate_s = shash_find_data(details, "max-rate");
2965 min_rate = strtoull(min_rate_s, NULL, 10) / 8;
2966 min_rate = MAX(min_rate, 1500);
2967 min_rate = MIN(min_rate, hfsc->max_rate);
2969 max_rate = (max_rate_s
2970 ? strtoull(max_rate_s, NULL, 10) / 8
2972 max_rate = MAX(max_rate, min_rate);
2973 max_rate = MIN(max_rate, hfsc->max_rate);
2975 class->min_rate = min_rate;
2976 class->max_rate = max_rate;
2981 /* Create an HFSC qdisc.
2983 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
2985 hfsc_setup_qdisc__(struct netdev * netdev)
2987 struct tcmsg *tcmsg;
2988 struct ofpbuf request;
2989 struct tc_hfsc_qopt opt;
2991 tc_del_qdisc(netdev);
2993 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2994 NLM_F_EXCL | NLM_F_CREATE, &request);
3000 tcmsg->tcm_handle = tc_make_handle(1, 0);
3001 tcmsg->tcm_parent = TC_H_ROOT;
3003 memset(&opt, 0, sizeof opt);
3006 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3007 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3009 return tc_transact(&request, NULL);
3012 /* Create an HFSC class.
3014 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3015 * sc rate <min_rate> ul rate <max_rate>" */
3017 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3018 unsigned int parent, struct hfsc_class *class)
3022 struct tcmsg *tcmsg;
3023 struct ofpbuf request;
3024 struct tc_service_curve min, max;
3026 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3032 tcmsg->tcm_handle = handle;
3033 tcmsg->tcm_parent = parent;
3037 min.m2 = class->min_rate;
3041 max.m2 = class->max_rate;
3043 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3044 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3045 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3046 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3047 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3048 nl_msg_end_nested(&request, opt_offset);
3050 error = tc_transact(&request, NULL);
3052 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3053 "min-rate %ubps, max-rate %ubps (%s)",
3054 netdev_get_name(netdev),
3055 tc_get_major(handle), tc_get_minor(handle),
3056 tc_get_major(parent), tc_get_minor(parent),
3057 class->min_rate, class->max_rate, strerror(error));
3064 hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3067 struct hfsc_class class;
3069 error = hfsc_setup_qdisc__(netdev);
3075 hfsc_parse_qdisc_details__(netdev, details, &class);
3076 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3077 tc_make_handle(1, 0), &class);
3083 hfsc_install__(netdev, class.max_rate);
3088 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3092 struct nl_dump dump;
3093 struct hfsc_class hc;
3096 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3097 hfsc = hfsc_install__(netdev, hc.max_rate);
3099 if (!start_queue_dump(netdev, &dump)) {
3103 while (nl_dump_next(&dump, &msg)) {
3104 unsigned int queue_id;
3106 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3107 hfsc_update_queue__(netdev, queue_id, &hc);
3111 nl_dump_done(&dump);
3116 hfsc_tc_destroy(struct tc *tc)
3119 struct hfsc_class *hc, *next;
3121 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3123 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3124 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3133 hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3135 const struct hfsc *hfsc;
3136 hfsc = hfsc_get__(netdev);
3137 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
3142 hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3145 struct hfsc_class class;
3147 hfsc_parse_qdisc_details__(netdev, details, &class);
3148 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3149 tc_make_handle(1, 0), &class);
3152 hfsc_get__(netdev)->max_rate = class.max_rate;
3159 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3160 const struct tc_queue *queue, struct shash *details)
3162 const struct hfsc_class *hc;
3164 hc = hfsc_class_cast__(queue);
3165 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3166 if (hc->min_rate != hc->max_rate) {
3167 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
3173 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3174 const struct shash *details)
3177 struct hfsc_class class;
3179 error = hfsc_parse_class_details__(netdev, details, &class);
3184 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3185 tc_make_handle(1, 0xfffe), &class);
3190 hfsc_update_queue__(netdev, queue_id, &class);
3195 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3199 struct hfsc_class *hc;
3201 hc = hfsc_class_cast__(queue);
3202 hfsc = hfsc_get__(netdev);
3204 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3206 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3213 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3214 struct netdev_queue_stats *stats)
3216 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3217 tc_make_handle(1, 0xfffe), NULL, stats);
3221 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3222 const struct ofpbuf *nlmsg,
3223 netdev_dump_queue_stats_cb *cb, void *aux)
3225 struct netdev_queue_stats stats;
3226 unsigned int handle, major, minor;
3229 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3234 major = tc_get_major(handle);
3235 minor = tc_get_minor(handle);
3236 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3237 (*cb)(minor - 1, &stats, aux);
3242 static const struct tc_ops tc_ops_hfsc = {
3243 "hfsc", /* linux_name */
3244 "linux-hfsc", /* ovs_name */
3245 HFSC_N_QUEUES, /* n_queues */
3246 hfsc_tc_install, /* tc_install */
3247 hfsc_tc_load, /* tc_load */
3248 hfsc_tc_destroy, /* tc_destroy */
3249 hfsc_qdisc_get, /* qdisc_get */
3250 hfsc_qdisc_set, /* qdisc_set */
3251 hfsc_class_get, /* class_get */
3252 hfsc_class_set, /* class_set */
3253 hfsc_class_delete, /* class_delete */
3254 hfsc_class_get_stats, /* class_get_stats */
3255 hfsc_class_dump_stats /* class_dump_stats */
3258 /* "linux-default" traffic control class.
3260 * This class represents the default, unnamed Linux qdisc. It corresponds to
3261 * the "" (empty string) QoS type in the OVS database. */
3264 default_install__(struct netdev *netdev)
3266 struct netdev_dev_linux *netdev_dev =
3267 netdev_dev_linux_cast(netdev_get_dev(netdev));
3268 static struct tc *tc;
3271 tc = xmalloc(sizeof *tc);
3272 tc_init(tc, &tc_ops_default);
3274 netdev_dev->tc = tc;
3278 default_tc_install(struct netdev *netdev,
3279 const struct shash *details OVS_UNUSED)
3281 default_install__(netdev);
3286 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3288 default_install__(netdev);
3292 static const struct tc_ops tc_ops_default = {
3293 NULL, /* linux_name */
3298 NULL, /* tc_destroy */
3299 NULL, /* qdisc_get */
3300 NULL, /* qdisc_set */
3301 NULL, /* class_get */
3302 NULL, /* class_set */
3303 NULL, /* class_delete */
3304 NULL, /* class_get_stats */
3305 NULL /* class_dump_stats */
/* "linux-other" traffic control class. */
3313 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3315 struct netdev_dev_linux *netdev_dev =
3316 netdev_dev_linux_cast(netdev_get_dev(netdev));
3317 static struct tc *tc;
3320 tc = xmalloc(sizeof *tc);
3321 tc_init(tc, &tc_ops_other);
3323 netdev_dev->tc = tc;
3327 static const struct tc_ops tc_ops_other = {
3328 NULL, /* linux_name */
3329 "linux-other", /* ovs_name */
3331 NULL, /* tc_install */
3333 NULL, /* tc_destroy */
3334 NULL, /* qdisc_get */
3335 NULL, /* qdisc_set */
3336 NULL, /* class_get */
3337 NULL, /* class_set */
3338 NULL, /* class_delete */
3339 NULL, /* class_get_stats */
3340 NULL /* class_dump_stats */
3343 /* Traffic control. */
3345 /* Number of kernel "tc" ticks per second. */
3346 static double ticks_per_s;
/* Number of kernel "jiffies" per second.  This is used for the purpose of
 * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
 * one jiffy's worth of data.
 *
 * There are two possibilities here:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
 *      approximate range of 100 to 1024.  That means that we really need to
 *      make sure that the qdisc can buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
 *      has finely granular timers and there's no need to fudge additional room
 *      for buffers.  (There's no extra effort needed to implement that: the
 *      large 'buffer_hz' is used as a divisor, so practically any number will
 *      come out as 0 in the division.  Small integer results in the case of
 *      really high dividends won't have any real effect anyhow.)
 */
3365 static unsigned int buffer_hz;
/* Returns the tc handle 'major':'minor', that is, 'major' in the upper 16
 * bits and 'minor' in the lower 16 bits. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    const unsigned int major_bits = major << 16;

    return TC_H_MAKE(major_bits, minor);
}
/* Returns the major number of 'handle', i.e. its upper 16 bits shifted down
 * to the low end. */
static unsigned int
tc_get_major(unsigned int handle)
{
    const unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
/* Returns the minor number of 'handle', i.e. its lower 16 bits. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    const unsigned int minor = TC_H_MIN(handle);

    return minor;
}
3388 static struct tcmsg *
3389 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3390 struct ofpbuf *request)
3392 struct tcmsg *tcmsg;
3396 error = get_ifindex(netdev, &ifindex);
3401 ofpbuf_init(request, 512);
3402 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3403 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3404 tcmsg->tcm_family = AF_UNSPEC;
3405 tcmsg->tcm_ifindex = ifindex;
3406 /* Caller should fill in tcmsg->tcm_handle. */
3407 /* Caller should fill in tcmsg->tcm_parent. */
3413 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3415 int error = nl_sock_transact(rtnl_sock, request, replyp);
3416 ofpbuf_uninit(request);
3423 /* The values in psched are not individually very meaningful, but they are
3424 * important. The tables below show some values seen in the wild.
3428 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3429 * (Before that, there are hints that it was 1000000000.)
3431 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3435 * -----------------------------------
3436 * [1] 000c8000 000f4240 000f4240 00000064
3437 * [2] 000003e8 00000400 000f4240 3b9aca00
3438 * [3] 000003e8 00000400 000f4240 3b9aca00
3439 * [4] 000003e8 00000400 000f4240 00000064
3440 * [5] 000003e8 00000040 000f4240 3b9aca00
3441 * [6] 000003e8 00000040 000f4240 000000f9
3443 * a b c d ticks_per_s buffer_hz
3444 * ------- --------- ---------- ------------- ----------- -------------
3445 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3446 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3447 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3448 * [4] 1,000 1,024 1,000,000 100 976,562 100
3449 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3450 * [6] 1,000 64 1,000,000 249 15,625,000 249
3452 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3453 * [2] 2.6.26-1-686-bigmem from Debian lenny
3454 * [3] 2.6.26-2-sparc64 from Debian lenny
3455 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3456 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3457 * [6] 2.6.34 from kernel.org on KVM
3459 static const char fn[] = "/proc/net/psched";
3460 unsigned int a, b, c, d;
3466 stream = fopen(fn, "r");
3468 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3472 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3473 VLOG_WARN("%s: read failed", fn);
3477 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3481 VLOG_WARN("%s: invalid scheduler parameters", fn);
3485 ticks_per_s = (double) a * c / b;
3489 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3492 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3495 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3496 * rate of 'rate' bytes per second. */
3498 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3503 return (rate * ticks) / ticks_per_s;
3506 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3507 * rate of 'rate' bytes per second. */
3509 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3514 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3517 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3518 * a transmission rate of 'rate' bytes per second. */
3520 tc_buffer_per_jiffy(unsigned int rate)
3525 return rate / buffer_hz;
3528 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3529 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3530 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3531 * stores NULL into it if it is absent.
3533 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3536 * Returns 0 if successful, otherwise a positive errno value. */
3538 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3539 struct nlattr **options)
3541 static const struct nl_policy tca_policy[] = {
3542 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3543 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3545 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3547 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3548 tca_policy, ta, ARRAY_SIZE(ta))) {
3549 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3554 *kind = nl_attr_get_string(ta[TCA_KIND]);
3558 *options = ta[TCA_OPTIONS];
3573 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3574 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3575 * into '*options', and its queue statistics into '*stats'. Any of the output
3576 * arguments may be null.
3578 * Returns 0 if successful, otherwise a positive errno value. */
3580 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3581 struct nlattr **options, struct netdev_queue_stats *stats)
3583 static const struct nl_policy tca_policy[] = {
3584 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3585 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3587 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3589 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3590 tca_policy, ta, ARRAY_SIZE(ta))) {
3591 VLOG_WARN_RL(&rl, "failed to parse class message");
3596 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3597 *handlep = tc->tcm_handle;
3601 *options = ta[TCA_OPTIONS];
3605 const struct gnet_stats_queue *gsq;
3606 struct gnet_stats_basic gsb;
3608 static const struct nl_policy stats_policy[] = {
3609 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3610 .min_len = sizeof gsb },
3611 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3612 .min_len = sizeof *gsq },
3614 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3616 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3617 sa, ARRAY_SIZE(sa))) {
3618 VLOG_WARN_RL(&rl, "failed to parse class stats");
3622 /* Alignment issues screw up the length of struct gnet_stats_basic on
3623 * some arch/bitsize combinations. Newer versions of Linux have a
3624 * struct gnet_stats_basic_packed, but we can't depend on that. The
3625 * easiest thing to do is just to make a copy. */
3626 memset(&gsb, 0, sizeof gsb);
3627 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3628 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3629 stats->tx_bytes = gsb.bytes;
3630 stats->tx_packets = gsb.packets;
3632 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3633 stats->tx_errors = gsq->drops;
3643 memset(stats, 0, sizeof *stats);
3648 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3651 tc_query_class(const struct netdev *netdev,
3652 unsigned int handle, unsigned int parent,
3653 struct ofpbuf **replyp)
3655 struct ofpbuf request;
3656 struct tcmsg *tcmsg;
3659 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3663 tcmsg->tcm_handle = handle;
3664 tcmsg->tcm_parent = parent;
3666 error = tc_transact(&request, replyp);
3668 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3669 netdev_get_name(netdev),
3670 tc_get_major(handle), tc_get_minor(handle),
3671 tc_get_major(parent), tc_get_minor(parent),
3677 /* Equivalent to "tc class del dev <name> handle <handle>". */
3679 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3681 struct ofpbuf request;
3682 struct tcmsg *tcmsg;
3685 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3689 tcmsg->tcm_handle = handle;
3690 tcmsg->tcm_parent = 0;
3692 error = tc_transact(&request, NULL);
3694 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3695 netdev_get_name(netdev),
3696 tc_get_major(handle), tc_get_minor(handle),
3702 /* Equivalent to "tc qdisc del dev <name> root". */
3704 tc_del_qdisc(struct netdev *netdev)
3706 struct netdev_dev_linux *netdev_dev =
3707 netdev_dev_linux_cast(netdev_get_dev(netdev));
3708 struct ofpbuf request;
3709 struct tcmsg *tcmsg;
3712 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
3716 tcmsg->tcm_handle = tc_make_handle(1, 0);
3717 tcmsg->tcm_parent = TC_H_ROOT;
3719 error = tc_transact(&request, NULL);
3720 if (error == EINVAL) {
3721 /* EINVAL probably means that the default qdisc was in use, in which
3722 * case we've accomplished our purpose. */
3725 if (!error && netdev_dev->tc) {
3726 if (netdev_dev->tc->ops->tc_destroy) {
3727 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3729 netdev_dev->tc = NULL;
3734 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3735 * kernel to determine what they are. Returns 0 if successful, otherwise a
3736 * positive errno value. */
3738 tc_query_qdisc(const struct netdev *netdev)
3740 struct netdev_dev_linux *netdev_dev =
3741 netdev_dev_linux_cast(netdev_get_dev(netdev));
3742 struct ofpbuf request, *qdisc;
3743 const struct tc_ops *ops;
3744 struct tcmsg *tcmsg;
3748 if (netdev_dev->tc) {
3752 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3753 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3754 * 2.6.35 without that fix backported to it.
3756 * To avoid the OOPS, we must not make a request that would attempt to dump
3757 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3758 * few others. There are a few ways that I can see to do this, but most of
3759 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3760 * technique chosen here is to assume that any non-default qdisc that we
3761 * create will have a class with handle 1:0. The built-in qdiscs only have
3762 * a class with handle 0:0.
3764 * We could check for Linux 2.6.35+ and use a more straightforward method
3766 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
3770 tcmsg->tcm_handle = tc_make_handle(1, 0);
3771 tcmsg->tcm_parent = 0;
3773 /* Figure out what tc class to instantiate. */
3774 error = tc_transact(&request, &qdisc);
3778 error = tc_parse_qdisc(qdisc, &kind, NULL);
3780 ops = &tc_ops_other;
3782 ops = tc_lookup_linux_name(kind);
3784 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3785 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3787 ops = &tc_ops_other;
3790 } else if (error == ENOENT) {
3791 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3792 * other entity that doesn't have a handle 1:0. We will assume
3793 * that it's the system default qdisc. */
3794 ops = &tc_ops_default;
3797 /* Who knows? Maybe the device got deleted. */
3798 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3799 netdev_get_name(netdev), strerror(error));
3800 ops = &tc_ops_other;
3803 /* Instantiate it. */
3804 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
3805 assert((load_error == 0) == (netdev_dev->tc != NULL));
3806 ofpbuf_delete(qdisc);
3808 return error ? error : load_error;
3811 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3812 approximate the time to transmit packets of various lengths. For an MTU of
3813 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3814 represents two possible packet lengths; for a MTU of 513 through 1024, four
3815 possible lengths; and so on.
3817 Returns, for the specified 'mtu', the number of bits that packet lengths
3818 need to be shifted right to fit within such a 256-entry table. */
3820 tc_calc_cell_log(unsigned int mtu)
3825 mtu = ETH_PAYLOAD_MAX;
3827 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
3829 for (cell_log = 0; mtu >= 256; cell_log++) {
3836 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
3839 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
3841 memset(rate, 0, sizeof *rate);
3842 rate->cell_log = tc_calc_cell_log(mtu);
3843 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3844 /* rate->cell_align = 0; */ /* distro headers. */
3845 rate->mpu = ETH_TOTAL_MIN;
3849 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
3850 * attribute of the specified "type".
3852 * See tc_calc_cell_log() above for a description of "rtab"s. */
3854 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
3859 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
3860 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
3861 unsigned packet_size = (i + 1) << rate->cell_log;
3862 if (packet_size < rate->mpu) {
3863 packet_size = rate->mpu;
3865 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.)
 *
 * The burst actually used is the larger of 'burst_bytes' and one jiffy's
 * worth of data at 'Bps' plus 'mtu'; the result is that burst expressed in
 * ticks. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int smallest_burst = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t burst;

    if (burst_bytes > smallest_burst) {
        burst = burst_bytes;
    } else {
        burst = smallest_burst;
    }
    return tc_bytes_to_ticks(Bps, burst);
}
3881 /* Utility functions. */
/* Queries link statistics for the interface with index 'ifindex' by sending
 * an RTM_GETLINK request on 'rtnl_sock' and copies the counters from the
 * reply's IFLA_STATS attribute into '*stats'.
 *
 * The reply buffer is released with ofpbuf_delete() on each path visible here
 * (policy parse failure, missing IFLA_STATS, and after the copy).
 *
 * NOTE(review): the return statements are not visible in this excerpt, so the
 * exact error values are not documented here. */
3884 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
3886 /* Policy for RTNLGRP_LINK messages.
3888 * There are *many* more fields in these messages, but currently we only
3889 * care about these fields. */
3890 static const struct nl_policy rtnlgrp_link_policy[] = {
3891 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
3892 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
3893 .min_len = sizeof(struct rtnl_link_stats) },
3896 struct ofpbuf request;
3897 struct ofpbuf *reply;
3898 struct ifinfomsg *ifi;
3899 const struct rtnl_link_stats *rtnl_stats;
3900 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
/* Build the request: a Netlink header followed by a zeroed ifinfomsg that
 * names the interface by index only. */
3903 ofpbuf_init(&request, 0);
3904 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
3905 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
3906 ifi->ifi_family = PF_UNSPEC;
3907 ifi->ifi_index = ifindex;
/* The request buffer is no longer needed once the transaction has been
 * attempted, whether or not it succeeded. */
3908 error = nl_sock_transact(rtnl_sock, &request, &reply);
3909 ofpbuf_uninit(&request);
/* Attributes start after the Netlink header and the ifinfomsg. */
3914 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
3915 rtnlgrp_link_policy,
3916 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
3917 ofpbuf_delete(reply);
/* IFLA_STATS is optional in the policy, so its absence must be checked
 * explicitly. */
3921 if (!attrs[IFLA_STATS]) {
3922 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
3923 ofpbuf_delete(reply);
/* The policy's 'min_len' guarantees the attribute is at least as large as
 * struct rtnl_link_stats, so it can be read as one.  Copy field by field. */
3927 rtnl_stats = nl_attr_get(attrs[IFLA_STATS]);
3928 stats->rx_packets = rtnl_stats->rx_packets;
3929 stats->tx_packets = rtnl_stats->tx_packets;
3930 stats->rx_bytes = rtnl_stats->rx_bytes;
3931 stats->tx_bytes = rtnl_stats->tx_bytes;
3932 stats->rx_errors = rtnl_stats->rx_errors;
3933 stats->tx_errors = rtnl_stats->tx_errors;
3934 stats->rx_dropped = rtnl_stats->rx_dropped;
3935 stats->tx_dropped = rtnl_stats->tx_dropped;
3936 stats->multicast = rtnl_stats->multicast;
3937 stats->collisions = rtnl_stats->collisions;
3938 stats->rx_length_errors = rtnl_stats->rx_length_errors;
3939 stats->rx_over_errors = rtnl_stats->rx_over_errors;
3940 stats->rx_crc_errors = rtnl_stats->rx_crc_errors;
3941 stats->rx_frame_errors = rtnl_stats->rx_frame_errors;
3942 stats->rx_fifo_errors = rtnl_stats->rx_fifo_errors;
3943 stats->rx_missed_errors = rtnl_stats->rx_missed_errors;
3944 stats->tx_aborted_errors = rtnl_stats->tx_aborted_errors;
3945 stats->tx_carrier_errors = rtnl_stats->tx_carrier_errors;
3946 stats->tx_fifo_errors = rtnl_stats->tx_fifo_errors;
3947 stats->tx_heartbeat_errors = rtnl_stats->tx_heartbeat_errors;
3948 stats->tx_window_errors = rtnl_stats->tx_window_errors;
/* 'rtnl_stats' points into 'reply', so the reply is freed only after the
 * copy above. */
3950 ofpbuf_delete(reply);
/* Looks up statistics for the device named 'netdev_name' by reading
 * "/proc/net/dev" line by line and parsing each line with sscanf() into
 * '*stats'.
 *
 * A line is accepted only when sscanf() reports 15 conversions.  When the
 * parsed device name equals 'netdev_name', the counters listed near the end of
 * this function are set to UINT64_MAX.
 *
 * NOTE(review): several lines of this function (local declarations, part of
 * the sscanf() argument list, stream cleanup and return statements) are not
 * visible in this excerpt, so return values are not documented here. */
3956 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
3958 static const char fn[] = "/proc/net/dev";
3963 stream = fopen(fn, "r");
3965 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
3970 while (fgets(line, sizeof line, stream)) {
/* X64 is one stored SCNu64 conversion; each "%*u" field is parsed but not
 * stored. */
3973 #define X64 "%"SCNu64
3976 X64 X64 X64 X64 X64 X64 X64 "%*u"
3977 X64 X64 X64 X64 X64 X64 X64 "%*u",
3983 &stats->rx_fifo_errors,
3984 &stats->rx_frame_errors,
3990 &stats->tx_fifo_errors,
3992 &stats->tx_carrier_errors) != 15) {
3993 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
3994 } else if (!strcmp(devname, netdev_name)) {
/* These counters are marked with UINT64_MAX rather than parsed, presumably
 * because /proc/net/dev does not report them (verify against the full
 * sscanf() argument list). */
3995 stats->rx_length_errors = UINT64_MAX;
3996 stats->rx_over_errors = UINT64_MAX;
3997 stats->rx_crc_errors = UINT64_MAX;
3998 stats->rx_missed_errors = UINT64_MAX;
3999 stats->tx_aborted_errors = UINT64_MAX;
4000 stats->tx_heartbeat_errors = UINT64_MAX;
4001 stats->tx_window_errors = UINT64_MAX;
4007 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Reads the interface flags of 'netdev' with the SIOCGIFFLAGS ioctl, issued
 * through netdev_linux_do_ioctl() under the device's name, and stores
 * 'ifr.ifr_flags' in '*flags'.
 *
 * NOTE(review): whether '*flags' is written when the ioctl fails cannot be
 * determined from the lines visible here. */
4013 get_flags(const struct netdev *netdev, int *flags)
4018 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
4020 *flags = ifr.ifr_flags;
/* Sets the interface flags of 'netdev' to 'flags' with the SIOCSIFFLAGS
 * ioctl.  Returns whatever netdev_linux_do_ioctl() returns. */
4025 set_flags(struct netdev *netdev, int flags)
4029 ifr.ifr_flags = flags;
4030 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
/* Asks the kernel for the interface index of the device named 'netdev_name'
 * using the SIOCGIFINDEX ioctl on 'af_inet_sock'.
 *
 * On success returns 'ifr.ifr_ifindex'.  On failure a rate-limited warning is
 * logged; the value returned in that case is not visible in this excerpt. */
4035 do_get_ifindex(const char *netdev_name)
/* NOTE(review): strncpy() leaves 'ifr.ifr_name' without a terminating null
 * byte when 'netdev_name' is at least sizeof ifr.ifr_name characters long, and
 * no memset() of 'ifr' is visible in this function.  Confirm this is
 * acceptable. */
4039 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4040 COVERAGE_INC(netdev_get_ifindex);
4041 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4042 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4043 netdev_name, strerror(errno));
4046 return ifr.ifr_ifindex;
/* Stores the interface index of 'netdev_' in '*ifindexp', using the value
 * cached in the underlying netdev_dev_linux when one is available.
 *
 * The cache is considered populated when the VALID_IFINDEX bit is set in
 * 'cache_valid'.  Otherwise do_get_ifindex() is called and its result is
 * cached.
 *
 * NOTE(review): the handling of a failed do_get_ifindex() call and the return
 * statements are not visible in this excerpt. */
4050 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4052 struct netdev_dev_linux *netdev_dev =
4053 netdev_dev_linux_cast(netdev_get_dev(netdev_));
/* Cache miss: query the kernel once and remember the answer. */
4055 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4056 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4060 netdev_dev->cache_valid |= VALID_IFINDEX;
4061 netdev_dev->ifindex = ifindex;
4063 *ifindexp = netdev_dev->ifindex;
/* Reads the hardware address of the device named 'netdev_name' with the
 * SIOCGIFHWADDR ioctl on 'af_inet_sock' and copies its first ETH_ADDR_LEN
 * bytes into 'ea'.
 *
 * An ioctl failure is logged at error level.  An unexpected address family
 * is logged as a warning.
 *
 * NOTE(review): the return statements are not visible in this excerpt. */
4068 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
/* 'ifr' is zeroed first, so the name is null-terminated unless 'netdev_name'
 * fills the whole 'ifr_name' field (strncpy() does not terminate then). */
4073 memset(&ifr, 0, sizeof ifr);
4074 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4075 COVERAGE_INC(netdev_get_hwaddr);
4076 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4077 VLOG_ERR("ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4078 netdev_name, strerror(errno));
/* NOTE(review): the family is compared against both an AF_* constant and an
 * ARPHRD_* constant.  No early return is visible after the warning, so the
 * copy below appears to happen for unknown families too.  Confirm. */
4081 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4082 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4083 VLOG_WARN("%s device has unknown hardware address family %d",
4084 netdev_name, hwaddr_family);
4086 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware address of the device named 'netdev_name' to the
 * ETH_ADDR_LEN bytes in 'mac', tagged with address family 'hwaddr_family',
 * using the SIOCSIFHWADDR ioctl on 'af_inet_sock'.
 *
 * An ioctl failure is logged at error level.
 *
 * NOTE(review): the return statements are not visible in this excerpt. */
4091 set_etheraddr(const char *netdev_name, int hwaddr_family,
4092 const uint8_t mac[ETH_ADDR_LEN])
/* Start from a zeroed request so that fields not set below are zero. */
4096 memset(&ifr, 0, sizeof ifr);
4097 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4098 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4099 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4100 COVERAGE_INC(netdev_set_hwaddr);
4101 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4102 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4103 netdev_name, strerror(errno));
/* Issues an ethtool request for the device named 'name' through the
 * SIOCETHTOOL ioctl on 'af_inet_sock', with 'ecmd' as the request payload.
 * 'cmd_name' is used only in the log message.
 *
 * A failure with an errno other than EOPNOTSUPP is logged as a rate-limited
 * warning; EOPNOTSUPP is deliberately not logged.
 *
 * NOTE(review): how 'cmd' is applied to 'ecmd' and the return statements are
 * not visible in this excerpt. */
4110 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4111 int cmd, const char *cmd_name)
4115 memset(&ifr, 0, sizeof ifr);
4116 strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
/* The ioctl carries the ethtool structure by pointer in 'ifr_data'. */
4117 ifr.ifr_data = (caddr_t) ecmd;
4120 COVERAGE_INC(netdev_ethtool);
4121 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4124 if (errno != EOPNOTSUPP) {
4125 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4126 "failed: %s", cmd_name, name, strerror(errno));
4128 /* The device doesn't support this operation. That's pretty
4129 * common, so there's no point in logging anything. */
/* Copies 'name' into 'ifr->ifr_name' and issues ioctl 'cmd' on 'af_inet_sock'
 * with 'ifr' as its argument.  The caller fills in any other fields of 'ifr'
 * beforehand; only the name is written here.
 *
 * If the ioctl returns -1, a rate-limited debug message that includes
 * 'cmd_name' is logged.
 *
 * NOTE(review): the return statements are not visible in this excerpt. */
4136 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4137 const char *cmd_name)
4139 strncpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4140 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4141 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
/* Retrieves an IPv4 address associated with 'netdev' by issuing ioctl 'cmd'
 * (named 'cmd_name' for logging) through netdev_linux_do_ioctl(), then stores
 * the 'sin_addr' found in the request's address field in '*ip'.
 *
 * NOTE(review): the end of this function is not visible in this excerpt, so
 * the conditions under which '*ip' is written and the return value are not
 * documented here. */
4149 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4150 int cmd, const char *cmd_name)
/* Mark the request's address field as an IPv4 address before the ioctl. */
4155 ifr.ifr_addr.sa_family = AF_INET;
4156 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
/* The address field is reinterpreted as a sockaddr_in to reach 'sin_addr'. */
4158 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4159 *ip = sin->sin_addr;