2 * Copyright (c) 2009, 2010 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
21 #include <arpa/inet.h>
23 #include <linux/gen_stats.h>
24 #include <linux/if_tun.h>
26 #include <linux/types.h>
27 #include <linux/ethtool.h>
28 #include <linux/pkt_sched.h>
29 #include <linux/rtnetlink.h>
30 #include <linux/sockios.h>
31 #include <linux/version.h>
32 #include <sys/types.h>
33 #include <sys/ioctl.h>
34 #include <sys/socket.h>
35 #include <netpacket/packet.h>
36 #include <net/ethernet.h>
38 #include <linux/if_tunnel.h>
39 #include <net/if_arp.h>
40 #include <net/if_packet.h>
41 #include <net/route.h>
42 #include <netinet/in.h>
49 #include "dynamic-string.h"
50 #include "fatal-signal.h"
53 #include "netdev-provider.h"
54 #include "netdev-vport.h"
56 #include "netlink-socket.h"
58 #include "openflow/openflow.h"
60 #include "poll-loop.h"
61 #include "rtnetlink.h"
62 #include "socket-util.h"
67 VLOG_DEFINE_THIS_MODULE(netdev_linux);
69 COVERAGE_DEFINE(netdev_get_vlan_vid);
70 COVERAGE_DEFINE(netdev_set_policing);
71 COVERAGE_DEFINE(netdev_arp_lookup);
72 COVERAGE_DEFINE(netdev_get_ifindex);
73 COVERAGE_DEFINE(netdev_get_hwaddr);
74 COVERAGE_DEFINE(netdev_set_hwaddr);
75 COVERAGE_DEFINE(netdev_ethtool);
77 /* These were introduced in Linux 2.6.14, so they might be missing if we have
79 #ifndef ADVERTISED_Pause
80 #define ADVERTISED_Pause (1 << 13)
82 #ifndef ADVERTISED_Asym_Pause
83 #define ADVERTISED_Asym_Pause (1 << 14)
86 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
89 #define TC_RTAB_SIZE 1024
92 static struct rtnetlink_notifier netdev_linux_cache_notifier;
93 static int cache_notifier_refcount;
96 VALID_IFINDEX = 1 << 0,
97 VALID_ETHERADDR = 1 << 1,
101 VALID_CARRIER = 1 << 5,
102 VALID_IS_PSEUDO = 1 << 6, /* Represents is_internal and is_tap. */
103 VALID_POLICING = 1 << 7,
104 VALID_HAVE_VPORT_STATS = 1 << 8
112 /* Traffic control. */
114 /* An instance of a traffic control class. Always associated with a particular
117 * Each TC implementation subclasses this with whatever additional data it
120 const struct tc_ops *ops;
121 struct hmap queues; /* Contains "struct tc_queue"s.
122 * Read by generic TC layer.
123 * Written only by TC implementation. */
126 /* One traffic control queue.
128 * Each TC implementation subclasses this with whatever additional data it
131 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
132 unsigned int queue_id; /* OpenFlow queue ID. */
135 /* A particular kind of traffic control. Each implementation generally maps to
136 * one particular Linux qdisc class.
138 * The functions below return 0 if successful or a positive errno value on
139 * failure, except where otherwise noted. All of them must be provided, except
140 * where otherwise noted. */
142 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
143 * This is null for tc_ops_default and tc_ops_other, for which there are no
144 * appropriate values. */
145 const char *linux_name;
147 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
148 const char *ovs_name;
150 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
151 * queues. The queues are numbered 0 through n_queues - 1. */
152 unsigned int n_queues;
154 /* Called to install this TC class on 'netdev'. The implementation should
155 * make the Netlink calls required to set up 'netdev' with the right qdisc
156 * and configure it according to 'details'. The implementation may assume
157 * that the current qdisc is the default; that is, there is no need for it
158 * to delete the current qdisc before installing itself.
160 * The contents of 'details' should be documented as valid for 'ovs_name'
161 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
162 * (which is built as ovs-vswitchd.conf.db(8)).
164 * This function must return 0 if and only if it sets 'netdev->tc' to an
165 * initialized 'struct tc'.
167 * (This function is null for tc_ops_other, which cannot be installed. For
168 * other TC classes it should always be nonnull.) */
169 int (*tc_install)(struct netdev *netdev, const struct shash *details);
171 /* Called when the netdev code determines (through a Netlink query) that
172 * this TC class's qdisc is installed on 'netdev', but we didn't install
173 * it ourselves and so don't know any of the details.
175 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
176 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
177 * implementation should parse the other attributes of 'nlmsg' as
178 * necessary to determine its configuration. If necessary it should also
179 * use Netlink queries to determine the configuration of queues on
182 * This function must return 0 if and only if it sets 'netdev->tc' to an
183 * initialized 'struct tc'. */
184 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
186 /* Destroys the data structures allocated by the implementation as part of
187 * 'tc'. (This includes destroying 'tc->queues' by calling
190 * The implementation should not need to perform any Netlink calls. If
191 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
192 * (But it may not be desirable.)
194 * This function may be null if 'tc' is trivial. */
195 void (*tc_destroy)(struct tc *tc);
197 /* Retrieves details of 'netdev->tc' configuration into 'details'.
199 * The implementation should not need to perform any Netlink calls, because
200 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
201 * cached the configuration.
203 * The contents of 'details' should be documented as valid for 'ovs_name'
204 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
205 * (which is built as ovs-vswitchd.conf.db(8)).
207 * This function may be null if 'tc' is not configurable.
209 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
211 /* Reconfigures 'netdev->tc' according to 'details', performing any
212 * required Netlink calls to complete the reconfiguration.
214 * The contents of 'details' should be documented as valid for 'ovs_name'
215 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
216 * (which is built as ovs-vswitchd.conf.db(8)).
218 * This function may be null if 'tc' is not configurable.
220 int (*qdisc_set)(struct netdev *, const struct shash *details);
222 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
223 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
225 * The contents of 'details' should be documented as valid for 'ovs_name'
226 * in the "other_config" column in the "Queue" table in
227 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
229 * The implementation should not need to perform any Netlink calls, because
230 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
231 * cached the queue configuration.
233 * This function may be null if 'tc' does not have queues ('n_queues' is
235 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
236 struct shash *details);
238 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
239 * 'details', performing any required Netlink calls to complete the
240 * reconfiguration. The caller ensures that 'queue_id' is less than
243 * The contents of 'details' should be documented as valid for 'ovs_name'
244 * in the "other_config" column in the "Queue" table in
245 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
247 * This function may be null if 'tc' does not have queues or its queues are
248 * not configurable. */
249 int (*class_set)(struct netdev *, unsigned int queue_id,
250 const struct shash *details);
252 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
253 * tc_queue's within 'netdev->tc->queues'.
255 * This function may be null if 'tc' does not have queues or its queues
256 * cannot be deleted. */
257 int (*class_delete)(struct netdev *, struct tc_queue *queue);
259 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
260 * 'struct tc_queue's within 'netdev->tc->queues'.
262 * On success, initializes '*stats'.
264 * This function may be null if 'tc' does not have queues or if it cannot
265 * report queue statistics. */
266 int (*class_get_stats)(const struct netdev *netdev,
267 const struct tc_queue *queue,
268 struct netdev_queue_stats *stats);
270 /* Extracts queue stats from 'nlmsg', which is a response to a
271 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
273 * This function may be null if 'tc' does not have queues or if it cannot
274 * report queue statistics. */
275 int (*class_dump_stats)(const struct netdev *netdev,
276 const struct ofpbuf *nlmsg,
277 netdev_dump_queue_stats_cb *cb, void *aux);
281 tc_init(struct tc *tc, const struct tc_ops *ops)
284 hmap_init(&tc->queues);
288 tc_destroy(struct tc *tc)
290 hmap_destroy(&tc->queues);
293 static const struct tc_ops tc_ops_htb;
294 static const struct tc_ops tc_ops_hfsc;
295 static const struct tc_ops tc_ops_default;
296 static const struct tc_ops tc_ops_other;
298 static const struct tc_ops *tcs[] = {
299 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
300 &tc_ops_hfsc, /* Hierarchical fair service curve. */
301 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
302 &tc_ops_other, /* Some other qdisc. */
306 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
307 static unsigned int tc_get_major(unsigned int handle);
308 static unsigned int tc_get_minor(unsigned int handle);
310 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
311 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
312 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
314 static struct tcmsg *tc_make_request(const struct netdev *, int type,
315 unsigned int flags, struct ofpbuf *);
316 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
318 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
319 struct nlattr **options);
320 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
321 struct nlattr **options,
322 struct netdev_queue_stats *);
323 static int tc_query_class(const struct netdev *,
324 unsigned int handle, unsigned int parent,
325 struct ofpbuf **replyp);
326 static int tc_delete_class(const struct netdev *, unsigned int handle);
328 static int tc_del_qdisc(struct netdev *netdev);
329 static int tc_query_qdisc(const struct netdev *netdev);
331 static int tc_calc_cell_log(unsigned int mtu);
332 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
333 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
334 const struct tc_ratespec *rate);
335 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
337 struct netdev_dev_linux {
338 struct netdev_dev netdev_dev;
340 struct shash_node *shash_node;
341 unsigned int cache_valid;
343 /* The following are figured out "on demand" only. They are only valid
344 * when the corresponding VALID_* bit in 'cache_valid' is set. */
346 uint8_t etheraddr[ETH_ADDR_LEN];
347 struct in_addr address, netmask;
351 bool is_internal; /* Is this an openvswitch internal device? */
352 bool is_tap; /* Is this a tuntap device? */
353 uint32_t kbits_rate; /* Policing data. */
354 uint32_t kbits_burst;
355 bool have_vport_stats;
359 struct tap_state tap;
363 struct netdev_linux {
364 struct netdev netdev;
368 /* An AF_INET socket (used for ioctl operations). */
369 static int af_inet_sock = -1;
371 /* A Netlink routing socket that is not subscribed to any multicast groups. */
372 static struct nl_sock *rtnl_sock;
374 struct netdev_linux_notifier {
375 struct netdev_notifier notifier;
379 static struct shash netdev_linux_notifiers =
380 SHASH_INITIALIZER(&netdev_linux_notifiers);
381 static struct rtnetlink_notifier netdev_linux_poll_notifier;
383 /* This is set pretty low because we probably won't learn anything from the
384 * additional log messages. */
385 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
387 static int netdev_linux_init(void);
389 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
390 int cmd, const char *cmd_name);
391 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
392 const char *cmd_name);
393 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
394 int cmd, const char *cmd_name);
395 static int get_flags(const struct netdev *, int *flagsp);
396 static int set_flags(struct netdev *, int flags);
397 static int do_get_ifindex(const char *netdev_name);
398 static int get_ifindex(const struct netdev *, int *ifindexp);
399 static int do_set_addr(struct netdev *netdev,
400 int ioctl_nr, const char *ioctl_name,
401 struct in_addr addr);
402 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
403 static int set_etheraddr(const char *netdev_name, int hwaddr_family,
404 const uint8_t[ETH_ADDR_LEN]);
405 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
406 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
409 is_netdev_linux_class(const struct netdev_class *netdev_class)
411 return netdev_class->init == netdev_linux_init;
414 static struct netdev_dev_linux *
415 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
417 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
418 assert(is_netdev_linux_class(netdev_class));
420 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
423 static struct netdev_linux *
424 netdev_linux_cast(const struct netdev *netdev)
426 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
427 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
428 assert(is_netdev_linux_class(netdev_class));
430 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
434 netdev_linux_init(void)
436 static int status = -1;
438 /* Create AF_INET socket. */
439 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
440 status = af_inet_sock >= 0 ? 0 : errno;
442 VLOG_ERR("failed to create inet socket: %s", strerror(status));
445 /* Create rtnetlink socket. */
447 status = nl_sock_create(NETLINK_ROUTE, 0, 0, 0, &rtnl_sock);
449 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
458 netdev_linux_run(void)
460 rtnetlink_notifier_run();
464 netdev_linux_wait(void)
466 rtnetlink_notifier_wait();
470 netdev_linux_cache_cb(const struct rtnetlink_change *change,
471 void *aux OVS_UNUSED)
473 struct netdev_dev_linux *dev;
475 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
477 const struct netdev_class *netdev_class =
478 netdev_dev_get_class(base_dev);
480 if (is_netdev_linux_class(netdev_class)) {
481 dev = netdev_dev_linux_cast(base_dev);
482 dev->cache_valid = 0;
486 struct shash device_shash;
487 struct shash_node *node;
489 shash_init(&device_shash);
490 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
491 SHASH_FOR_EACH (node, &device_shash) {
493 dev->cache_valid = 0;
495 shash_destroy(&device_shash);
499 /* Creates system and internal devices. */
501 netdev_linux_create(const struct netdev_class *class,
502 const char *name, const struct shash *args,
503 struct netdev_dev **netdev_devp)
505 struct netdev_dev_linux *netdev_dev;
508 if (!shash_is_empty(args)) {
509 VLOG_WARN("%s: arguments for %s devices should be empty",
513 if (!cache_notifier_refcount) {
514 error = rtnetlink_notifier_register(&netdev_linux_cache_notifier,
515 netdev_linux_cache_cb, NULL);
520 cache_notifier_refcount++;
522 netdev_dev = xzalloc(sizeof *netdev_dev);
523 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
525 *netdev_devp = &netdev_dev->netdev_dev;
529 /* For most types of netdevs we open the device for each call of
530 * netdev_open(). However, this is not the case with tap devices,
531 * since it is only possible to open the device once. In this
532 * situation we share a single file descriptor, and consequently
533 * buffers, across all readers. Therefore once data is read it will
534 * be unavailable to other reads for tap devices. */
536 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
537 const char *name, const struct shash *args,
538 struct netdev_dev **netdev_devp)
540 struct netdev_dev_linux *netdev_dev;
541 struct tap_state *state;
542 static const char tap_dev[] = "/dev/net/tun";
546 if (!shash_is_empty(args)) {
547 VLOG_WARN("%s: arguments for TAP devices should be empty", name);
550 netdev_dev = xzalloc(sizeof *netdev_dev);
551 state = &netdev_dev->state.tap;
553 /* Open tap device. */
554 state->fd = open(tap_dev, O_RDWR);
557 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
561 /* Create tap device. */
562 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
563 strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
564 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
565 VLOG_WARN("%s: creating tap device failed: %s", name,
571 /* Make non-blocking. */
572 error = set_nonblocking(state->fd);
577 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
578 *netdev_devp = &netdev_dev->netdev_dev;
587 destroy_tap(struct netdev_dev_linux *netdev_dev)
589 struct tap_state *state = &netdev_dev->state.tap;
591 if (state->fd >= 0) {
596 /* Destroys the netdev device 'netdev_dev_'. Devices made by netdev_linux_create() (system and internal) each hold one reference on the cache notifier, which is dropped here. */
598 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
600 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
601 const char *type = netdev_dev_get_type(netdev_dev_);
603 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
604 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
607 if (!strcmp(type, "system") || netdev_dev_get_class(netdev_dev_) == &netdev_internal_class) {
608 cache_notifier_refcount--;
610 if (!cache_notifier_refcount) {
611 rtnetlink_notifier_unregister(&netdev_linux_cache_notifier);
613 } else if (!strcmp(type, "tap")) {
614 destroy_tap(netdev_dev);
621 netdev_linux_open(struct netdev_dev *netdev_dev_, int ethertype,
622 struct netdev **netdevp)
624 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
625 struct netdev_linux *netdev;
626 enum netdev_flags flags;
629 /* Allocate network device. */
630 netdev = xzalloc(sizeof *netdev);
632 netdev_init(&netdev->netdev, netdev_dev_);
634 /* Verify that the device really exists, by attempting to read its flags.
635 * (The flags might be cached, in which case this won't actually do an
638 * Don't do this for "internal" netdevs, though, because those have to be
639 * created as netdev objects before they exist in the kernel, because
640 * creating them in the kernel happens by passing a netdev object to
641 * dpif_port_add(). */
642 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
643 error = netdev_get_flags(&netdev->netdev, &flags);
644 if (error == ENODEV) {
649 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
650 !netdev_dev->state.tap.opened) {
652 /* We assume that the first user of the tap device is the primary user
653 * and give them the tap FD. Subsequent users probably just expect
654 * this to be a system device so open it normally to avoid send/receive
655 * directions appearing to be reversed. */
656 netdev->fd = netdev_dev->state.tap.fd;
657 netdev_dev->state.tap.opened = true;
658 } else if (ethertype != NETDEV_ETH_TYPE_NONE) {
659 struct sockaddr_ll sll;
663 /* Create file descriptor. */
664 protocol = (ethertype == NETDEV_ETH_TYPE_ANY ? ETH_P_ALL
665 : ethertype == NETDEV_ETH_TYPE_802_2 ? ETH_P_802_2
667 netdev->fd = socket(PF_PACKET, SOCK_RAW, htons(protocol));
668 if (netdev->fd < 0) {
673 /* Set non-blocking mode. */
674 error = set_nonblocking(netdev->fd);
679 /* Get ethernet device index. */
680 error = get_ifindex(&netdev->netdev, &ifindex);
685 /* Bind to specific ethernet device. */
686 memset(&sll, 0, sizeof sll);
687 sll.sll_family = AF_PACKET;
688 sll.sll_ifindex = ifindex;
690 (struct sockaddr *) &sll, sizeof sll) < 0) {
692 VLOG_ERR("bind to %s failed: %s", netdev_dev_get_name(netdev_dev_),
697 /* Between the socket() and bind() calls above, the socket receives all
698 * packets of the requested type on all system interfaces. We do not
699 * want to receive that data, but there is no way to avoid it. So we
700 * must now drain out the receive queue. */
701 error = drain_rcvbuf(netdev->fd);
707 *netdevp = &netdev->netdev;
711 netdev_uninit(&netdev->netdev, true);
715 /* Closes and destroys 'netdev'. A negative 'fd' means no descriptor was opened; 0 is a valid descriptor. */
717 netdev_linux_close(struct netdev *netdev_)
719 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
721 if (netdev->fd >= 0 && strcmp(netdev_get_type(netdev_), "tap")) {
727 /* Initializes 'svec' with a list of the names of all known network devices. */
729 netdev_linux_enumerate(struct svec *svec)
731 struct if_nameindex *names;
733 names = if_nameindex();
737 for (i = 0; names[i].if_name != NULL; i++) {
738 svec_add(svec, names[i].if_name);
740 if_freenameindex(names);
743 VLOG_WARN("could not obtain list of network device names: %s",
750 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
752 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
754 if (netdev->fd < 0) {
755 /* Device was opened with NETDEV_ETH_TYPE_NONE. */
760 ssize_t retval = read(netdev->fd, data, size);
763 } else if (errno != EINTR) {
764 if (errno != EAGAIN) {
765 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
766 netdev_get_name(netdev_), strerror(errno));
773 /* Registers with the poll loop to wake up from the next call to poll_block()
774 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
776 netdev_linux_recv_wait(struct netdev *netdev_)
778 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
779 if (netdev->fd >= 0) {
780 poll_fd_wait(netdev->fd, POLLIN);
784 /* Discards all packets waiting to be received from 'netdev'. */
786 netdev_linux_drain(struct netdev *netdev_)
788 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
789 if (netdev->fd < 0) {
791 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
793 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
794 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
798 drain_fd(netdev->fd, ifr.ifr_qlen);
801 return drain_rcvbuf(netdev->fd);
805 /* Sends 'data' on 'netdev'. Returns 0 if successful, otherwise a positive
806 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
807 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
808 * the packet is too big or too small to transmit on the device.
810 * The caller retains ownership of 'data' in all cases.
812 * The kernel maintains a packet transmission queue, so the caller is not
813 * expected to do additional queuing of packets. */
815 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
817 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
819 /* XXX should support sending even if 'ethertype' was NETDEV_ETH_TYPE_NONE.
821 if (netdev->fd < 0) {
826 ssize_t retval = write(netdev->fd, data, size);
828 /* The Linux AF_PACKET implementation never blocks waiting for room
829 * for packets, instead returning ENOBUFS. Translate this into
830 * EAGAIN for the caller. */
831 if (errno == ENOBUFS) {
833 } else if (errno == EINTR) {
835 } else if (errno != EAGAIN) {
836 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
837 netdev_get_name(netdev_), strerror(errno));
840 } else if (retval != size) {
841 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
842 "%zu) on %s", retval, size, netdev_get_name(netdev_));
850 /* Registers with the poll loop to wake up from the next call to poll_block()
851 * when the packet transmission queue has sufficient room to transmit a packet
852 * with netdev_send().
854 * The kernel maintains a packet transmission queue, so the client is not
855 * expected to do additional queuing of packets. Thus, this function is
856 * unlikely to ever be used. It is included for completeness. */
858 netdev_linux_send_wait(struct netdev *netdev_)
860 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
861 if (netdev->fd < 0) {
863 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
864 poll_fd_wait(netdev->fd, POLLOUT);
866 /* TAP device always accepts packets.*/
867 poll_immediate_wake();
871 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
872 * otherwise a positive errno value. */
874 netdev_linux_set_etheraddr(struct netdev *netdev_,
875 const uint8_t mac[ETH_ADDR_LEN])
877 struct netdev_dev_linux *netdev_dev =
878 netdev_dev_linux_cast(netdev_get_dev(netdev_));
881 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
882 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
883 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
885 netdev_dev->cache_valid |= VALID_ETHERADDR;
886 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
894 /* Copies 'netdev''s MAC address into 'mac', which the caller provides and
895 * which must have room for ETH_ADDR_LEN bytes. */
897 netdev_linux_get_etheraddr(const struct netdev *netdev_,
898 uint8_t mac[ETH_ADDR_LEN])
900 struct netdev_dev_linux *netdev_dev =
901 netdev_dev_linux_cast(netdev_get_dev(netdev_));
902 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
903 int error = get_etheraddr(netdev_get_name(netdev_),
904 netdev_dev->etheraddr);
908 netdev_dev->cache_valid |= VALID_ETHERADDR;
910 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
914 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
915 * in bytes, not including the hardware header; thus, this is typically 1500
916 * bytes for Ethernet devices. */
918 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
920 struct netdev_dev_linux *netdev_dev =
921 netdev_dev_linux_cast(netdev_get_dev(netdev_));
922 if (!(netdev_dev->cache_valid & VALID_MTU)) {
926 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
927 SIOCGIFMTU, "SIOCGIFMTU");
931 netdev_dev->mtu = ifr.ifr_mtu;
932 netdev_dev->cache_valid |= VALID_MTU;
934 *mtup = netdev_dev->mtu;
938 /* Returns the ifindex of 'netdev', if successful, as a positive number.
939 * On failure, returns a negative errno value. */
941 netdev_linux_get_ifindex(const struct netdev *netdev)
945 error = get_ifindex(netdev, &ifindex);
946 return error ? -error : ifindex;
950 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
952 struct netdev_dev_linux *netdev_dev =
953 netdev_dev_linux_cast(netdev_get_dev(netdev_));
958 if (!(netdev_dev->cache_valid & VALID_CARRIER)) {
962 fn = xasprintf("/sys/class/net/%s/carrier",
963 netdev_get_name(netdev_));
964 fd = open(fn, O_RDONLY);
967 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
971 retval = read(fd, line, sizeof line);
974 if (error == EINVAL) {
975 /* This is the normal return value when we try to check carrier
976 * if the network device is not up. */
978 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
981 } else if (retval == 0) {
983 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
987 if (line[0] != '0' && line[0] != '1') {
989 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)",
993 netdev_dev->carrier = line[0] != '0';
994 netdev_dev->cache_valid |= VALID_CARRIER;
996 *carrier = netdev_dev->carrier;
1007 /* Check whether we can use RTM_GETLINK to get network device statistics.
1008 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1011 check_for_working_netlink_stats(void)
1013 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1014 * preferable, so if that works, we'll use it. */
1015 int ifindex = do_get_ifindex("lo");
1017 VLOG_WARN("failed to get ifindex for lo, "
1018 "obtaining netdev stats from proc");
1021 struct netdev_stats stats;
1022 int error = get_stats_via_netlink(ifindex, &stats);
1024 VLOG_DBG("obtaining netdev stats via rtnetlink");
1027 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1028 "via proc (you are probably running a pre-2.6.19 "
1029 "kernel)", strerror(error));
1035 /* Brings the 'is_internal' and 'is_tap' members of 'netdev_dev' up-to-date. */
1037 netdev_linux_update_is_pseudo(struct netdev_dev_linux *netdev_dev)
1039 if (!(netdev_dev->cache_valid & VALID_IS_PSEUDO)) {
1040 const char *name = netdev_dev_get_name(&netdev_dev->netdev_dev);
1041 const char *type = netdev_dev_get_type(&netdev_dev->netdev_dev);
1043 netdev_dev->is_tap = !strcmp(type, "tap");
1044 netdev_dev->is_internal = false;
1045 if (!netdev_dev->is_tap) {
1046 struct ethtool_drvinfo drvinfo;
1049 memset(&drvinfo, 0, sizeof drvinfo);
1050 error = netdev_linux_do_ethtool(name,
1051 (struct ethtool_cmd *)&drvinfo,
1053 "ETHTOOL_GDRVINFO");
1055 if (!error && !strcmp(drvinfo.driver, "openvswitch")) {
1056 netdev_dev->is_internal = true;
1060 netdev_dev->cache_valid |= VALID_IS_PSEUDO;
1065 swap_uint64(uint64_t *a, uint64_t *b)
1072 /* Retrieves current device stats for 'netdev'. */
1074 netdev_linux_get_stats(const struct netdev *netdev_,
1075 struct netdev_stats *stats)
1077 struct netdev_dev_linux *netdev_dev =
1078 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1079 static int use_netlink_stats = -1;
1082 if (netdev_dev->have_vport_stats ||
1083 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1085 error = netdev_vport_get_stats(netdev_, stats);
1086 netdev_dev->have_vport_stats = !error;
1087 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
1090 if (!netdev_dev->have_vport_stats) {
1091 if (use_netlink_stats < 0) {
1092 use_netlink_stats = check_for_working_netlink_stats();
1094 if (use_netlink_stats) {
1097 error = get_ifindex(netdev_, &ifindex);
1099 error = get_stats_via_netlink(ifindex, stats);
1102 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1106 /* If this port is an internal port then the transmit and receive stats
1107 * will appear to be swapped relative to the other ports since we are the
1108 * one sending the data, not a remote computer. For consistency, we swap
1109 * them back here. This does not apply if we are getting stats from the
1110 * vport layer because it always tracks stats from the perspective of the
1112 netdev_linux_update_is_pseudo(netdev_dev);
1113 if (!error && !netdev_dev->have_vport_stats &&
1114 (netdev_dev->is_internal || netdev_dev->is_tap)) {
1115 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1116 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1117 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1118 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1119 stats->rx_length_errors = 0;
1120 stats->rx_over_errors = 0;
1121 stats->rx_crc_errors = 0;
1122 stats->rx_frame_errors = 0;
1123 stats->rx_fifo_errors = 0;
1124 stats->rx_missed_errors = 0;
1125 stats->tx_aborted_errors = 0;
1126 stats->tx_carrier_errors = 0;
1127 stats->tx_fifo_errors = 0;
1128 stats->tx_heartbeat_errors = 0;
1129 stats->tx_window_errors = 0;
1135 /* Stores the features supported by 'netdev' into each of '*current',
1136 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1137 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
1138 * successful, otherwise a positive errno value. */
1140 netdev_linux_get_features(struct netdev *netdev,
1141 uint32_t *current, uint32_t *advertised,
1142 uint32_t *supported, uint32_t *peer)
1144 struct ethtool_cmd ecmd;
1147 memset(&ecmd, 0, sizeof ecmd);
1148 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1149 ETHTOOL_GSET, "ETHTOOL_GSET");
1154 /* Supported features. */
1156 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1157 *supported |= OFPPF_10MB_HD;
1159 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1160 *supported |= OFPPF_10MB_FD;
1162 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1163 *supported |= OFPPF_100MB_HD;
1165 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1166 *supported |= OFPPF_100MB_FD;
1168 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1169 *supported |= OFPPF_1GB_HD;
1171 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1172 *supported |= OFPPF_1GB_FD;
1174 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1175 *supported |= OFPPF_10GB_FD;
1177 if (ecmd.supported & SUPPORTED_TP) {
1178 *supported |= OFPPF_COPPER;
1180 if (ecmd.supported & SUPPORTED_FIBRE) {
1181 *supported |= OFPPF_FIBER;
1183 if (ecmd.supported & SUPPORTED_Autoneg) {
1184 *supported |= OFPPF_AUTONEG;
1186 if (ecmd.supported & SUPPORTED_Pause) {
1187 *supported |= OFPPF_PAUSE;
1189 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1190 *supported |= OFPPF_PAUSE_ASYM;
1193 /* Advertised features. */
1195 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1196 *advertised |= OFPPF_10MB_HD;
1198 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1199 *advertised |= OFPPF_10MB_FD;
1201 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1202 *advertised |= OFPPF_100MB_HD;
1204 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1205 *advertised |= OFPPF_100MB_FD;
1207 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1208 *advertised |= OFPPF_1GB_HD;
1210 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1211 *advertised |= OFPPF_1GB_FD;
1213 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1214 *advertised |= OFPPF_10GB_FD;
1216 if (ecmd.advertising & ADVERTISED_TP) {
1217 *advertised |= OFPPF_COPPER;
1219 if (ecmd.advertising & ADVERTISED_FIBRE) {
1220 *advertised |= OFPPF_FIBER;
1222 if (ecmd.advertising & ADVERTISED_Autoneg) {
1223 *advertised |= OFPPF_AUTONEG;
1225 if (ecmd.advertising & ADVERTISED_Pause) {
1226 *advertised |= OFPPF_PAUSE;
1228 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1229 *advertised |= OFPPF_PAUSE_ASYM;
1232 /* Current settings. */
1233 if (ecmd.speed == SPEED_10) {
1234 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1235 } else if (ecmd.speed == SPEED_100) {
1236 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1237 } else if (ecmd.speed == SPEED_1000) {
1238 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1239 } else if (ecmd.speed == SPEED_10000) {
1240 *current = OFPPF_10GB_FD;
1245 if (ecmd.port == PORT_TP) {
1246 *current |= OFPPF_COPPER;
1247 } else if (ecmd.port == PORT_FIBRE) {
1248 *current |= OFPPF_FIBER;
1252 *current |= OFPPF_AUTONEG;
1255 /* Peer advertisements. */
1256 *peer = 0; /* XXX */
1261 /* Set the features advertised by 'netdev' to 'advertise'. */
1263 netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1265 struct ethtool_cmd ecmd;
1268 memset(&ecmd, 0, sizeof ecmd);
1269 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1270 ETHTOOL_GSET, "ETHTOOL_GSET");
1275 ecmd.advertising = 0;
1276 if (advertise & OFPPF_10MB_HD) {
1277 ecmd.advertising |= ADVERTISED_10baseT_Half;
1279 if (advertise & OFPPF_10MB_FD) {
1280 ecmd.advertising |= ADVERTISED_10baseT_Full;
1282 if (advertise & OFPPF_100MB_HD) {
1283 ecmd.advertising |= ADVERTISED_100baseT_Half;
1285 if (advertise & OFPPF_100MB_FD) {
1286 ecmd.advertising |= ADVERTISED_100baseT_Full;
1288 if (advertise & OFPPF_1GB_HD) {
1289 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1291 if (advertise & OFPPF_1GB_FD) {
1292 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1294 if (advertise & OFPPF_10GB_FD) {
1295 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1297 if (advertise & OFPPF_COPPER) {
1298 ecmd.advertising |= ADVERTISED_TP;
1300 if (advertise & OFPPF_FIBER) {
1301 ecmd.advertising |= ADVERTISED_FIBRE;
1303 if (advertise & OFPPF_AUTONEG) {
1304 ecmd.advertising |= ADVERTISED_Autoneg;
1306 if (advertise & OFPPF_PAUSE) {
1307 ecmd.advertising |= ADVERTISED_Pause;
1309 if (advertise & OFPPF_PAUSE_ASYM) {
1310 ecmd.advertising |= ADVERTISED_Asym_Pause;
1312 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1313 ETHTOOL_SSET, "ETHTOOL_SSET");
1316 /* If 'netdev_name' is the name of a VLAN network device (e.g. one created with
1317 * vconfig(8)), sets '*vlan_vid' to the VLAN VID associated with that device
1318 * and returns 0. Otherwise returns a errno value (specifically ENOENT if
1319 * 'netdev_name' is the name of a network device that is not a VLAN device) and
1320 * sets '*vlan_vid' to -1. */
1322 netdev_linux_get_vlan_vid(const struct netdev *netdev, int *vlan_vid)
1324 const char *netdev_name = netdev_get_name(netdev);
1325 struct ds line = DS_EMPTY_INITIALIZER;
1326 FILE *stream = NULL;
1330 COVERAGE_INC(netdev_get_vlan_vid);
1331 fn = xasprintf("/proc/net/vlan/%s", netdev_name);
1332 stream = fopen(fn, "r");
1338 if (ds_get_line(&line, stream)) {
1339 if (ferror(stream)) {
1341 VLOG_ERR_RL(&rl, "error reading \"%s\": %s", fn, strerror(errno));
1344 VLOG_ERR_RL(&rl, "unexpected end of file reading \"%s\"", fn);
1349 if (!sscanf(ds_cstr(&line), "%*s VID: %d", vlan_vid)) {
1351 VLOG_ERR_RL(&rl, "parse error reading \"%s\" line 1: \"%s\"",
1352 fn, ds_cstr(&line));
/* Shell command templates used to configure ingress policing with tc(8).
 *
 * POLICE_ADD_CMD takes the device name.  POLICE_CONFIG_CMD takes the device
 * name, the rate in kbit/s, and the burst size. */
#define POLICE_ADD_CMD \
    "/sbin/tc qdisc add dev %s handle ffff: ingress"
#define POLICE_CONFIG_CMD \
    "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 " \
    "u32 match ip src 0.0.0.0/0 " \
    "police rate %dkbit burst %dk mtu 65535 drop flowid :1"
1373 /* Remove ingress policing from 'netdev'. Returns 0 if successful, otherwise a
1374 * positive errno value.
1376 * This function is equivalent to running
1377 * /sbin/tc qdisc del dev %s handle ffff: ingress
1378 * but it is much, much faster.
1381 netdev_linux_remove_policing(struct netdev *netdev)
1383 struct netdev_dev_linux *netdev_dev =
1384 netdev_dev_linux_cast(netdev_get_dev(netdev));
1385 const char *netdev_name = netdev_get_name(netdev);
1387 struct ofpbuf request;
1388 struct tcmsg *tcmsg;
1391 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
1395 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
1396 tcmsg->tcm_parent = TC_H_INGRESS;
1397 nl_msg_put_string(&request, TCA_KIND, "ingress");
1398 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
1400 error = tc_transact(&request, NULL);
1401 if (error && error != ENOENT && error != EINVAL) {
1402 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1403 netdev_name, strerror(error));
1407 netdev_dev->kbits_rate = 0;
1408 netdev_dev->kbits_burst = 0;
1409 netdev_dev->cache_valid |= VALID_POLICING;
1413 /* Attempts to set input rate limiting (policing) policy. */
1415 netdev_linux_set_policing(struct netdev *netdev,
1416 uint32_t kbits_rate, uint32_t kbits_burst)
1418 struct netdev_dev_linux *netdev_dev =
1419 netdev_dev_linux_cast(netdev_get_dev(netdev));
1420 const char *netdev_name = netdev_get_name(netdev);
1423 COVERAGE_INC(netdev_set_policing);
1425 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1426 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1427 : kbits_burst); /* Stick with user-specified value. */
1429 if (netdev_dev->cache_valid & VALID_POLICING
1430 && netdev_dev->kbits_rate == kbits_rate
1431 && netdev_dev->kbits_burst == kbits_burst) {
1432 /* Assume that settings haven't changed since we last set them. */
1436 netdev_linux_remove_policing(netdev);
1438 snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
1439 if (system(command) != 0) {
1440 VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
1444 snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
1445 kbits_rate, kbits_burst);
1446 if (system(command) != 0) {
1447 VLOG_WARN_RL(&rl, "%s: problem configuring policing",
1452 netdev_dev->kbits_rate = kbits_rate;
1453 netdev_dev->kbits_burst = kbits_burst;
1454 netdev_dev->cache_valid |= VALID_POLICING;
1461 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1464 const struct tc_ops **opsp;
1466 for (opsp = tcs; *opsp != NULL; opsp++) {
1467 const struct tc_ops *ops = *opsp;
1468 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1469 svec_add(types, ops->ovs_name);
1475 static const struct tc_ops *
1476 tc_lookup_ovs_name(const char *name)
1478 const struct tc_ops **opsp;
1480 for (opsp = tcs; *opsp != NULL; opsp++) {
1481 const struct tc_ops *ops = *opsp;
1482 if (!strcmp(name, ops->ovs_name)) {
1489 static const struct tc_ops *
1490 tc_lookup_linux_name(const char *name)
1492 const struct tc_ops **opsp;
1494 for (opsp = tcs; *opsp != NULL; opsp++) {
1495 const struct tc_ops *ops = *opsp;
1496 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1503 static struct tc_queue *
1504 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1507 struct netdev_dev_linux *netdev_dev =
1508 netdev_dev_linux_cast(netdev_get_dev(netdev));
1509 struct tc_queue *queue;
1511 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1512 if (queue->queue_id == queue_id) {
/* Returns the queue with ID 'queue_id' on 'netdev', or NULL if there is
 * none.  The hash is hash_int(queue_id, 0). */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    size_t hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, hash);
}
1526 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1528 struct netdev_qos_capabilities *caps)
1530 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1534 caps->n_queues = ops->n_queues;
1539 netdev_linux_get_qos(const struct netdev *netdev,
1540 const char **typep, struct shash *details)
1542 struct netdev_dev_linux *netdev_dev =
1543 netdev_dev_linux_cast(netdev_get_dev(netdev));
1546 error = tc_query_qdisc(netdev);
1551 *typep = netdev_dev->tc->ops->ovs_name;
1552 return (netdev_dev->tc->ops->qdisc_get
1553 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
1558 netdev_linux_set_qos(struct netdev *netdev,
1559 const char *type, const struct shash *details)
1561 struct netdev_dev_linux *netdev_dev =
1562 netdev_dev_linux_cast(netdev_get_dev(netdev));
1563 const struct tc_ops *new_ops;
1566 new_ops = tc_lookup_ovs_name(type);
1567 if (!new_ops || !new_ops->tc_install) {
1571 error = tc_query_qdisc(netdev);
1576 if (new_ops == netdev_dev->tc->ops) {
1577 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1579 /* Delete existing qdisc. */
1580 error = tc_del_qdisc(netdev);
1584 assert(netdev_dev->tc == NULL);
1586 /* Install new qdisc. */
1587 error = new_ops->tc_install(netdev, details);
1588 assert((error == 0) == (netdev_dev->tc != NULL));
1595 netdev_linux_get_queue(const struct netdev *netdev,
1596 unsigned int queue_id, struct shash *details)
1598 struct netdev_dev_linux *netdev_dev =
1599 netdev_dev_linux_cast(netdev_get_dev(netdev));
1602 error = tc_query_qdisc(netdev);
1606 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1608 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
1614 netdev_linux_set_queue(struct netdev *netdev,
1615 unsigned int queue_id, const struct shash *details)
1617 struct netdev_dev_linux *netdev_dev =
1618 netdev_dev_linux_cast(netdev_get_dev(netdev));
1621 error = tc_query_qdisc(netdev);
1624 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1625 || !netdev_dev->tc->ops->class_set) {
1629 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
1633 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1635 struct netdev_dev_linux *netdev_dev =
1636 netdev_dev_linux_cast(netdev_get_dev(netdev));
1639 error = tc_query_qdisc(netdev);
1642 } else if (!netdev_dev->tc->ops->class_delete) {
1645 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1647 ? netdev_dev->tc->ops->class_delete(netdev, queue)
1653 netdev_linux_get_queue_stats(const struct netdev *netdev,
1654 unsigned int queue_id,
1655 struct netdev_queue_stats *stats)
1657 struct netdev_dev_linux *netdev_dev =
1658 netdev_dev_linux_cast(netdev_get_dev(netdev));
1661 error = tc_query_qdisc(netdev);
1664 } else if (!netdev_dev->tc->ops->class_get_stats) {
1667 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1669 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
1675 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1677 struct ofpbuf request;
1678 struct tcmsg *tcmsg;
1680 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1684 tcmsg->tcm_parent = 0;
1685 nl_dump_start(dump, rtnl_sock, &request);
1686 ofpbuf_uninit(&request);
1691 netdev_linux_dump_queues(const struct netdev *netdev,
1692 netdev_dump_queues_cb *cb, void *aux)
1694 struct netdev_dev_linux *netdev_dev =
1695 netdev_dev_linux_cast(netdev_get_dev(netdev));
1696 struct tc_queue *queue;
1697 struct shash details;
1701 error = tc_query_qdisc(netdev);
1704 } else if (!netdev_dev->tc->ops->class_get) {
1709 shash_init(&details);
1710 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
1711 shash_clear(&details);
1713 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
1715 (*cb)(queue->queue_id, &details, aux);
1720 shash_destroy(&details);
1726 netdev_linux_dump_queue_stats(const struct netdev *netdev,
1727 netdev_dump_queue_stats_cb *cb, void *aux)
1729 struct netdev_dev_linux *netdev_dev =
1730 netdev_dev_linux_cast(netdev_get_dev(netdev));
1731 struct nl_dump dump;
1736 error = tc_query_qdisc(netdev);
1739 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1744 if (!start_queue_dump(netdev, &dump)) {
1747 while (nl_dump_next(&dump, &msg)) {
1748 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1754 error = nl_dump_done(&dump);
1755 return error ? error : last_error;
1759 netdev_linux_get_in4(const struct netdev *netdev_,
1760 struct in_addr *address, struct in_addr *netmask)
1762 struct netdev_dev_linux *netdev_dev =
1763 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1765 if (!(netdev_dev->cache_valid & VALID_IN4)) {
1768 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
1769 SIOCGIFADDR, "SIOCGIFADDR");
1774 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
1775 SIOCGIFNETMASK, "SIOCGIFNETMASK");
1780 netdev_dev->cache_valid |= VALID_IN4;
1782 *address = netdev_dev->address;
1783 *netmask = netdev_dev->netmask;
1784 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
1788 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
1789 struct in_addr netmask)
1791 struct netdev_dev_linux *netdev_dev =
1792 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1795 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
1797 netdev_dev->cache_valid |= VALID_IN4;
1798 netdev_dev->address = address;
1799 netdev_dev->netmask = netmask;
1800 if (address.s_addr != INADDR_ANY) {
1801 error = do_set_addr(netdev_, SIOCSIFNETMASK,
1802 "SIOCSIFNETMASK", netmask);
/* Parses 'line', one line in the format of /proc/net/if_inet6: 32 hex digits
 * of IPv6 address, four hex fields that are skipped, and an interface name.
 *
 * Stores the 16 address bytes in '*in6' and up to 16 characters of the
 * interface name in 'ifname'.  Returns true only if all 17 fields were
 * converted. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *b = in6->s6_addr;
    int n_fields;

#define X8 "%2"SCNx8
    n_fields = sscanf(line,
                      " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                      "%*x %*x %*x %*x %16s\n",
                      b + 0, b + 1, b + 2, b + 3,
                      b + 4, b + 5, b + 6, b + 7,
                      b + 8, b + 9, b + 10, b + 11,
                      b + 12, b + 13, b + 14, b + 15,
                      ifname);
    return n_fields == 17;
}
1824 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
1825 * 'in6' is non-null) and returns true. Otherwise, returns false. */
1827 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
1829 struct netdev_dev_linux *netdev_dev =
1830 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1831 if (!(netdev_dev->cache_valid & VALID_IN6)) {
1835 netdev_dev->in6 = in6addr_any;
1837 file = fopen("/proc/net/if_inet6", "r");
1839 const char *name = netdev_get_name(netdev_);
1840 while (fgets(line, sizeof line, file)) {
1841 struct in6_addr in6_tmp;
1842 char ifname[16 + 1];
1843 if (parse_if_inet6_line(line, &in6_tmp, ifname)
1844 && !strcmp(name, ifname))
1846 netdev_dev->in6 = in6_tmp;
1852 netdev_dev->cache_valid |= VALID_IN6;
1854 *in6 = netdev_dev->in6;
/* Fills '*sa' with an AF_INET socket address for 'addr' with port 0.  All
 * remaining bytes of '*sa' are zeroed. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    memset(&in4, 0, sizeof in4);
    in4.sin_family = AF_INET;
    in4.sin_port = 0;
    in4.sin_addr = addr;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &in4, sizeof in4);
}
1872 do_set_addr(struct netdev *netdev,
1873 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
1876 strncpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
1877 make_in4_sockaddr(&ifr.ifr_addr, addr);
1879 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
1883 /* Adds 'router' as a default IP gateway. */
1885 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
1887 struct in_addr any = { INADDR_ANY };
1891 memset(&rt, 0, sizeof rt);
1892 make_in4_sockaddr(&rt.rt_dst, any);
1893 make_in4_sockaddr(&rt.rt_gateway, router);
1894 make_in4_sockaddr(&rt.rt_genmask, any);
1895 rt.rt_flags = RTF_UP | RTF_GATEWAY;
1896 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
1898 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
1904 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
1907 static const char fn[] = "/proc/net/route";
1912 *netdev_name = NULL;
1913 stream = fopen(fn, "r");
1914 if (stream == NULL) {
1915 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
1920 while (fgets(line, sizeof line, stream)) {
1923 uint32_t dest, gateway, mask;
1924 int refcnt, metric, mtu;
1925 unsigned int flags, use, window, irtt;
1928 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
1930 iface, &dest, &gateway, &flags, &refcnt,
1931 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
1933 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
1937 if (!(flags & RTF_UP)) {
1938 /* Skip routes that aren't up. */
1942 /* The output of 'dest', 'mask', and 'gateway' were given in
1943 * network byte order, so we don't need need any endian
1944 * conversions here. */
1945 if ((dest & mask) == (host->s_addr & mask)) {
1947 /* The host is directly reachable. */
1948 next_hop->s_addr = 0;
1950 /* To reach the host, we must go through a gateway. */
1951 next_hop->s_addr = gateway;
1953 *netdev_name = xstrdup(iface);
1964 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
1965 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
1966 * returns 0. Otherwise, it returns a positive errno value; in particular,
1967 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
1969 netdev_linux_arp_lookup(const struct netdev *netdev,
1970 uint32_t ip, uint8_t mac[ETH_ADDR_LEN])
1973 struct sockaddr_in sin;
1976 memset(&r, 0, sizeof r);
1977 sin.sin_family = AF_INET;
1978 sin.sin_addr.s_addr = ip;
1980 memcpy(&r.arp_pa, &sin, sizeof sin);
1981 r.arp_ha.sa_family = ARPHRD_ETHER;
1983 strncpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
1984 COVERAGE_INC(netdev_arp_lookup);
1985 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
1987 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
1988 } else if (retval != ENXIO) {
1989 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
1990 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
1996 nd_to_iff_flags(enum netdev_flags nd)
1999 if (nd & NETDEV_UP) {
2002 if (nd & NETDEV_PROMISC) {
2009 iff_to_nd_flags(int iff)
2011 enum netdev_flags nd = 0;
2015 if (iff & IFF_PROMISC) {
2016 nd |= NETDEV_PROMISC;
2022 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2023 enum netdev_flags on, enum netdev_flags *old_flagsp)
2025 int old_flags, new_flags;
2028 error = get_flags(netdev, &old_flags);
2030 *old_flagsp = iff_to_nd_flags(old_flags);
2031 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2032 if (new_flags != old_flags) {
2033 error = set_flags(netdev, new_flags);
2040 poll_notify(struct list *list)
2042 struct netdev_linux_notifier *notifier;
2043 LIST_FOR_EACH (notifier, node, list) {
2044 struct netdev_notifier *n = ¬ifier->notifier;
2050 netdev_linux_poll_cb(const struct rtnetlink_change *change,
2051 void *aux OVS_UNUSED)
2054 struct list *list = shash_find_data(&netdev_linux_notifiers,
2060 struct shash_node *node;
2061 SHASH_FOR_EACH (node, &netdev_linux_notifiers) {
2062 poll_notify(node->data);
2068 netdev_linux_poll_add(struct netdev *netdev,
2069 void (*cb)(struct netdev_notifier *), void *aux,
2070 struct netdev_notifier **notifierp)
2072 const char *netdev_name = netdev_get_name(netdev);
2073 struct netdev_linux_notifier *notifier;
2076 if (shash_is_empty(&netdev_linux_notifiers)) {
2077 int error = rtnetlink_notifier_register(&netdev_linux_poll_notifier,
2078 netdev_linux_poll_cb, NULL);
2084 list = shash_find_data(&netdev_linux_notifiers, netdev_name);
2086 list = xmalloc(sizeof *list);
2088 shash_add(&netdev_linux_notifiers, netdev_name, list);
2091 notifier = xmalloc(sizeof *notifier);
2092 netdev_notifier_init(¬ifier->notifier, netdev, cb, aux);
2093 list_push_back(list, ¬ifier->node);
2094 *notifierp = ¬ifier->notifier;
2099 netdev_linux_poll_remove(struct netdev_notifier *notifier_)
2101 struct netdev_linux_notifier *notifier =
2102 CONTAINER_OF(notifier_, struct netdev_linux_notifier, notifier);
2105 /* Remove 'notifier' from its list. */
2106 list = list_remove(¬ifier->node);
2107 if (list_is_empty(list)) {
2108 /* The list is now empty. Remove it from the hash and free it. */
2109 const char *netdev_name = netdev_get_name(notifier->notifier.netdev);
2110 shash_delete(&netdev_linux_notifiers,
2111 shash_find(&netdev_linux_notifiers, netdev_name));
2116 /* If that was the last notifier, unregister. */
2117 if (shash_is_empty(&netdev_linux_notifiers)) {
2118 rtnetlink_notifier_unregister(&netdev_linux_poll_notifier);
/* Expands to an initializer for a "struct netdev_class" that shares the Linux
 * implementation.  The classes differ only in NAME, the CREATE and ENUMERATE
 * callbacks, and the SET_STATS callback. */
#define NETDEV_LINUX_CLASS(NAME, CREATE, ENUMERATE, SET_STATS)  \
{                                                               \
    NAME,                                                       \
                                                                \
    /* Class-wide hooks. */                                     \
    netdev_linux_init,                                          \
    netdev_linux_run,                                           \
    netdev_linux_wait,                                          \
                                                                \
    /* Device lifetime. */                                      \
    CREATE,                                                     \
    netdev_linux_destroy,                                       \
    NULL,                       /* reconfigure */               \
                                                                \
    /* Handle lifetime. */                                      \
    netdev_linux_open,                                          \
    netdev_linux_close,                                         \
                                                                \
    ENUMERATE,                                                  \
                                                                \
    /* Receive. */                                              \
    netdev_linux_recv,                                          \
    netdev_linux_recv_wait,                                     \
    netdev_linux_drain,                                         \
                                                                \
    /* Send. */                                                 \
    netdev_linux_send,                                          \
    netdev_linux_send_wait,                                     \
                                                                \
    /* Link properties and statistics. */                       \
    netdev_linux_set_etheraddr,                                 \
    netdev_linux_get_etheraddr,                                 \
    netdev_linux_get_mtu,                                       \
    netdev_linux_get_ifindex,                                   \
    netdev_linux_get_carrier,                                   \
    netdev_linux_get_stats,                                     \
    SET_STATS,                                                  \
                                                                \
    netdev_linux_get_features,                                  \
    netdev_linux_set_advertisements,                            \
    netdev_linux_get_vlan_vid,                                  \
                                                                \
    /* Policing and QoS. */                                     \
    netdev_linux_set_policing,                                  \
    netdev_linux_get_qos_types,                                 \
    netdev_linux_get_qos_capabilities,                          \
    netdev_linux_get_qos,                                       \
    netdev_linux_set_qos,                                       \
    netdev_linux_get_queue,                                     \
    netdev_linux_set_queue,                                     \
    netdev_linux_delete_queue,                                  \
    netdev_linux_get_queue_stats,                               \
    netdev_linux_dump_queues,                                   \
    netdev_linux_dump_queue_stats,                              \
                                                                \
    /* IP configuration and lookups. */                         \
    netdev_linux_get_in4,                                       \
    netdev_linux_set_in4,                                       \
    netdev_linux_get_in6,                                       \
    netdev_linux_add_router,                                    \
    netdev_linux_get_next_hop,                                  \
    netdev_linux_arp_lookup,                                    \
                                                                \
    netdev_linux_update_flags,                                  \
                                                                \
    /* Change notification. */                                  \
    netdev_linux_poll_add,                                      \
    netdev_linux_poll_remove                                    \
}
2183 const struct netdev_class netdev_linux_class =
2186 netdev_linux_create,
2187 netdev_linux_enumerate,
2188 NULL); /* set_stats */
2190 const struct netdev_class netdev_tap_class =
2193 netdev_linux_create_tap,
2194 NULL, /* enumerate */
2195 NULL); /* set_stats */
2197 const struct netdev_class netdev_internal_class =
2200 netdev_linux_create,
2201 NULL, /* enumerate */
2202 netdev_vport_set_stats);
/* HTB traffic control class. */

/* Upper bound on HTB queue numbers: a class with minor number 'm' on major 1
 * is accepted as queue 'm - 1' only when 1 <= m <= HTB_N_QUEUES. */
#define HTB_N_QUEUES 0xf000
2210 unsigned int max_rate; /* In bytes/s. */
2214 struct tc_queue tc_queue;
2215 unsigned int min_rate; /* In bytes/s. */
2216 unsigned int max_rate; /* In bytes/s. */
2217 unsigned int burst; /* In bytes. */
2218 unsigned int priority; /* Lower values are higher priorities. */
2222 htb_get__(const struct netdev *netdev)
2224 struct netdev_dev_linux *netdev_dev =
2225 netdev_dev_linux_cast(netdev_get_dev(netdev));
2226 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
2230 htb_install__(struct netdev *netdev, uint64_t max_rate)
2232 struct netdev_dev_linux *netdev_dev =
2233 netdev_dev_linux_cast(netdev_get_dev(netdev));
2236 htb = xmalloc(sizeof *htb);
2237 tc_init(&htb->tc, &tc_ops_htb);
2238 htb->max_rate = max_rate;
2240 netdev_dev->tc = &htb->tc;
2245 /* Create an HTB qdisc.
2247 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2249 htb_setup_qdisc__(struct netdev *netdev)
2252 struct tc_htb_glob opt;
2253 struct ofpbuf request;
2254 struct tcmsg *tcmsg;
2256 tc_del_qdisc(netdev);
2258 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2259 NLM_F_EXCL | NLM_F_CREATE, &request);
2263 tcmsg->tcm_handle = tc_make_handle(1, 0);
2264 tcmsg->tcm_parent = TC_H_ROOT;
2266 nl_msg_put_string(&request, TCA_KIND, "htb");
2268 memset(&opt, 0, sizeof opt);
2269 opt.rate2quantum = 10;
2273 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2274 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2275 nl_msg_end_nested(&request, opt_offset);
2277 return tc_transact(&request, NULL);
2280 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2281 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2283 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2284 unsigned int parent, struct htb_class *class)
2287 struct tc_htb_opt opt;
2288 struct ofpbuf request;
2289 struct tcmsg *tcmsg;
2293 netdev_get_mtu(netdev, &mtu);
2295 memset(&opt, 0, sizeof opt);
2296 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2297 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2298 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2299 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2300 opt.prio = class->priority;
2302 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2306 tcmsg->tcm_handle = handle;
2307 tcmsg->tcm_parent = parent;
2309 nl_msg_put_string(&request, TCA_KIND, "htb");
2310 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2311 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2312 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2313 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2314 nl_msg_end_nested(&request, opt_offset);
2316 error = tc_transact(&request, NULL);
2318 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2319 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2320 netdev_get_name(netdev),
2321 tc_get_major(handle), tc_get_minor(handle),
2322 tc_get_major(parent), tc_get_minor(parent),
2323 class->min_rate, class->max_rate,
2324 class->burst, class->priority, strerror(error));
2329 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2330 * description of them into 'details'. The description complies with the
2331 * specification given in the vswitch database documentation for linux-htb
2334 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2336 static const struct nl_policy tca_htb_policy[] = {
2337 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2338 .min_len = sizeof(struct tc_htb_opt) },
2341 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2342 const struct tc_htb_opt *htb;
2344 if (!nl_parse_nested(nl_options, tca_htb_policy,
2345 attrs, ARRAY_SIZE(tca_htb_policy))) {
2346 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2350 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2351 class->min_rate = htb->rate.rate;
2352 class->max_rate = htb->ceil.rate;
2353 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2354 class->priority = htb->prio;
2359 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2360 struct htb_class *options,
2361 struct netdev_queue_stats *stats)
2363 struct nlattr *nl_options;
2364 unsigned int handle;
2367 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2368 if (!error && queue_id) {
2369 unsigned int major = tc_get_major(handle);
2370 unsigned int minor = tc_get_minor(handle);
2371 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2372 *queue_id = minor - 1;
2377 if (!error && options) {
2378 error = htb_parse_tca_options__(nl_options, options);
2384 htb_parse_qdisc_details__(struct netdev *netdev,
2385 const struct shash *details, struct htb_class *hc)
2387 const char *max_rate_s;
2389 max_rate_s = shash_find_data(details, "max-rate");
2390 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2391 if (!hc->max_rate) {
2394 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2395 hc->max_rate = netdev_features_to_bps(current) / 8;
2397 hc->min_rate = hc->max_rate;
2403 htb_parse_class_details__(struct netdev *netdev,
2404 const struct shash *details, struct htb_class *hc)
2406 const struct htb *htb = htb_get__(netdev);
2407 const char *min_rate_s = shash_find_data(details, "min-rate");
2408 const char *max_rate_s = shash_find_data(details, "max-rate");
2409 const char *burst_s = shash_find_data(details, "burst");
2410 const char *priority_s = shash_find_data(details, "priority");
2413 /* min-rate. Don't allow a min-rate below 1500 bytes/s. */
2415 /* min-rate is required. */
2418 hc->min_rate = strtoull(min_rate_s, NULL, 10) / 8;
2419 hc->min_rate = MAX(hc->min_rate, 1500);
2420 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2423 hc->max_rate = (max_rate_s
2424 ? strtoull(max_rate_s, NULL, 10) / 8
2426 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2427 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2431 * According to hints in the documentation that I've read, it is important
2432 * that 'burst' be at least as big as the largest frame that might be
2433 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2434 * but having it a bit too small is a problem. Since netdev_get_mtu()
2435 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2436 * the MTU. We actually add 64, instead of 14, as a guard against
2437 * additional headers get tacked on somewhere that we're not aware of. */
2438 netdev_get_mtu(netdev, &mtu);
2439 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2440 hc->burst = MAX(hc->burst, mtu + 64);
2443 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for class 'handle' under 'parent' on 'netdev' and parses
 * the reply into 'options' and 'stats' (each handled by
 * htb_parse_tcmsg__()).  Returns 0 if successful, otherwise the error from
 * the query or from parsing. */
static int
htb_query_class__(const struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error = tc_query_class(netdev, handle, parent, &reply);

    if (error) {
        return error;
    }

    error = htb_parse_tcmsg__(reply, NULL, options, stats);
    ofpbuf_delete(reply);
    return error;
}
2465 htb_tc_install(struct netdev *netdev, const struct shash *details)
2469 error = htb_setup_qdisc__(netdev);
2471 struct htb_class hc;
2473 htb_parse_qdisc_details__(netdev, details, &hc);
2474 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2475 tc_make_handle(1, 0), &hc);
2477 htb_install__(netdev, hc.max_rate);
2483 static struct htb_class *
2484 htb_class_cast__(const struct tc_queue *queue)
2486 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2490 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2491 const struct htb_class *hc)
2493 struct htb *htb = htb_get__(netdev);
2494 size_t hash = hash_int(queue_id, 0);
2495 struct tc_queue *queue;
2496 struct htb_class *hcp;
2498 queue = tc_find_queue__(netdev, queue_id, hash);
2500 hcp = htb_class_cast__(queue);
2502 hcp = xmalloc(sizeof *hcp);
2503 queue = &hcp->tc_queue;
2504 queue->queue_id = queue_id;
2505 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2508 hcp->min_rate = hc->min_rate;
2509 hcp->max_rate = hc->max_rate;
2510 hcp->burst = hc->burst;
2511 hcp->priority = hc->priority;
/* Rebuilds the cached HTB state of 'netdev' from what the kernel reports.
 *
 * Class 1:fffe is queried for the qdisc-wide settings and its max_rate is
 * passed to htb_install__().  Then the classes are dumped and every message
 * for which htb_parse_tcmsg__() returns 0 is recorded with
 * htb_update_queue__().  'nlmsg' is not used. */
2515 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2518 struct nl_dump dump;
2519 struct htb_class hc;
2522 /* Get qdisc options. */
2524 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2525 htb = htb_install__(netdev, hc.max_rate);
2528 if (!start_queue_dump(netdev, &dump)) {
2531 while (nl_dump_next(&dump, &msg)) {
2532 unsigned int queue_id;
2534 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2535 htb_update_queue__(netdev, queue_id, &hc);
2538 nl_dump_done(&dump);
/* Tears down the queue map of the htb that contains 'tc'.  The _SAFE iterator
 * is used because each htb_class is unlinked from 'htb->tc.queues' while the
 * map is being traversed. */
2544 htb_tc_destroy(struct tc *tc)
2546 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2547 struct htb_class *hc, *next;
2549 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2550 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Reports the qdisc-wide HTB settings of 'netdev' by adding "max-rate" to
 * 'details'.  The stored max_rate is multiplied by 8 for output (presumably
 * bytes/s internally and bits/s externally -- confirm).  The value string is
 * allocated by xasprintf(). */
2558 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2560 const struct htb *htb = htb_get__(netdev);
2561 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
/* Changes the qdisc-wide HTB settings of 'netdev' to those in 'details': the
 * details are parsed into 'hc', class 1:fffe (parent 1:0) is reprogrammed with
 * them, and the new max_rate is stored in the cached htb. */
2566 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2568 struct htb_class hc;
2571 htb_parse_qdisc_details__(netdev, details, &hc);
2572 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2573 tc_make_handle(1, 0), &hc);
2575 htb_get__(netdev)->max_rate = hc.max_rate;
/* Reports the cached settings of 'queue' by adding them to 'details':
 * "min-rate", "max-rate" (only when it differs from min-rate), "burst", and
 * "priority".  Rates and burst are multiplied by 8 for output; every value is
 * a string allocated by xasprintf().  'netdev' is not used. */
2581 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2582 const struct tc_queue *queue, struct shash *details)
2584 const struct htb_class *hc = htb_class_cast__(queue);
2586 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2587 if (hc->min_rate != hc->max_rate) {
2588 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2590 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2592 shash_add(details, "priority", xasprintf("%u", hc->priority));
/* Configures queue 'queue_id' on 'netdev' from 'details': parses the details
 * with htb_parse_class_details__(), programs kernel class 1:(queue_id + 1)
 * under parent 1:fffe, and then caches the settings with
 * htb_update_queue__(). */
2598 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2599 const struct shash *details)
2601 struct htb_class hc;
2604 error = htb_parse_class_details__(netdev, details, &hc);
2609 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2610 tc_make_handle(1, 0xfffe), &hc);
2615 htb_update_queue__(netdev, queue_id, &hc);
/* Deletes 'queue' from 'netdev': asks the kernel to delete class
 * 1:(queue_id + 1) and unlinks the cached htb_class from 'htb->tc.queues'. */
2620 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2622 struct htb_class *hc = htb_class_cast__(queue);
2623 struct htb *htb = htb_get__(netdev);
2626 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2628 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches the statistics of 'queue' into 'stats' by querying kernel class
 * 1:(queue_id + 1) under parent 1:fffe.  NULL is passed for the class options,
 * so only statistics are extracted.  Returns htb_query_class__()'s result. */
2635 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2636 struct netdev_queue_stats *stats)
2638 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2639 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one class message 'nlmsg' from a statistics dump.  The handle and
 * statistics are extracted with tc_parse_class(); if the handle is 1:N with
 * 1 <= N <= HTB_N_QUEUES, 'cb' is invoked with queue ID N - 1, the statistics,
 * and 'aux'.  'netdev' is not used. */
2643 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2644 const struct ofpbuf *nlmsg,
2645 netdev_dump_queue_stats_cb *cb, void *aux)
2647 struct netdev_queue_stats stats;
2648 unsigned int handle, major, minor;
2651 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2656 major = tc_get_major(handle);
2657 minor = tc_get_minor(handle);
2658 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2659 (*cb)(minor - 1, &stats, aux);
/* Operations table for the HTB qdisc: kernel name "htb", OVS name
 * "linux-htb", supporting up to HTB_N_QUEUES queues.  Entries are positional,
 * in the order of the members of struct tc_ops. */
2664 static const struct tc_ops tc_ops_htb = {
2665 "htb", /* linux_name */
2666 "linux-htb", /* ovs_name */
2667 HTB_N_QUEUES, /* n_queues */
2676 htb_class_get_stats,
2677 htb_class_dump_stats
2680 /* "linux-hfsc" traffic control class. */
/* Largest class minor number that is treated as an HFSC queue (queue ID is
 * minor - 1); also the 'n_queues' value advertised in tc_ops_hfsc. */
2682 #define HFSC_N_QUEUES 0xf000
2690 struct tc_queue tc_queue;
/* Returns the struct hfsc that contains 'netdev''s device-level tc object
 * ('netdev_dev->tc').  Assumes that object is the 'tc' member of a struct
 * hfsc, i.e. that HFSC is the installed qdisc -- TODO confirm with callers. */
2695 static struct hfsc *
2696 hfsc_get__(const struct netdev *netdev)
2698 struct netdev_dev_linux *netdev_dev;
2699 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2700 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
/* Returns the struct hfsc_class in which 'queue' is embedded as the
 * 'tc_queue' member.  'queue' must really be part of an hfsc_class for the
 * result to be meaningful. */
2703 static struct hfsc_class *
2704 hfsc_class_cast__(const struct tc_queue *queue)
2706 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Creates the cached HFSC state for 'netdev': allocates a struct hfsc,
 * initializes its embedded tc with tc_ops_hfsc, stores 'max_rate' in it, and
 * makes it the device's tc object ('netdev_dev->tc').  Presumably returns the
 * new hfsc, since callers assign the result -- confirm. */
2709 static struct hfsc *
2710 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2712 struct netdev_dev_linux * netdev_dev;
2715 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2716 hfsc = xmalloc(sizeof *hfsc);
2717 tc_init(&hfsc->tc, &tc_ops_hfsc);
2718 hfsc->max_rate = max_rate;
2719 netdev_dev->tc = &hfsc->tc;
/* Stores the settings in 'hc' as the cached state of queue 'queue_id' on
 * 'netdev'.
 *
 * The queue is looked up with tc_find_queue__() using hash_int(queue_id, 0).
 * A queue that is not cached yet (presumably: the lookup found nothing) gets
 * a newly allocated hfsc_class that is inserted into 'hfsc->tc.queues' under
 * that same hash.  min_rate and max_rate are then copied from 'hc'. */
2725 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2726 const struct hfsc_class *hc)
2730 struct hfsc_class *hcp;
2731 struct tc_queue *queue;
2733 hfsc = hfsc_get__(netdev);
2734 hash = hash_int(queue_id, 0);
2736 queue = tc_find_queue__(netdev, queue_id, hash);
2738 hcp = hfsc_class_cast__(queue);
2740 hcp = xmalloc(sizeof *hcp);
2741 queue = &hcp->tc_queue;
2742 queue->queue_id = queue_id;
2743 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
2746 hcp->min_rate = hc->min_rate;
2747 hcp->max_rate = hc->max_rate;
/* Extracts the rates of an HFSC class from its nested options 'nl_options'
 * into 'class'.
 *
 * The TCA_HFSC_RSC, TCA_HFSC_FSC, and TCA_HFSC_USC attributes are read as
 * struct tc_service_curve (each policy entry demands at least that many
 * bytes).  Only the subset of HFSC that this file itself configures is
 * accepted; each of the following is logged (rate-limited) as a parse failure:
 *
 *   - any curve with a nonzero 'm1' or 'd' (non-linear curve),
 *   - 'rsc->m2' different from 'fsc->m2',
 *   - 'rsc->m2' greater than 'usc->m2'.
 *
 * On success 'class->min_rate' is 'fsc->m2' and 'class->max_rate' is
 * 'usc->m2'. */
2751 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2753 const struct tc_service_curve *rsc, *fsc, *usc;
2754 static const struct nl_policy tca_hfsc_policy[] = {
2756 .type = NL_A_UNSPEC,
2758 .min_len = sizeof(struct tc_service_curve),
2761 .type = NL_A_UNSPEC,
2763 .min_len = sizeof(struct tc_service_curve),
2766 .type = NL_A_UNSPEC,
2768 .min_len = sizeof(struct tc_service_curve),
2771 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
2773 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
2774 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
2775 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
2779 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
2780 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
2781 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
2783 if (rsc->m1 != 0 || rsc->d != 0 ||
2784 fsc->m1 != 0 || fsc->d != 0 ||
2785 usc->m1 != 0 || usc->d != 0) {
2786 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2787 "Non-linear service curves are not supported.");
2791 if (rsc->m2 != fsc->m2) {
2792 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2793 "Real-time service curves are not supported ");
2797 if (rsc->m2 > usc->m2) {
2798 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2799 "Min-rate service curve is greater than "
2800 "the max-rate service curve.");
2804 class->min_rate = fsc->m2;
2805 class->max_rate = usc->m2;
/* Parses the HFSC class message 'tcmsg'.
 *
 * tc_parse_class() supplies the class handle, the nested options, and the
 * statistics (into 'stats').  A handle of the form 1:N with
 * 1 <= N <= HFSC_N_QUEUES is reported through '*queue_id' as N - 1.  The
 * nested options are decoded into 'options' by hfsc_parse_tca_options__().
 * Callers in this file pass NULL for the outputs they do not need, so the
 * stores are presumably guarded by null checks -- confirm. */
2810 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2811 struct hfsc_class *options,
2812 struct netdev_queue_stats *stats)
2815 unsigned int handle;
2816 struct nlattr *nl_options;
2818 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2824 unsigned int major, minor;
2826 major = tc_get_major(handle);
2827 minor = tc_get_minor(handle);
2828 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
2829 *queue_id = minor - 1;
2836 error = hfsc_parse_tca_options__(nl_options, options);
/* Queries the kernel for the class 'handle' with parent 'parent' on 'netdev'
 * (tc_query_class()) and hands the reply to hfsc_parse_tcmsg__() to fill in
 * 'options' and 'stats'.  No queue ID is requested (NULL is passed for it).
 * The reply buffer is deleted here once it has been parsed. */
2843 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
2844 unsigned int parent, struct hfsc_class *options,
2845 struct netdev_queue_stats *stats)
2848 struct ofpbuf *reply;
2850 error = tc_query_class(netdev, handle, parent, &reply);
2855 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
2856 ofpbuf_delete(reply);
2861 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
2862 struct hfsc_class *class)
2865 const char *max_rate_s;
2867 max_rate_s = shash_find_data(details, "max-rate");
2868 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2873 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2874 max_rate = netdev_features_to_bps(current) / 8;
2877 class->min_rate = max_rate;
2878 class->max_rate = max_rate;
/* Derives the rates of an HFSC queue from 'details' and stores them in
 * 'class'.
 *
 * "min-rate" is divided by 8, raised to at least 1500, and capped at the
 * qdisc-wide 'hfsc->max_rate'.  "max-rate", when given, is also divided by 8;
 * it is then raised to at least the computed min_rate and capped at
 * 'hfsc->max_rate'.
 *
 * NOTE(review): strtoull() is called without an end pointer or errno check,
 * so malformed numbers are silently treated as whatever prefix parses. */
2882 hfsc_parse_class_details__(struct netdev *netdev,
2883 const struct shash *details,
2884 struct hfsc_class * class)
2886 const struct hfsc *hfsc;
2887 uint32_t min_rate, max_rate;
2888 const char *min_rate_s, *max_rate_s;
2890 hfsc = hfsc_get__(netdev);
2891 min_rate_s = shash_find_data(details, "min-rate");
2892 max_rate_s = shash_find_data(details, "max-rate");
2898 min_rate = strtoull(min_rate_s, NULL, 10) / 8;
2899 min_rate = MAX(min_rate, 1500);
2900 min_rate = MIN(min_rate, hfsc->max_rate);
2902 max_rate = (max_rate_s
2903 ? strtoull(max_rate_s, NULL, 10) / 8
2905 max_rate = MAX(max_rate, min_rate);
2906 max_rate = MIN(max_rate, hfsc->max_rate);
2908 class->min_rate = min_rate;
2909 class->max_rate = max_rate;
/* Implementation notes: any existing root qdisc is first removed with
 * tc_del_qdisc().  The request is an RTM_NEWQDISC with NLM_F_EXCL |
 * NLM_F_CREATE for handle 1:0 under TC_H_ROOT, carrying kind "hfsc" and a
 * struct tc_hfsc_qopt that starts out zeroed.  The return value is that of
 * tc_transact(). */
2914 /* Create an HFSC qdisc.
2916 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
2918 hfsc_setup_qdisc__(struct netdev * netdev)
2920 struct tcmsg *tcmsg;
2921 struct ofpbuf request;
2922 struct tc_hfsc_qopt opt;
2924 tc_del_qdisc(netdev);
2926 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2927 NLM_F_EXCL | NLM_F_CREATE, &request);
2933 tcmsg->tcm_handle = tc_make_handle(1, 0);
2934 tcmsg->tcm_parent = TC_H_ROOT;
2936 memset(&opt, 0, sizeof opt);
2939 nl_msg_put_string(&request, TCA_KIND, "hfsc");
2940 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
2942 return tc_transact(&request, NULL);
/* Implementation notes: the request is an RTM_NEWTCLASS with NLM_F_CREATE for
 * 'handle' under 'parent'.  'class->min_rate' is sent as the 'm2' of both the
 * TCA_HFSC_RSC and TCA_HFSC_FSC curves and 'class->max_rate' as the 'm2' of
 * the TCA_HFSC_USC curve, all nested inside TCA_OPTIONS.  A failed
 * transaction is logged (rate-limited) with the handle, the parent, and both
 * rates. */
2945 /* Create an HFSC class.
2947 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
2948 * sc rate <min_rate> ul rate <max_rate>" */
2950 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
2951 unsigned int parent, struct hfsc_class *class)
2955 struct tcmsg *tcmsg;
2956 struct ofpbuf request;
2957 struct tc_service_curve min, max;
2959 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2965 tcmsg->tcm_handle = handle;
2966 tcmsg->tcm_parent = parent;
2970 min.m2 = class->min_rate;
2974 max.m2 = class->max_rate;
2976 nl_msg_put_string(&request, TCA_KIND, "hfsc");
2977 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2978 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
2979 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
2980 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
2981 nl_msg_end_nested(&request, opt_offset);
2983 error = tc_transact(&request, NULL);
2985 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2986 "min-rate %ubps, max-rate %ubps (%s)",
2987 netdev_get_name(netdev),
2988 tc_get_major(handle), tc_get_minor(handle),
2989 tc_get_major(parent), tc_get_minor(parent),
2990 class->min_rate, class->max_rate, strerror(error));
/* Installs HFSC on 'netdev': creates the qdisc with hfsc_setup_qdisc__(),
 * parses the qdisc-wide settings from 'details', programs them into class
 * 1:fffe (parent 1:0), and records the resulting max_rate in a new hfsc
 * through hfsc_install__(). */
2997 hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3000 struct hfsc_class class;
3002 error = hfsc_setup_qdisc__(netdev);
3008 hfsc_parse_qdisc_details__(netdev, details, &class);
3009 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3010 tc_make_handle(1, 0), &class);
3016 hfsc_install__(netdev, class.max_rate);
/* Rebuilds the cached HFSC state of 'netdev' from what the kernel reports.
 *
 * Class 1:fffe is queried for the qdisc-wide settings and its max_rate is
 * passed to hfsc_install__().  Then the classes are dumped and every message
 * for which hfsc_parse_tcmsg__() returns 0 is recorded with
 * hfsc_update_queue__().  'nlmsg' is not used. */
3021 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3025 struct nl_dump dump;
3026 struct hfsc_class hc;
3029 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3030 hfsc = hfsc_install__(netdev, hc.max_rate);
3032 if (!start_queue_dump(netdev, &dump)) {
3036 while (nl_dump_next(&dump, &msg)) {
3037 unsigned int queue_id;
3039 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3040 hfsc_update_queue__(netdev, queue_id, &hc);
3044 nl_dump_done(&dump);
/* Tears down the queue map of the hfsc that contains 'tc'.  The _SAFE
 * iterator is used because each hfsc_class is unlinked from
 * 'hfsc->tc.queues' while the map is being traversed. */
3049 hfsc_tc_destroy(struct tc *tc)
3052 struct hfsc_class *hc, *next;
3054 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3056 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3057 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Reports the qdisc-wide HFSC settings of 'netdev' by adding "max-rate" to
 * 'details'.  The stored max_rate is multiplied by 8 for output (presumably
 * bytes/s internally and bits/s externally -- confirm).  The value string is
 * allocated by xasprintf(). */
3066 hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3068 const struct hfsc *hfsc;
3069 hfsc = hfsc_get__(netdev);
3070 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
/* Changes the qdisc-wide HFSC settings of 'netdev' to those in 'details': the
 * details are parsed into 'class', class 1:fffe (parent 1:0) is reprogrammed
 * with them, and the new max_rate is stored in the cached hfsc. */
3075 hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3078 struct hfsc_class class;
3080 hfsc_parse_qdisc_details__(netdev, details, &class);
3081 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3082 tc_make_handle(1, 0), &class);
3085 hfsc_get__(netdev)->max_rate = class.max_rate;
/* Reports the cached settings of 'queue' by adding them to 'details':
 * "min-rate" always, "max-rate" only when it differs from min-rate.  Rates
 * are multiplied by 8 for output; every value is a string allocated by
 * xasprintf().  'netdev' is not used. */
3092 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3093 const struct tc_queue *queue, struct shash *details)
3095 const struct hfsc_class *hc;
3097 hc = hfsc_class_cast__(queue);
3098 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3099 if (hc->min_rate != hc->max_rate) {
3100 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
/* Configures queue 'queue_id' on 'netdev' from 'details': parses the details
 * with hfsc_parse_class_details__(), programs kernel class 1:(queue_id + 1)
 * under parent 1:fffe, and then caches the settings with
 * hfsc_update_queue__(). */
3106 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3107 const struct shash *details)
3110 struct hfsc_class class;
3112 error = hfsc_parse_class_details__(netdev, details, &class);
3117 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3118 tc_make_handle(1, 0xfffe), &class);
3123 hfsc_update_queue__(netdev, queue_id, &class);
/* Deletes 'queue' from 'netdev': asks the kernel to delete class
 * 1:(queue_id + 1) and unlinks the cached hfsc_class from
 * 'hfsc->tc.queues'. */
3128 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3132 struct hfsc_class *hc;
3134 hc = hfsc_class_cast__(queue);
3135 hfsc = hfsc_get__(netdev);
3137 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3139 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches the statistics of 'queue' into 'stats' by querying kernel class
 * 1:(queue_id + 1) under parent 1:fffe.  NULL is passed for the class options,
 * so only statistics are extracted.  Returns hfsc_query_class__()'s result. */
3146 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3147 struct netdev_queue_stats *stats)
3149 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3150 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one class message 'nlmsg' from a statistics dump.  The handle and
 * statistics are extracted with tc_parse_class(); if the handle is 1:N with
 * 1 <= N <= HFSC_N_QUEUES, 'cb' is invoked with queue ID N - 1, the
 * statistics, and 'aux'.  'netdev' is not used. */
3154 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3155 const struct ofpbuf *nlmsg,
3156 netdev_dump_queue_stats_cb *cb, void *aux)
3158 struct netdev_queue_stats stats;
3159 unsigned int handle, major, minor;
3162 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3167 major = tc_get_major(handle);
3168 minor = tc_get_minor(handle);
3169 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3170 (*cb)(minor - 1, &stats, aux);
/* Operations table for the HFSC qdisc: kernel name "hfsc", OVS name
 * "linux-hfsc", supporting up to HFSC_N_QUEUES queues.  Every operation is
 * implemented by the corresponding hfsc_*() function. */
3175 static const struct tc_ops tc_ops_hfsc = {
3176 "hfsc", /* linux_name */
3177 "linux-hfsc", /* ovs_name */
3178 HFSC_N_QUEUES, /* n_queues */
3179 hfsc_tc_install, /* tc_install */
3180 hfsc_tc_load, /* tc_load */
3181 hfsc_tc_destroy, /* tc_destroy */
3182 hfsc_qdisc_get, /* qdisc_get */
3183 hfsc_qdisc_set, /* qdisc_set */
3184 hfsc_class_get, /* class_get */
3185 hfsc_class_set, /* class_set */
3186 hfsc_class_delete, /* class_delete */
3187 hfsc_class_get_stats, /* class_get_stats */
3188 hfsc_class_dump_stats /* class_dump_stats */
3191 /* "linux-default" traffic control class.
3193 * This class represents the default, unnamed Linux qdisc. It corresponds to
3194 * the "" (empty string) QoS type in the OVS database. */
/* Makes the default-qdisc tc object the tc object of 'netdev''s device.
 *
 * 'tc' has static storage duration, so the object initialized here with
 * tc_ops_default persists across calls and is shared by every device that
 * uses it (presumably it is allocated only on the first call -- confirm). */
3197 default_install__(struct netdev *netdev)
3199 struct netdev_dev_linux *netdev_dev =
3200 netdev_dev_linux_cast(netdev_get_dev(netdev));
3201 static struct tc *tc;
3204 tc = xmalloc(sizeof *tc);
3205 tc_init(tc, &tc_ops_default);
3207 netdev_dev->tc = tc;
/* tc_install operation for the default qdisc: simply attaches the shared
 * default tc object to 'netdev'.  'details' is ignored. */
3211 default_tc_install(struct netdev *netdev,
3212 const struct shash *details OVS_UNUSED)
3214 default_install__(netdev);
/* tc_load operation for the default qdisc: simply attaches the shared default
 * tc object to 'netdev'.  'nlmsg' is ignored. */
3219 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3221 default_install__(netdev);
/* Operations table for the default Linux qdisc.  It has no kernel qdisc name
 * (linux_name is NULL) and leaves the destroy, qdisc, and class operations
 * unimplemented (NULL). */
3225 static const struct tc_ops tc_ops_default = {
3226 NULL, /* linux_name */
3231 NULL, /* tc_destroy */
3232 NULL, /* qdisc_get */
3233 NULL, /* qdisc_set */
3234 NULL, /* class_get */
3235 NULL, /* class_set */
3236 NULL, /* class_delete */
3237 NULL, /* class_get_stats */
3238 NULL /* class_dump_stats */
3241 /* "linux-other" traffic control class.
/* tc_load operation for qdiscs that this module does not manage itself.
 *
 * 'tc' has static storage duration, so the object initialized here with
 * tc_ops_other persists across calls and is shared by every device that uses
 * it (presumably it is allocated only on the first call -- confirm).  'nlmsg'
 * is ignored. */
3246 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3248 struct netdev_dev_linux *netdev_dev =
3249 netdev_dev_linux_cast(netdev_get_dev(netdev));
3250 static struct tc *tc;
3253 tc = xmalloc(sizeof *tc);
3254 tc_init(tc, &tc_ops_other);
3256 netdev_dev->tc = tc;
/* Operations table used for qdiscs of an unrecognized kind (OVS name
 * "linux-other").  It cannot be installed (tc_install is NULL) and leaves the
 * destroy, qdisc, and class operations unimplemented (NULL). */
3260 static const struct tc_ops tc_ops_other = {
3261 NULL, /* linux_name */
3262 "linux-other", /* ovs_name */
3264 NULL, /* tc_install */
3266 NULL, /* tc_destroy */
3267 NULL, /* qdisc_get */
3268 NULL, /* qdisc_set */
3269 NULL, /* class_get */
3270 NULL, /* class_set */
3271 NULL, /* class_delete */
3272 NULL, /* class_get_stats */
3273 NULL /* class_dump_stats */
3276 /* Traffic control. */
/* Derived from the first three values of /proc/net/psched as
 * (double) a * c / b; used as the conversion factor between bytes and ticks
 * in tc_ticks_to_bytes() and tc_bytes_to_ticks(). */
3278 /* Number of kernel "tc" ticks per second. */
3279 static double ticks_per_s;
/* NOTE(review): 'buffer_hz' is the divisor in tc_buffer_per_jiffy(), so it
 * must be nonzero by the time that function divides -- confirm that every
 * path that sets it guarantees this. */
3281 /* Number of kernel "jiffies" per second. This is used for the purpose of
3282 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3283 * one jiffy's worth of data.
3285 * There are two possibilities here:
3287 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3288 * approximate range of 100 to 1024. That means that we really need to
3289 * make sure that the qdisc can buffer that much data.
3291 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3292 * has finely granular timers and there's no need to fudge additional room
3293 * for buffers. (There's no extra effort needed to implement that: the
3294 * large 'buffer_hz' is used as a divisor, so practically any number will
3295 * come out as 0 in the division. Small integer results in the case of
3296 * really high dividends won't have any real effect anyhow.)
3298 static unsigned int buffer_hz;
/* 'major' is shifted into the upper 16 bits before TC_H_MAKE() combines it
 * with 'minor'. */
3300 /* Returns tc handle 'major':'minor'. */
3302 tc_make_handle(unsigned int major, unsigned int minor)
3304 return TC_H_MAKE(major << 16, minor);
/* The result of TC_H_MAJ() is shifted right by 16 bits, which undoes the
 * shift applied by tc_make_handle(). */
3307 /* Returns the major number from 'handle'. */
3309 tc_get_major(unsigned int handle)
3311 return TC_H_MAJ(handle) >> 16;
/* Unlike the major number, the TC_H_MIN() result is returned without any
 * shift. */
3314 /* Returns the minor number from 'handle'. */
3316 tc_get_minor(unsigned int handle)
3318 return TC_H_MIN(handle);
/* Begins a traffic-control Netlink request about 'netdev' in 'request'.
 *
 * 'request' is initialized here (512 bytes initially) with a Netlink header
 * of the given 'type' and flags NLM_F_REQUEST | 'flags', followed by a zeroed
 * struct tcmsg whose family is AF_UNSPEC and whose ifindex is that of
 * 'netdev'.  The tcmsg is handed back to the caller, which must fill in
 * 'tcm_handle' and 'tcm_parent' and append any attributes.
 *
 * The ifindex lookup can fail; presumably no request is built in that case
 * and NULL is returned -- confirm. */
3321 static struct tcmsg *
3322 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3323 struct ofpbuf *request)
3325 struct tcmsg *tcmsg;
3329 error = get_ifindex(netdev, &ifindex);
3334 ofpbuf_init(request, 512);
3335 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3336 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3337 tcmsg->tcm_family = AF_UNSPEC;
3338 tcmsg->tcm_ifindex = ifindex;
3339 /* Caller should fill in tcmsg->tcm_handle. */
3340 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on 'rtnl_sock' with nl_sock_transact(), which receives
 * 'replyp' for the reply.  'request' is uninitialized here whatever the
 * outcome, so the caller must not use or free it afterward. */
3346 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3348 int error = nl_sock_transact(rtnl_sock, request, replyp);
3349 ofpbuf_uninit(request);
/* Reads the kernel's packet scheduler parameters from /proc/net/psched.
 *
 * The file is expected to hold four hexadecimal numbers, here called a, b, c,
 * and d.  Failure to open or read the file, or invalid parameters, is logged
 * as a warning.  'ticks_per_s' is computed as (double) a * c / b; the values
 * that result are logged at debug level together with 'buffer_hz'.
 *
 * NOTE(review): 'b' is used as a divisor -- confirm that it is validated as
 * nonzero along with the other parameters. */
3356 /* The values in psched are not individually very meaningful, but they are
3357 * important. The tables below show some values seen in the wild.
3361 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3362 * (Before that, there are hints that it was 1000000000.)
3364 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3368 * -----------------------------------
3369 * [1] 000c8000 000f4240 000f4240 00000064
3370 * [2] 000003e8 00000400 000f4240 3b9aca00
3371 * [3] 000003e8 00000400 000f4240 3b9aca00
3372 * [4] 000003e8 00000400 000f4240 00000064
3373 * [5] 000003e8 00000040 000f4240 3b9aca00
3374 * [6] 000003e8 00000040 000f4240 000000f9
3376 * a b c d ticks_per_s buffer_hz
3377 * ------- --------- ---------- ------------- ----------- -------------
3378 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3379 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3380 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3381 * [4] 1,000 1,024 1,000,000 100 976,562 100
3382 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3383 * [6] 1,000 64 1,000,000 249 15,625,000 249
3385 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3386 * [2] 2.6.26-1-686-bigmem from Debian lenny
3387 * [3] 2.6.26-2-sparc64 from Debian lenny
3388 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3389 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3390 * [6] 2.6.34 from kernel.org on KVM
3392 static const char fn[] = "/proc/net/psched";
3393 unsigned int a, b, c, d;
3399 stream = fopen(fn, "r");
3401 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3405 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3406 VLOG_WARN("%s: read failed", fn);
3410 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3414 VLOG_WARN("%s: invalid scheduler parameters", fn);
3418 ticks_per_s = (double) a * c / b;
3422 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3425 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
/* NOTE(review): 'rate * ticks' is evaluated in unsigned int and only then
 * converted to double for the division by 'ticks_per_s', so a large product
 * wraps around before it is divided.  Converting one operand to double (or a
 * 64-bit type) first would avoid that. */
3428 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3429 * rate of 'rate' bytes per second. */
3431 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3436 return (rate * ticks) / ticks_per_s;
/* A 'rate' of 0 yields 0 instead of a division by zero.  The cast applies to
 * 'ticks_per_s' alone, so its fractional part is dropped before the
 * multiplication, which is then carried out in unsigned long long. */
3439 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3440 * rate of 'rate' bytes per second. */
3442 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3447 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
/* The result is the integer quotient 'rate' / 'buffer_hz'.
 *
 * NOTE(review): presumably 'buffer_hz' is guaranteed to be nonzero before the
 * division -- confirm. */
3450 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3451 * a transmission rate of 'rate' bytes per second. */
3453 tc_buffer_per_jiffy(unsigned int rate)
3458 return rate / buffer_hz;
/* Implementation notes: attributes are parsed starting just past the Netlink
 * header and the struct tcmsg.  The policy makes TCA_KIND (a string)
 * mandatory and TCA_OPTIONS (nested) optional; a message that violates it is
 * logged (rate-limited) as a parse failure. */
3461 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3462 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3463 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3464 * stores NULL into it if it is absent.
3466 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3469 * Returns 0 if successful, otherwise a positive errno value. */
3471 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3472 struct nlattr **options)
3474 static const struct nl_policy tca_policy[] = {
3475 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3476 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3478 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3480 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3481 tca_policy, ta, ARRAY_SIZE(ta))) {
3482 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3487 *kind = nl_attr_get_string(ta[TCA_KIND]);
3491 *options = ta[TCA_OPTIONS];
3506 /* Given Netlink 'msg' that describes a class, extracts its class handle
 * (the message's 'tcm_handle', major and minor number together) into
 * '*handlep', its TCA_OPTIONS attribute into '*options', and its queue
 * statistics into '*stats'.  Any of the output arguments may be null.
 *
 * Both TCA_OPTIONS and TCA_STATS2 must be present in 'msg'.  Within
 * TCA_STATS2, both TCA_STATS_BASIC and TCA_STATS_QUEUE are required.
 * 'stats->tx_bytes' and 'stats->tx_packets' come from the basic statistics and
 * 'stats->tx_errors' from the queue's drop counter.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
3513 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3514 struct nlattr **options, struct netdev_queue_stats *stats)
3516 static const struct nl_policy tca_policy[] = {
3517 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3518 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3520 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3522 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3523 tca_policy, ta, ARRAY_SIZE(ta))) {
3524 VLOG_WARN_RL(&rl, "failed to parse class message");
3529 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3530 *handlep = tc->tcm_handle;
3534 *options = ta[TCA_OPTIONS];
3538 const struct gnet_stats_queue *gsq;
3539 struct gnet_stats_basic gsb;
3541 static const struct nl_policy stats_policy[] = {
3542 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3543 .min_len = sizeof gsb },
3544 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3545 .min_len = sizeof *gsq },
3547 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3549 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3550 sa, ARRAY_SIZE(sa))) {
3551 VLOG_WARN_RL(&rl, "failed to parse class stats");
3555 /* Alignment issues screw up the length of struct gnet_stats_basic on
3556 * some arch/bitsize combinations. Newer versions of Linux have a
3557 * struct gnet_stats_basic_packed, but we can't depend on that. The
3558 * easiest thing to do is just to make a copy. */
3559 memset(&gsb, 0, sizeof gsb);
3560 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3561 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3562 stats->tx_bytes = gsb.bytes;
3563 stats->tx_packets = gsb.packets;
3565 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3566 stats->tx_errors = gsq->drops;
3576 memset(stats, 0, sizeof *stats);
/* Builds an RTM_GETTCLASS request (with NLM_F_ECHO) for the class 'handle'
 * with parent 'parent' on 'netdev' and performs it with tc_transact(), which
 * receives 'replyp' for the reply.  A failure is logged (rate-limited) with
 * the device name, the handle, and the parent. */
3581 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3584 tc_query_class(const struct netdev *netdev,
3585 unsigned int handle, unsigned int parent,
3586 struct ofpbuf **replyp)
3588 struct ofpbuf request;
3589 struct tcmsg *tcmsg;
3592 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3596 tcmsg->tcm_handle = handle;
3597 tcmsg->tcm_parent = parent;
3599 error = tc_transact(&request, replyp);
3601 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3602 netdev_get_name(netdev),
3603 tc_get_major(handle), tc_get_minor(handle),
3604 tc_get_major(parent), tc_get_minor(parent),
/* Implementation notes: the request is an RTM_DELTCLASS without additional
 * flags, for 'handle' with 'tcm_parent' set to 0.  No reply is requested.  A
 * failure is logged (rate-limited) with the device name and the handle. */
3610 /* Equivalent to "tc class del dev <name> handle <handle>". */
3612 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3614 struct ofpbuf request;
3615 struct tcmsg *tcmsg;
3618 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3622 tcmsg->tcm_handle = handle;
3623 tcmsg->tcm_parent = 0;
3625 error = tc_transact(&request, NULL);
3627 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3628 netdev_get_name(netdev),
3629 tc_get_major(handle), tc_get_minor(handle),
/* Implementation notes: the request is an RTM_DELQDISC for handle 1:0 under
 * TC_H_ROOT.  EINVAL from the kernel gets special treatment (see the comment
 * in the body).  When no error remains and the device has a cached tc object,
 * that object's tc_destroy operation is invoked, if it has one, and
 * 'netdev_dev->tc' is reset to NULL. */
3635 /* Equivalent to "tc qdisc del dev <name> root". */
3637 tc_del_qdisc(struct netdev *netdev)
3639 struct netdev_dev_linux *netdev_dev =
3640 netdev_dev_linux_cast(netdev_get_dev(netdev));
3641 struct ofpbuf request;
3642 struct tcmsg *tcmsg;
3645 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
3649 tcmsg->tcm_handle = tc_make_handle(1, 0);
3650 tcmsg->tcm_parent = TC_H_ROOT;
3652 error = tc_transact(&request, NULL);
3653 if (error == EINVAL) {
3654 /* EINVAL probably means that the default qdisc was in use, in which
3655 * case we've accomplished our purpose. */
3658 if (!error && netdev_dev->tc) {
3659 if (netdev_dev->tc->ops->tc_destroy) {
3660 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3662 netdev_dev->tc = NULL;
/* Implementation notes:
 *
 *   - Nothing needs to be queried when 'netdev_dev->tc' is already set.
 *
 *   - The kernel is asked (RTM_GETQDISC with NLM_F_ECHO) for handle 1:0 with
 *     'tcm_parent' 0.  The reply's kind is mapped to a tc_ops with
 *     tc_lookup_linux_name(); tc_ops_other is the fallback for an unknown
 *     kind and for query errors other than ENOENT, and tc_ops_default is
 *     used for ENOENT.
 *
 *   - The chosen ops' tc_load is then invoked; the assertion checks that it
 *     succeeds exactly when it leaves 'netdev_dev->tc' set.
 *
 * NOTE(review): the call to tc_load casts away the const qualifier of
 * 'netdev'. */
3667 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3668 * kernel to determine what they are. Returns 0 if successful, otherwise a
3669 * positive errno value. */
3671 tc_query_qdisc(const struct netdev *netdev)
3673 struct netdev_dev_linux *netdev_dev =
3674 netdev_dev_linux_cast(netdev_get_dev(netdev));
3675 struct ofpbuf request, *qdisc;
3676 const struct tc_ops *ops;
3677 struct tcmsg *tcmsg;
3681 if (netdev_dev->tc) {
3685 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3686 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3687 * 2.6.35 without that fix backported to it.
3689 * To avoid the OOPS, we must not make a request that would attempt to dump
3690 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3691 * few others. There are a few ways that I can see to do this, but most of
3692 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3693 * technique chosen here is to assume that any non-default qdisc that we
3694 * create will have a class with handle 1:0. The built-in qdiscs only have
3695 * a class with handle 0:0.
3697 * We could check for Linux 2.6.35+ and use a more straightforward method
3699 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
3703 tcmsg->tcm_handle = tc_make_handle(1, 0);
3704 tcmsg->tcm_parent = 0;
3706 /* Figure out what tc class to instantiate. */
3707 error = tc_transact(&request, &qdisc);
3711 error = tc_parse_qdisc(qdisc, &kind, NULL);
3713 ops = &tc_ops_other;
3715 ops = tc_lookup_linux_name(kind);
3717 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3718 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3720 ops = &tc_ops_other;
3723 } else if (error == ENOENT) {
3724 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3725 * other entity that doesn't have a handle 1:0. We will assume
3726 * that it's the system default qdisc. */
3727 ops = &tc_ops_default;
3730 /* Who knows? Maybe the device got deleted. */
3731 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3732 netdev_get_name(netdev), strerror(error));
3733 ops = &tc_ops_other;
3736 /* Instantiate it. */
3737 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
3738 assert((load_error == 0) == (netdev_dev->tc != NULL));
3739 ofpbuf_delete(qdisc);
3741 return error ? error : load_error;
/* Implementation notes: ETH_PAYLOAD_MAX can stand in for 'mtu' (presumably
 * when the caller passes 0 -- confirm).  The Ethernet and VLAN header lengths
 * are added to 'mtu' before the shift count is determined, and the loop keeps
 * counting for as long as the remaining value is 256 or more. */
3744 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3745 approximate the time to transmit packets of various lengths. For an MTU of
3746 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3747 represents two possible packet lengths; for a MTU of 513 through 1024, four
3748 possible lengths; and so on.
3750 Returns, for the specified 'mtu', the number of bits that packet lengths
3751 need to be shifted right to fit within such a 256-entry table. */
3753 tc_calc_cell_log(unsigned int mtu)
3758 mtu = ETH_PAYLOAD_MAX;
3760 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
3762 for (cell_log = 0; mtu >= 256; cell_log++) {
/* Implementation notes: 'rate' is zeroed first, so the fields that are not
 * assigned here ('overhead' and 'cell_align' are deliberately left to the
 * memset) start out as 0.  'cell_log' comes from tc_calc_cell_log(mtu) and
 * 'mpu' is ETH_TOTAL_MIN. */
3769 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
3772 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
3774 memset(rate, 0, sizeof *rate);
3775 rate->cell_log = tc_calc_cell_log(mtu);
3776 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3777 /* rate->cell_align = 0; */ /* distro headers. */
3778 rate->mpu = ETH_TOTAL_MIN;
/* Implementation notes: TC_RTAB_SIZE bytes of attribute payload are reserved
 * uninitialized and then filled in completely.  Entry 'i' holds the number of
 * ticks needed to send a packet of (i + 1) << cell_log bytes at 'rate->rate',
 * where packet sizes below 'rate->mpu' are rounded up to 'rate->mpu'. */
3782 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
3783 * attribute of the specified "type".
3785 * See tc_calc_cell_log() above for a description of "rtab"s. */
3787 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
3792 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
3793 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
3794 unsigned packet_size = (i + 1) << rate->cell_log;
3795 if (packet_size < rate->mpu) {
3796 packet_size = rate->mpu;
3798 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* Implementation notes: the burst is never allowed to drop below one jiffy's
 * worth of data at 'Bps' plus one 'mtu'; the larger of that minimum and
 * 'burst_bytes' is converted to ticks at rate 'Bps'.
 *
 * NOTE(review): 'burst_bytes' is uint64_t, but tc_bytes_to_ticks() takes its
 * size as unsigned int, so a burst that does not fit is narrowed silently. */
3802 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
3803 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
3804 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
3807 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
3809 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
3810 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
3814 /* Utility functions. */
/* Retrieves the statistics of the network device with index 'ifindex' into
 * 'stats' by means of an RTM_GETLINK request on 'rtnl_sock'.
 *
 * The reply must carry IFLA_IFNAME; IFLA_STATS is optional as far as the
 * parsing policy is concerned, but a reply without it is logged
 * (rate-limited) and rejected.  The counters are copied field by field from
 * the reply's struct rtnl_link_stats.  The reply buffer is deleted on each of
 * the paths that can be seen to obtain one. */
3817 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
3819 /* Policy for RTNLGRP_LINK messages.
3821 * There are *many* more fields in these messages, but currently we only
3822 * care about these fields. */
3823 static const struct nl_policy rtnlgrp_link_policy[] = {
3824 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
3825 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
3826 .min_len = sizeof(struct rtnl_link_stats) },
3829 struct ofpbuf request;
3830 struct ofpbuf *reply;
3831 struct ifinfomsg *ifi;
3832 const struct rtnl_link_stats *rtnl_stats;
3833 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
3836 ofpbuf_init(&request, 0);
3837 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
3838 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
3839 ifi->ifi_family = PF_UNSPEC;
3840 ifi->ifi_index = ifindex;
3841 error = nl_sock_transact(rtnl_sock, &request, &reply);
3842 ofpbuf_uninit(&request);
3847 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
3848 rtnlgrp_link_policy,
3849 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
3850 ofpbuf_delete(reply);
3854 if (!attrs[IFLA_STATS]) {
3855 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
3856 ofpbuf_delete(reply);
3860 rtnl_stats = nl_attr_get(attrs[IFLA_STATS]);
3861 stats->rx_packets = rtnl_stats->rx_packets;
3862 stats->tx_packets = rtnl_stats->tx_packets;
3863 stats->rx_bytes = rtnl_stats->rx_bytes;
3864 stats->tx_bytes = rtnl_stats->tx_bytes;
3865 stats->rx_errors = rtnl_stats->rx_errors;
3866 stats->tx_errors = rtnl_stats->tx_errors;
3867 stats->rx_dropped = rtnl_stats->rx_dropped;
3868 stats->tx_dropped = rtnl_stats->tx_dropped;
3869 stats->multicast = rtnl_stats->multicast;
3870 stats->collisions = rtnl_stats->collisions;
3871 stats->rx_length_errors = rtnl_stats->rx_length_errors;
3872 stats->rx_over_errors = rtnl_stats->rx_over_errors;
3873 stats->rx_crc_errors = rtnl_stats->rx_crc_errors;
3874 stats->rx_frame_errors = rtnl_stats->rx_frame_errors;
3875 stats->rx_fifo_errors = rtnl_stats->rx_fifo_errors;
3876 stats->rx_missed_errors = rtnl_stats->rx_missed_errors;
3877 stats->tx_aborted_errors = rtnl_stats->tx_aborted_errors;
3878 stats->tx_carrier_errors = rtnl_stats->tx_carrier_errors;
3879 stats->tx_fifo_errors = rtnl_stats->tx_fifo_errors;
3880 stats->tx_heartbeat_errors = rtnl_stats->tx_heartbeat_errors;
3881 stats->tx_window_errors = rtnl_stats->tx_window_errors;
3883 ofpbuf_delete(reply);
/* Fills in 'stats' for the network device named 'netdev_name' by scanning
 * the text file /proc/net/dev one line at a time.
 *
 * NOTE(review): this view omits several lines of the function (its return
 * type, braces, local declarations, return statements and part of the scan
 * call), so return values and stream cleanup are not described here. */
3889 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
3891 static const char fn[] = "/proc/net/dev";
3896 stream = fopen(fn, "r");
/* Log (rate-limited) the reason the file could not be opened. */
3898 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
/* Each fgets() call stores at most 'sizeof line - 1' characters in 'line'. */
3903 while (fgets(line, sizeof line, stream)) {
/* X64 is the scanf conversion for one uint64_t counter; "%*u" parses an
 * unsigned field and discards it. */
3906 #define X64 "%"SCNu64
3909 X64 X64 X64 X64 X64 X64 X64 "%*u"
3910 X64 X64 X64 X64 X64 X64 X64 "%*u",
3916 &stats->rx_fifo_errors,
3917 &stats->rx_frame_errors,
3923 &stats->tx_fifo_errors,
3925 &stats->tx_carrier_errors) != 15) {
/* The scan did not yield exactly 15 converted items: report the file name
 * and the line number 'ln'. */
3926 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
3927 } else if (!strcmp(devname, netdev_name)) {
/* This line belongs to the requested device.  The counters below are not
 * among the scan targets visible above; presumably UINT64_MAX marks them
 * as unavailable from this source -- TODO confirm against the callers. */
3928 stats->rx_length_errors = UINT64_MAX;
3929 stats->rx_over_errors = UINT64_MAX;
3930 stats->rx_crc_errors = UINT64_MAX;
3931 stats->rx_missed_errors = UINT64_MAX;
3932 stats->tx_aborted_errors = UINT64_MAX;
3933 stats->tx_heartbeat_errors = UINT64_MAX;
3934 stats->tx_window_errors = UINT64_MAX;
/* Warn that the file held no entry for the device.  Presumably reached only
 * when no line matched; the surrounding control flow is not visible. */
3940 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Queries the interface flags of 'netdev' with the SIOCGIFFLAGS request,
 * issued through netdev_linux_do_ioctl() under the device's name, and
 * stores the resulting ifr.ifr_flags in '*flags'.
 *
 * NOTE(review): the declaration of 'ifr', the tail of the ioctl call and
 * the return statement are not visible in this view. */
3946 get_flags(const struct netdev *netdev, int *flags)
3951 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
3953 *flags = ifr.ifr_flags;
/* Sets the interface flags of 'netdev' to 'flags': places them in
 * ifr.ifr_flags and issues the SIOCSIFFLAGS request through
 * netdev_linux_do_ioctl(), whose result is returned unchanged.
 *
 * NOTE(review): the declaration of 'ifr' and the tail of the call are not
 * visible in this view. */
3958 set_flags(struct netdev *netdev, int flags)
3962 ifr.ifr_flags = flags;
3963 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
/* Asks the kernel for the interface index of the device named 'netdev_name'
 * using ioctl(SIOCGIFINDEX) on 'af_inet_sock'.  On success the index found
 * in ifr.ifr_ifindex is returned.  An ioctl failure is logged as a
 * rate-limited warning.
 *
 * NOTE(review): the value returned on failure is on a line not visible in
 * this view. */
3968 do_get_ifindex(const char *netdev_name)
/* NOTE(review): unlike get_etheraddr(), no memset() of 'ifr' is visible
 * here, and strncpy() does not NUL-terminate ifr_name when 'netdev_name'
 * fills the whole field. */
3972 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
3973 COVERAGE_INC(netdev_get_ifindex);
3974 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
3975 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
3976 netdev_name, strerror(errno));
3979 return ifr.ifr_ifindex;
/* Stores the interface index of 'netdev_' in '*ifindexp'.
 *
 * The index is cached in the underlying netdev_dev_linux: when the
 * VALID_IFINDEX bit is not yet set in 'cache_valid', the index is fetched
 * with do_get_ifindex(), saved in 'ifindex' and the bit is set, so later
 * calls reuse the stored value.
 *
 * NOTE(review): the lines between the lookup and the cache update, and the
 * return statement, are not visible in this view, so handling of a failed
 * lookup is not described here. */
3983 get_ifindex(const struct netdev *netdev_, int *ifindexp)
3985 struct netdev_dev_linux *netdev_dev =
3986 netdev_dev_linux_cast(netdev_get_dev(netdev_));
/* Cache miss: query the kernel by device name. */
3988 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
3989 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
3993 netdev_dev->cache_valid |= VALID_IFINDEX;
3994 netdev_dev->ifindex = ifindex;
3996 *ifindexp = netdev_dev->ifindex;
/* Reads the hardware address of the device named 'netdev_name' with
 * ioctl(SIOCGIFHWADDR) on 'af_inet_sock' and copies its first ETH_ADDR_LEN
 * bytes into 'ea'.
 *
 * An ioctl failure is logged at error level.  A warning is logged when the
 * reported address family is neither AF_UNSPEC nor ARPHRD_ETHER.
 *
 * NOTE(review): return statements and closing braces are not visible in
 * this view, so the returned values are not described here. */
4001 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
/* Start from a zeroed request so unused fields hold no stale data. */
4006 memset(&ifr, 0, sizeof ifr);
4007 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4008 COVERAGE_INC(netdev_get_hwaddr);
4009 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4010 VLOG_ERR("ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4011 netdev_name, strerror(errno));
/* The kernel reports the address family in sa_family of the returned
 * hardware address. */
4014 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4015 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4016 VLOG_WARN("%s device has unknown hardware address family %d",
4017 netdev_name, hwaddr_family);
4019 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware address of the device named 'netdev_name' to the
 * ETH_ADDR_LEN bytes in 'mac', tagging the request with address family
 * 'hwaddr_family', using ioctl(SIOCSIFHWADDR) on 'af_inet_sock'.
 *
 * An ioctl failure is logged at error level.
 *
 * NOTE(review): return statements are not visible in this view, so the
 * returned values are not described here. */
4024 set_etheraddr(const char *netdev_name, int hwaddr_family,
4025 const uint8_t mac[ETH_ADDR_LEN])
/* Build the request from a zeroed ifreq: name, family, then address bytes. */
4029 memset(&ifr, 0, sizeof ifr);
4030 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4031 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4032 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4033 COVERAGE_INC(netdev_set_hwaddr);
4034 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4035 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4036 netdev_name, strerror(errno));
/* Runs an ethtool operation on the device named 'name' by issuing
 * ioctl(SIOCETHTOOL) on 'af_inet_sock' with 'ecmd' attached as the request
 * payload.  'cmd_name' is used in the log message on failure.
 *
 * A failure is logged as a rate-limited warning unless errno is EOPNOTSUPP.
 *
 * NOTE(review): the use of 'cmd' and the return statements are on lines not
 * visible in this view; presumably 'cmd' is stored in 'ecmd' before the
 * ioctl -- confirm against the full source. */
4043 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4044 int cmd, const char *cmd_name)
4048 memset(&ifr, 0, sizeof ifr);
4049 strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
/* SIOCETHTOOL takes the ethtool command structure through ifr_data. */
4050 ifr.ifr_data = (caddr_t) ecmd;
4053 COVERAGE_INC(netdev_ethtool);
4054 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4057 if (errno != EOPNOTSUPP) {
4058 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4059 "failed: %s", cmd_name, name, strerror(errno));
4061 /* The device doesn't support this operation. That's pretty
4062 * common, so there's no point in logging anything. */
/* Issues the interface ioctl 'cmd' on 'af_inet_sock' for the device named
 * 'name'.  The name is copied into ifr->ifr_name; every other field of
 * '*ifr' is passed to the kernel as the caller left it.  'cmd_name' is used
 * only in the rate-limited debug message logged when ioctl() returns -1.
 *
 * NOTE(review): the return statements are not visible in this view. */
4069 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4070 const char *cmd_name)
4072 strncpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4073 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4074 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
/* Retrieves an IPv4 address of 'netdev' using the interface ioctl 'cmd'
 * ('cmd_name' is passed on to netdev_linux_do_ioctl() for logging).  The
 * request is tagged with family AF_INET, and the address the kernel leaves
 * in ifr.ifr_addr is stored in '*ip'.
 *
 * NOTE(review): the condition guarding the copy into '*ip' and the return
 * statement are not visible in this view. */
4082 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4083 int cmd, const char *cmd_name)
4088 ifr.ifr_addr.sa_family = AF_INET;
4089 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
/* Reinterpret the generic sockaddr as an IPv4 socket address. */
4091 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4092 *ip = sin->sin_addr;