2 * Copyright (c) 2009, 2010 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
21 #include <arpa/inet.h>
23 #include <linux/gen_stats.h>
24 #include <linux/if_tun.h>
26 #include <linux/types.h>
27 #include <linux/ethtool.h>
28 #include <linux/pkt_sched.h>
29 #include <linux/rtnetlink.h>
30 #include <linux/sockios.h>
31 #include <linux/version.h>
32 #include <sys/types.h>
33 #include <sys/ioctl.h>
34 #include <sys/socket.h>
35 #include <netpacket/packet.h>
36 #include <net/ethernet.h>
38 #include <linux/if_tunnel.h>
39 #include <net/if_arp.h>
40 #include <net/if_packet.h>
41 #include <net/route.h>
42 #include <netinet/in.h>
49 #include "dynamic-string.h"
50 #include "fatal-signal.h"
53 #include "netdev-provider.h"
54 #include "netdev-vport.h"
57 #include "openflow/openflow.h"
59 #include "poll-loop.h"
60 #include "rtnetlink.h"
61 #include "socket-util.h"
/* Logging module for this file, followed by its coverage counters. */
66 VLOG_DEFINE_THIS_MODULE(netdev_linux);
68 COVERAGE_DEFINE(netdev_get_vlan_vid);
69 COVERAGE_DEFINE(netdev_set_policing);
70 COVERAGE_DEFINE(netdev_arp_lookup);
71 COVERAGE_DEFINE(netdev_get_ifindex);
72 COVERAGE_DEFINE(netdev_get_hwaddr);
73 COVERAGE_DEFINE(netdev_set_hwaddr);
74 COVERAGE_DEFINE(netdev_ethtool);
76 /* These were introduced in Linux 2.6.14, so they might be missing if we have old headers. */
/* Pause and asymmetric-pause bits of the ethtool 'advertising' mask, supplied
 * here only when the system headers do not already define them. */
78 #ifndef ADVERTISED_Pause
79 #define ADVERTISED_Pause (1 << 13)
81 #ifndef ADVERTISED_Asym_Pause
82 #define ADVERTISED_Asym_Pause (1 << 14)
85 /* This was introduced in Linux 2.6.25, so it might be missing if we have old headers. */
88 #define TC_RTAB_SIZE 1024 /* NOTE(review): the #ifndef guard is not visible in this listing -- confirm. */
/* Notifier shared by all "system" devices; netdev_linux_cache_cb() is its
 * callback.  It is registered when the first such device is created and
 * unregistered when the last one is destroyed. */
91 static struct rtnetlink_notifier netdev_linux_cache_notifier;
/* Number of live "system" devices that depend on the notifier above. */
92 static int cache_notifier_refcount;
/* Bits for 'cache_valid' in struct netdev_dev_linux.  A set bit means that
 * the corresponding cached field may be used without querying the kernel. */
95 VALID_IFINDEX = 1 << 0,
96 VALID_ETHERADDR = 1 << 1,
100 VALID_CARRIER = 1 << 5,
101 VALID_IS_PSEUDO = 1 << 6, /* Represents is_internal and is_tap. */
102 VALID_POLICING = 1 << 7,
103 VALID_HAVE_VPORT_STATS = 1 << 8
111 /* Traffic control. */
113 /* An instance of a traffic control class. Always associated with a particular network device.
116 * Each TC implementation subclasses this with whatever additional data it needs. */
119 const struct tc_ops *ops; /* Implementation whose hooks manage this instance. */
120 struct hmap queues; /* Contains "struct tc_queue"s.
121 * Read by generic TC layer.
122 * Written only by TC implementation. */
125 /* One traffic control queue.
127 * Each TC implementation subclasses this with whatever additional data it needs. */
130 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
131 unsigned int queue_id; /* OpenFlow queue ID. */
134 /* A particular kind of traffic control. Each implementation generally maps to
135 * one particular Linux qdisc class.
137 * The functions below return 0 if successful or a positive errno value on
138 * failure, except where otherwise noted. All of them must be provided, except
139 * where otherwise noted. */
141 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
142 * This is null for tc_ops_default and tc_ops_other, for which there are no
143 * appropriate values. */
144 const char *linux_name;
146 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
147 const char *ovs_name;
149 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
150 * queues. The queues are numbered 0 through n_queues - 1. */
151 unsigned int n_queues;
153 /* Called to install this TC class on 'netdev'. The implementation should
154 * make the Netlink calls required to set up 'netdev' with the right qdisc
155 * and configure it according to 'details'. The implementation may assume
156 * that the current qdisc is the default; that is, there is no need for it
157 * to delete the current qdisc before installing itself.
159 * The contents of 'details' should be documented as valid for 'ovs_name'
160 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
161 * (which is built as ovs-vswitchd.conf.db(8)).
163 * This function must return 0 if and only if it sets 'netdev->tc' to an
164 * initialized 'struct tc'.
166 * (This function is null for tc_ops_other, which cannot be installed. For
167 * other TC classes it should always be nonnull.) */
168 int (*tc_install)(struct netdev *netdev, const struct shash *details);
170 /* Called when the netdev code determines (through a Netlink query) that
171 * this TC class's qdisc is installed on 'netdev', but we didn't install
172 * it ourselves and so don't know any of the details.
174 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
175 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
176 * implementation should parse the other attributes of 'nlmsg' as
177 * necessary to determine its configuration. If necessary it should also
178 * use Netlink queries to determine the configuration of queues on 'netdev'.
181 * This function must return 0 if and only if it sets 'netdev->tc' to an
182 * initialized 'struct tc'. */
183 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
185 /* Destroys the data structures allocated by the implementation as part of
186 * 'tc'. (This includes destroying 'tc->queues' by calling tc_destroy(tc).)
189 * The implementation should not need to perform any Netlink calls. If
190 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
191 * (But it may not be desirable.)
193 * This function may be null if 'tc' is trivial. */
194 void (*tc_destroy)(struct tc *tc);
196 /* Retrieves details of 'netdev->tc' configuration into 'details'.
198 * The implementation should not need to perform any Netlink calls, because
199 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
200 * cached the configuration.
202 * The contents of 'details' should be documented as valid for 'ovs_name'
203 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
204 * (which is built as ovs-vswitchd.conf.db(8)).
206 * This function may be null if 'tc' is not configurable. */
208 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
210 /* Reconfigures 'netdev->tc' according to 'details', performing any
211 * required Netlink calls to complete the reconfiguration.
213 * The contents of 'details' should be documented as valid for 'ovs_name'
214 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
215 * (which is built as ovs-vswitchd.conf.db(8)).
217 * This function may be null if 'tc' is not configurable. */
219 int (*qdisc_set)(struct netdev *, const struct shash *details);
221 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
222 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
224 * The contents of 'details' should be documented as valid for 'ovs_name'
225 * in the "other_config" column in the "Queue" table in
226 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
228 * The implementation should not need to perform any Netlink calls, because
229 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
230 * cached the queue configuration.
232 * This function may be null if 'tc' does not have queues ('n_queues' is 0). */
234 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
235 struct shash *details);
237 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
238 * 'details', performing any required Netlink calls to complete the
239 * reconfiguration. The caller ensures that 'queue_id' is less than 'n_queues'.
242 * The contents of 'details' should be documented as valid for 'ovs_name'
243 * in the "other_config" column in the "Queue" table in
244 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
246 * This function may be null if 'tc' does not have queues or its queues are
247 * not configurable. */
248 int (*class_set)(struct netdev *, unsigned int queue_id,
249 const struct shash *details);
251 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
252 * tc_queue's within 'netdev->tc->queues'.
254 * This function may be null if 'tc' does not have queues or its queues
255 * cannot be deleted. */
256 int (*class_delete)(struct netdev *, struct tc_queue *queue);
258 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
259 * 'struct tc_queue's within 'netdev->tc->queues'.
261 * On success, initializes '*stats'.
263 * This function may be null if 'tc' does not have queues or if it cannot
264 * report queue statistics. */
265 int (*class_get_stats)(const struct netdev *netdev,
266 const struct tc_queue *queue,
267 struct netdev_queue_stats *stats);
269 /* Extracts queue stats from 'nlmsg', which is a response to a
270 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
272 * This function may be null if 'tc' does not have queues or if it cannot
273 * report queue statistics. */
274 int (*class_dump_stats)(const struct netdev *netdev,
275 const struct ofpbuf *nlmsg,
276 netdev_dump_queue_stats_cb *cb, void *aux);
/* Initializes the generic part of 'tc' for use with implementation 'ops',
 * starting with an empty 'queues' map.
 * NOTE(review): the statement that records 'ops' is not visible in this
 * listing -- confirm against the full source. */
280 tc_init(struct tc *tc, const struct tc_ops *ops)
283 hmap_init(&tc->queues);
/* Releases the generic part of 'tc' by destroying its 'queues' map.  Nothing
 * in the visible lines frees 'tc' itself. */
287 tc_destroy(struct tc *tc)
289 hmap_destroy(&tc->queues);
/* Declarations of the TC implementations, followed by the array that lists
 * them. */
292 static const struct tc_ops tc_ops_htb;
293 static const struct tc_ops tc_ops_hfsc;
294 static const struct tc_ops tc_ops_default;
295 static const struct tc_ops tc_ops_other;
297 static const struct tc_ops *tcs[] = {
298 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
299 &tc_ops_hfsc, /* Hierarchical fair service curve. */
300 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
301 &tc_ops_other, /* Some other qdisc. */
/* Prototypes for the traffic-control helpers.  Judging by the names only
 * (TODO confirm against the definitions), the groups are: handle packing,
 * tick/byte conversion, Netlink request plumbing, reply parsing and class
 * queries, qdisc management, and rate-table arithmetic. */
305 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
306 static unsigned int tc_get_major(unsigned int handle);
307 static unsigned int tc_get_minor(unsigned int handle);
309 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
310 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
311 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
313 static struct tcmsg *tc_make_request(const struct netdev *, int type,
314 unsigned int flags, struct ofpbuf *);
315 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
317 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
318 struct nlattr **options);
319 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
320 struct nlattr **options,
321 struct netdev_queue_stats *);
322 static int tc_query_class(const struct netdev *,
323 unsigned int handle, unsigned int parent,
324 struct ofpbuf **replyp);
325 static int tc_delete_class(const struct netdev *, unsigned int handle);
327 static int tc_del_qdisc(struct netdev *netdev);
328 static int tc_query_qdisc(const struct netdev *netdev);
330 static int tc_calc_cell_log(unsigned int mtu);
331 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
332 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
333 const struct tc_ratespec *rate);
334 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
/* Linux-specific state for one network device.  'netdev_dev' is the generic
 * part; netdev_dev_linux_cast() recovers this structure from it. */
336 struct netdev_dev_linux {
337 struct netdev_dev netdev_dev;
339 struct shash_node *shash_node;
340 unsigned int cache_valid; /* Bitmap of VALID_* bits. */
342 /* The following are figured out "on demand" only. They are only valid
343 * when the corresponding VALID_* bit in 'cache_valid' is set. */
345 uint8_t etheraddr[ETH_ADDR_LEN];
346 struct in_addr address, netmask;
350 bool is_internal; /* Is this an openvswitch internal device? */
351 bool is_tap; /* Is this a tuntap device? */
352 uint32_t kbits_rate; /* Policing data. */
353 uint32_t kbits_burst;
354 bool have_vport_stats; /* Did netdev_vport_get_stats() last succeed? */
358 struct tap_state tap; /* Set up by netdev_linux_create_tap(). */
/* One open handle on a netdev_dev_linux, allocated by netdev_linux_open().
 * 'netdev' is the generic part; netdev_linux_cast() recovers this structure
 * from it. */
362 struct netdev_linux {
363 struct netdev netdev;
367 /* An AF_INET socket (used for ioctl operations). */
368 static int af_inet_sock = -1;
370 /* A Netlink routing socket that is not subscribed to any multicast groups. */
371 static struct nl_sock *rtnl_sock;
/* Linux-specific wrapper around a generic netdev_notifier. */
373 struct netdev_linux_notifier {
374 struct netdev_notifier notifier;
/* NOTE(review): presumably indexes the registered notifiers by device name,
 * with 'netdev_linux_poll_notifier' as the rtnetlink hook that feeds them --
 * the code using these is not visible here, confirm. */
378 static struct shash netdev_linux_notifiers =
379 SHASH_INITIALIZER(&netdev_linux_notifiers);
380 static struct rtnetlink_notifier netdev_linux_poll_notifier;
382 /* This is set pretty low because we probably won't learn anything from the
383 * additional log messages. */
384 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
/* Prototypes for internal helpers (initialization, ioctl/ethtool wrappers,
 * flag, ifindex and address accessors, and statistics readers). */
386 static int netdev_linux_init(void);
388 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
389 int cmd, const char *cmd_name);
390 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
391 const char *cmd_name);
392 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
393 int cmd, const char *cmd_name);
394 static int get_flags(const struct netdev *, int *flagsp);
395 static int set_flags(struct netdev *, int flags);
396 static int do_get_ifindex(const char *netdev_name);
397 static int get_ifindex(const struct netdev *, int *ifindexp);
398 static int do_set_addr(struct netdev *netdev,
399 int ioctl_nr, const char *ioctl_name,
400 struct in_addr addr);
401 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
402 static int set_etheraddr(const char *netdev_name, int hwaddr_family,
403 const uint8_t[ETH_ADDR_LEN]);
404 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
405 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
/* Returns true if 'netdev_class' uses netdev_linux_init() as its 'init'
 * function.  The cast helpers below rely on this test to recognize the
 * classes handled by this file. */
408 is_netdev_linux_class(const struct netdev_class *netdev_class)
410 return netdev_class->init == netdev_linux_init;
/* Converts generic 'netdev_dev' into the netdev_dev_linux that contains it,
 * after asserting that its class passes is_netdev_linux_class(). */
413 static struct netdev_dev_linux *
414 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
416 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
417 assert(is_netdev_linux_class(netdev_class));
419 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
/* Converts generic 'netdev' into the netdev_linux that contains it, after
 * asserting that the class of its device passes is_netdev_linux_class(). */
422 static struct netdev_linux *
423 netdev_linux_cast(const struct netdev *netdev)
425 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
426 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
427 assert(is_netdev_linux_class(netdev_class));
429 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
/* Creates the two sockets shared by every netdev in this file: the AF_INET
 * socket used for ioctls and the rtnetlink socket.  'status' is static, so
 * the outcome survives across calls.
 * NOTE(review): the guard that skips repeated initialization and the return
 * statement are not visible in this listing -- confirm. */
433 netdev_linux_init(void)
435 static int status = -1;
437 /* Create AF_INET socket. */
438 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
439 status = af_inet_sock >= 0 ? 0 : errno;
441 VLOG_ERR("failed to create inet socket: %s", strerror(status));
444 /* Create rtnetlink socket. */
446 status = nl_sock_create(NETLINK_ROUTE, 0, 0, 0, &rtnl_sock);
448 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
/* Periodic hook: hands control to rtnetlink_notifier_run().
 * NOTE(review): presumably that dispatches pending link-change
 * notifications to the registered callbacks -- confirm in rtnetlink.h. */
457 netdev_linux_run(void)
459 rtnetlink_notifier_run();
/* Poll-loop hook paired with netdev_linux_run(): delegates to
 * rtnetlink_notifier_wait(). */
463 netdev_linux_wait(void)
465 rtnetlink_notifier_wait();
/* rtnetlink callback that invalidates cached device state.
 *
 * One branch looks up the device named by 'change->ifname' and, if it belongs
 * to a class handled by this file, clears only that device's 'cache_valid'.
 * The other branch clears 'cache_valid' on every device of
 * netdev_linux_class.
 * NOTE(review): the condition choosing between the branches is not visible
 * in this listing -- confirm. */
469 netdev_linux_cache_cb(const struct rtnetlink_change *change,
470 void *aux OVS_UNUSED)
472 struct netdev_dev_linux *dev;
474 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
476 const struct netdev_class *netdev_class =
477 netdev_dev_get_class(base_dev);
479 if (is_netdev_linux_class(netdev_class)) {
480 dev = netdev_dev_linux_cast(base_dev);
481 dev->cache_valid = 0;
485 struct shash device_shash;
486 struct shash_node *node;
488 shash_init(&device_shash);
489 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
490 SHASH_FOR_EACH (node, &device_shash) {
492 dev->cache_valid = 0;
494 shash_destroy(&device_shash);
498 /* Creates the netdev device of 'type' with 'name'. */
/* 'args' is expected to be empty; a warning is logged otherwise.  The first
 * "system" device registers the shared cache notifier, and every device
 * created here takes a reference on it (dropped in netdev_linux_destroy()).
 * The new device is stored in '*netdev_devp'. */
500 netdev_linux_create_system(const struct netdev_class *class OVS_UNUSED,
501 const char *name, const struct shash *args,
502 struct netdev_dev **netdev_devp)
504 struct netdev_dev_linux *netdev_dev;
507 if (!shash_is_empty(args)) {
508 VLOG_WARN("%s: arguments for system devices should be empty", name);
511 if (!cache_notifier_refcount) {
512 error = rtnetlink_notifier_register(&netdev_linux_cache_notifier,
513 netdev_linux_cache_cb, NULL);
518 cache_notifier_refcount++;
520 netdev_dev = xzalloc(sizeof *netdev_dev);
521 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_linux_class);
523 *netdev_devp = &netdev_dev->netdev_dev;
527 /* For most types of netdevs we open the device for each call of
528 * netdev_open(). However, this is not the case with tap devices,
529 * since it is only possible to open the device once. In this
530 * situation we share a single file descriptor, and consequently
531 * buffers, across all readers. Therefore once data is read it will
532 * be unavailable to other reads for tap devices. */
/* Creates the tap device 'name': opens /dev/net/tun, issues TUNSETIFF with
 * IFF_TAP | IFF_NO_PI, makes the descriptor non-blocking, and initializes the
 * new device with netdev_tap_class before storing it in '*netdev_devp'.
 * 'args' is expected to be empty; a warning is logged otherwise. */
534 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
535 const char *name, const struct shash *args,
536 struct netdev_dev **netdev_devp)
538 struct netdev_dev_linux *netdev_dev;
539 struct tap_state *state;
540 static const char tap_dev[] = "/dev/net/tun";
544 if (!shash_is_empty(args)) {
545 VLOG_WARN("%s: arguments for TAP devices should be empty", name);
548 netdev_dev = xzalloc(sizeof *netdev_dev);
549 state = &netdev_dev->state.tap;
551 /* Open tap device. */
552 state->fd = open(tap_dev, O_RDWR);
555 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
559 /* Create tap device. */
560 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
/* strncpy() does not terminate the destination when 'name' fills it, so copy
 * at most size - 1 bytes and terminate explicitly. */
561 strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name - 1);
ifr.ifr_name[sizeof ifr.ifr_name - 1] = '\0';
562 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
563 VLOG_WARN("%s: creating tap device failed: %s", name,
569 /* Make non-blocking. */
570 error = set_nonblocking(state->fd);
575 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
576 *netdev_devp = &netdev_dev->netdev_dev;
/* Releases the tap state of 'netdev_dev'.  Acts only when the tap descriptor
 * is valid (>= 0).
 * NOTE(review): the body of that branch is not visible in this listing;
 * presumably it closes 'state->fd' -- confirm. */
585 destroy_tap(struct netdev_dev_linux *netdev_dev)
587 struct tap_state *state = &netdev_dev->state.tap;
589 if (state->fd >= 0) {
594 /* Destroys the netdev device 'netdev_dev_'. */
/* If a TC instance is attached and its implementation provides 'tc_destroy',
 * that hook is called first.  A "system" device then drops its reference on
 * the shared cache notifier, unregistering it when the count reaches zero; a
 * "tap" device releases its tap state through destroy_tap(). */
596 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
598 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
599 const char *type = netdev_dev_get_type(netdev_dev_);
601 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
602 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
605 if (!strcmp(type, "system")) {
606 cache_notifier_refcount--;
608 if (!cache_notifier_refcount) {
609 rtnetlink_notifier_unregister(&netdev_linux_cache_notifier);
611 } else if (!strcmp(type, "tap")) {
612 destroy_tap(netdev_dev);
/* Opens a handle on 'netdev_dev_' and stores it in '*netdevp'.
 *
 * The first opener of a "tap" device receives the device's shared tap
 * descriptor.  Any other opener that passes an 'ethertype' other than
 * NETDEV_ETH_TYPE_NONE gets a non-blocking PF_PACKET socket that is bound to
 * the device's ifindex and then drained.
 * NOTE(review): the error-path labels and return statements are not visible
 * in this listing; the final netdev_uninit() call presumably belongs to the
 * failure path -- confirm. */
619 netdev_linux_open(struct netdev_dev *netdev_dev_, int ethertype,
620 struct netdev **netdevp)
622 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
623 struct netdev_linux *netdev;
624 enum netdev_flags flags;
627 /* Allocate network device. */
628 netdev = xzalloc(sizeof *netdev);
630 netdev_init(&netdev->netdev, netdev_dev_);
/* Querying the flags doubles as an existence check (see ENODEV below). */
632 error = netdev_get_flags(&netdev->netdev, &flags);
633 if (error == ENODEV) {
637 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
638 !netdev_dev->state.tap.opened) {
640 /* We assume that the first user of the tap device is the primary user
641 * and give them the tap FD. Subsequent users probably just expect
642 * this to be a system device so open it normally to avoid send/receive
643 * directions appearing to be reversed. */
644 netdev->fd = netdev_dev->state.tap.fd;
645 netdev_dev->state.tap.opened = true;
646 } else if (ethertype != NETDEV_ETH_TYPE_NONE) {
647 struct sockaddr_ll sll;
651 /* Create file descriptor. */
652 protocol = (ethertype == NETDEV_ETH_TYPE_ANY ? ETH_P_ALL
653 : ethertype == NETDEV_ETH_TYPE_802_2 ? ETH_P_802_2
655 netdev->fd = socket(PF_PACKET, SOCK_RAW, htons(protocol));
656 if (netdev->fd < 0) {
661 /* Set non-blocking mode. */
662 error = set_nonblocking(netdev->fd);
667 /* Get ethernet device index. */
668 error = get_ifindex(&netdev->netdev, &ifindex);
673 /* Bind to specific ethernet device. */
674 memset(&sll, 0, sizeof sll);
675 sll.sll_family = AF_PACKET;
676 sll.sll_ifindex = ifindex;
678 (struct sockaddr *) &sll, sizeof sll) < 0) {
680 VLOG_ERR("bind to %s failed: %s", netdev_dev_get_name(netdev_dev_),
685 /* Between the socket() and bind() calls above, the socket receives all
686 * packets of the requested type on all system interfaces. We do not
687 * want to receive that data, but there is no way to avoid it. So we
688 * must now drain out the receive queue. */
689 error = drain_rcvbuf(netdev->fd);
695 *netdevp = &netdev->netdev;
699 netdev_uninit(&netdev->netdev, true);
703 /* Closes and destroys 'netdev'. */
/* The descriptor is handled here only for non-"tap" devices; a tap handle
 * borrows the descriptor stored in the device's tap state, which
 * destroy_tap() deals with.  A descriptor is valid when it is >= 0 (0
 * included), matching the 'fd < 0' / 'fd >= 0' tests used by the other
 * operations in this file. */
705 netdev_linux_close(struct netdev *netdev_)
707 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
709 if (netdev->fd >= 0 && strcmp(netdev_get_type(netdev_), "tap")) {
715 /* Initializes 'svec' with a list of the names of all known network devices. */
/* The names come from if_nameindex(); each is added to 'svec' and the index
 * is released with if_freenameindex().  A warning is logged when the list
 * cannot be obtained. */
717 netdev_linux_enumerate(struct svec *svec)
719 struct if_nameindex *names;
721 names = if_nameindex();
725 for (i = 0; names[i].if_name != NULL; i++) {
726 svec_add(svec, names[i].if_name);
728 if_freenameindex(names);
731 VLOG_WARN("could not obtain list of network device names: %s",
/* Reads one packet from 'netdev_' into the 'size'-byte buffer at 'data'
 * using read() on the handle's descriptor.  A handle without a descriptor
 * (fd < 0) cannot receive.  A read interrupted by EINTR is not treated as a
 * failure; any error other than EINTR and EAGAIN is logged with rate
 * limiting.
 * NOTE(review): the loop header and the return statements are not visible in
 * this listing -- confirm the exact return values against the full source. */
738 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
740 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
742 if (netdev->fd < 0) {
743 /* Device was opened with NETDEV_ETH_TYPE_NONE. */
748 ssize_t retval = read(netdev->fd, data, size);
751 } else if (errno != EINTR) {
752 if (errno != EAGAIN) {
/* Arguments follow the format string: device name first, then the error
 * text (the same order netdev_linux_send() uses). */
753 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
754 netdev_get_name(netdev_), strerror(errno));
761 /* Registers with the poll loop to wake up from the next call to poll_block()
762 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
/* Does nothing for a handle that has no descriptor (fd < 0). */
764 netdev_linux_recv_wait(struct netdev *netdev_)
766 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
767 if (netdev->fd >= 0) {
768 poll_fd_wait(netdev->fd, POLLIN);
772 /* Discards all packets waiting to be received from 'netdev'. */
/* A handle without a descriptor has nothing to drain.  For a "tap" device the
 * transmit queue length is read with SIOCGIFTXQLEN and passed to drain_fd()
 * as the amount to discard; every other device is drained with
 * drain_rcvbuf(). */
774 netdev_linux_drain(struct netdev *netdev_)
776 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
777 if (netdev->fd < 0) {
779 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
781 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
782 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
786 drain_fd(netdev->fd, ifr.ifr_qlen);
789 return drain_rcvbuf(netdev->fd);
793 /* Sends the 'size' bytes at 'data' on 'netdev'. Returns 0 if successful, otherwise a positive
794 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
795 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
796 * the packet is too big or too small to transmit on the device.
798 * The caller retains ownership of 'data' in all cases.
800 * The kernel maintains a packet transmission queue, so the caller is not
801 * expected to do additional queuing of packets. */
803 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
805 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
807 /* XXX should support sending even if 'ethertype' was NETDEV_ETH_TYPE_NONE.
809 if (netdev->fd < 0) {
814 ssize_t retval = write(netdev->fd, data, size);
816 /* The Linux AF_PACKET implementation never blocks waiting for room
817 * for packets, instead returning ENOBUFS. Translate this into
818 * EAGAIN for the caller. */
819 if (errno == ENOBUFS) {
821 } else if (errno == EINTR) {
823 } else if (errno != EAGAIN) {
824 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
825 netdev_get_name(netdev_), strerror(errno));
828 } else if (retval != size) {
829 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
830 "%zu) on %s", retval, size, netdev_get_name(netdev_));
838 /* Registers with the poll loop to wake up from the next call to poll_block()
839 * when the packet transmission queue has sufficient room to transmit a packet
840 * with netdev_send().
842 * The kernel maintains a packet transmission queue, so the client is not
843 * expected to do additional queuing of packets. Thus, this function is
844 * unlikely to ever be used. It is included for completeness. */
/* Three cases: no descriptor (fd < 0), a non-"tap" device (wait for POLLOUT),
 * and a "tap" device (wake immediately). */
846 netdev_linux_send_wait(struct netdev *netdev_)
848 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
849 if (netdev->fd < 0) {
851 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
852 poll_fd_wait(netdev->fd, POLLOUT);
854 /* TAP device always accepts packets.*/
855 poll_immediate_wake();
859 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
860 * otherwise a positive errno value. */
/* The kernel call is skipped when the cache is valid and already holds 'mac'.
 * After set_etheraddr(), 'mac' is stored in the cache and VALID_ETHERADDR is
 * set.
 * NOTE(review): the check that limits the cache update to the success case
 * is not visible in this listing -- confirm. */
862 netdev_linux_set_etheraddr(struct netdev *netdev_,
863 const uint8_t mac[ETH_ADDR_LEN])
865 struct netdev_dev_linux *netdev_dev =
866 netdev_dev_linux_cast(netdev_get_dev(netdev_));
869 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
870 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
871 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
873 netdev_dev->cache_valid |= VALID_ETHERADDR;
874 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
882 /* Copies 'netdev''s MAC address into 'mac'. The address is fetched with
883 * get_etheraddr() when VALID_ETHERADDR is clear and served from the cache otherwise. */
885 netdev_linux_get_etheraddr(const struct netdev *netdev_,
886 uint8_t mac[ETH_ADDR_LEN])
888 struct netdev_dev_linux *netdev_dev =
889 netdev_dev_linux_cast(netdev_get_dev(netdev_));
890 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
891 int error = get_etheraddr(netdev_get_name(netdev_),
892 netdev_dev->etheraddr);
896 netdev_dev->cache_valid |= VALID_ETHERADDR;
898 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
902 /* Stores in '*mtup' the maximum size of transmitted (and received) packets on 'netdev',
903 * in bytes, not including the hardware header; thus, this is typically 1500
904 * bytes for Ethernet devices. */
/* The value is read with SIOCGIFMTU when VALID_MTU is clear and cached in
 * 'netdev_dev->mtu' for later calls. */
906 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
908 struct netdev_dev_linux *netdev_dev =
909 netdev_dev_linux_cast(netdev_get_dev(netdev_));
910 if (!(netdev_dev->cache_valid & VALID_MTU)) {
914 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
915 SIOCGIFMTU, "SIOCGIFMTU");
919 netdev_dev->mtu = ifr.ifr_mtu;
920 netdev_dev->cache_valid |= VALID_MTU;
922 *mtup = netdev_dev->mtu;
926 /* Returns the ifindex of 'netdev', if successful, as a positive number.
927 * On failure, returns a negative errno value. */
929 netdev_linux_get_ifindex(const struct netdev *netdev)
933 error = get_ifindex(netdev, &ifindex);
/* A nonzero 'error' from get_ifindex() is negated so that failures can be
 * told apart from a valid index. */
934 return error ? -error : ifindex;
/* Stores the carrier state of 'netdev_' in '*carrier'.
 *
 * When VALID_CARRIER is clear, the state is read from
 * /sys/class/net/<name>/carrier, whose first character must be '0' or '1',
 * and then cached.  Open failures, read failures other than EINVAL, an empty
 * file and an unexpected first character are logged with rate limiting.
 * NOTE(review): the return statements and cleanup are not visible in this
 * listing -- confirm the error values against the full source. */
938 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
940 struct netdev_dev_linux *netdev_dev =
941 netdev_dev_linux_cast(netdev_get_dev(netdev_));
946 if (!(netdev_dev->cache_valid & VALID_CARRIER)) {
950 fn = xasprintf("/sys/class/net/%s/carrier",
951 netdev_get_name(netdev_));
952 fd = open(fn, O_RDONLY);
955 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
959 retval = read(fd, line, sizeof line);
962 if (error == EINVAL) {
963 /* This is the normal return value when we try to check carrier
964 * if the network device is not up. */
966 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
969 } else if (retval == 0) {
971 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
975 if (line[0] != '0' && line[0] != '1') {
977 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)",
981 netdev_dev->carrier = line[0] != '0';
982 netdev_dev->cache_valid |= VALID_CARRIER;
984 *carrier = netdev_dev->carrier;
995 /* Check whether we can use RTM_GETLINK to get network device statistics.
996 * In pre-2.6.19 kernels, this was only available if wireless extensions were enabled. */
/* Probes the loopback device "lo": looks up its ifindex and tries
 * get_stats_via_netlink() on it, logging which statistics source will be
 * used.  netdev_linux_get_stats() treats a nonzero result as "use Netlink".
 * NOTE(review): the return statements are not visible in this listing --
 * confirm. */
999 check_for_working_netlink_stats(void)
1001 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1002 * preferable, so if that works, we'll use it. */
1003 int ifindex = do_get_ifindex("lo");
1005 VLOG_WARN("failed to get ifindex for lo, "
1006 "obtaining netdev stats from proc");
1009 struct netdev_stats stats;
1010 int error = get_stats_via_netlink(ifindex, &stats);
1012 VLOG_DBG("obtaining netdev stats via rtnetlink");
1015 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1016 "via proc (you are probably running a pre-2.6.19 "
1017 "kernel)", strerror(error));
1023 /* Brings the 'is_internal' and 'is_tap' members of 'netdev_dev' up-to-date. */
/* Does nothing when VALID_IS_PSEUDO is already set.  'is_tap' follows from
 * the device type being "tap".  For any other device, 'is_internal' is set
 * when the ETHTOOL_GDRVINFO query succeeds and reports the driver name
 * "openvswitch". */
1025 netdev_linux_update_is_pseudo(struct netdev_dev_linux *netdev_dev)
1027 if (!(netdev_dev->cache_valid & VALID_IS_PSEUDO)) {
1028 const char *name = netdev_dev_get_name(&netdev_dev->netdev_dev);
1029 const char *type = netdev_dev_get_type(&netdev_dev->netdev_dev);
1031 netdev_dev->is_tap = !strcmp(type, "tap");
1032 netdev_dev->is_internal = false;
1033 if (!netdev_dev->is_tap) {
1034 struct ethtool_drvinfo drvinfo;
1037 memset(&drvinfo, 0, sizeof drvinfo);
1038 error = netdev_linux_do_ethtool(name,
1039 (struct ethtool_cmd *)&drvinfo,
1041 "ETHTOOL_GDRVINFO");
1043 if (!error && !strcmp(drvinfo.driver, "openvswitch")) {
1044 netdev_dev->is_internal = true;
1048 netdev_dev->cache_valid |= VALID_IS_PSEUDO;
/* Used by netdev_linux_get_stats() to exchange an rx counter with its tx
 * counterpart.
 * NOTE(review): the body is not visible in this listing; presumably it swaps
 * '*a' and '*b' -- confirm, including the behavior when 'a' == 'b'. */
1053 swap_uint64(uint64_t *a, uint64_t *b)
1060 /* Retrieves current device stats for 'netdev'. */
/* Statistics are taken from the vport layer when it is known to work or has
 * not been tried yet.  Otherwise they come from rtnetlink, or from proc when
 * check_for_working_netlink_stats() reported that Netlink is unusable.  For
 * internal and tap devices read from the kernel, rx and tx counters are
 * swapped and the detailed error counters are zeroed. */
1062 netdev_linux_get_stats(const struct netdev *netdev_,
1063 struct netdev_stats *stats)
1065 struct netdev_dev_linux *netdev_dev =
1066 netdev_dev_linux_cast(netdev_get_dev(netdev_));
/* -1 means "not probed yet"; decided once, on first need. */
1067 static int use_netlink_stats = -1;
1070 if (netdev_dev->have_vport_stats ||
1071 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1073 error = netdev_vport_get_stats(netdev_, stats);
1074 netdev_dev->have_vport_stats = !error;
1075 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
1078 if (!netdev_dev->have_vport_stats) {
1079 if (use_netlink_stats < 0) {
1080 use_netlink_stats = check_for_working_netlink_stats();
1082 if (use_netlink_stats) {
1085 error = get_ifindex(netdev_, &ifindex);
1087 error = get_stats_via_netlink(ifindex, stats);
1090 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1094 /* If this port is an internal port then the transmit and receive stats
1095 * will appear to be swapped relative to the other ports since we are the
1096 * one sending the data, not a remote computer. For consistency, we swap
1097 * them back here. This does not apply if we are getting stats from the
1098 * vport layer because it always tracks stats from the perspective of the
1100 netdev_linux_update_is_pseudo(netdev_dev);
1101 if (!error && !netdev_dev->have_vport_stats &&
1102 (netdev_dev->is_internal || netdev_dev->is_tap)) {
1103 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1104 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1105 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1106 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1107 stats->rx_length_errors = 0;
1108 stats->rx_over_errors = 0;
1109 stats->rx_crc_errors = 0;
1110 stats->rx_frame_errors = 0;
1111 stats->rx_fifo_errors = 0;
1112 stats->rx_missed_errors = 0;
1113 stats->tx_aborted_errors = 0;
1114 stats->tx_carrier_errors = 0;
1115 stats->tx_fifo_errors = 0;
1116 stats->tx_heartbeat_errors = 0;
1117 stats->tx_window_errors = 0;
1123 /* Stores the features supported by 'netdev' into each of '*current',
1124 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1125 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
1126 * successful, otherwise a positive errno value. */
1128 netdev_linux_get_features(struct netdev *netdev,
1129 uint32_t *current, uint32_t *advertised,
1130 uint32_t *supported, uint32_t *peer)
1132 struct ethtool_cmd ecmd;
1135 memset(&ecmd, 0, sizeof ecmd);
1136 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1137 ETHTOOL_GSET, "ETHTOOL_GSET");
1142 /* Supported features. */
1144 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1145 *supported |= OFPPF_10MB_HD;
1147 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1148 *supported |= OFPPF_10MB_FD;
1150 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1151 *supported |= OFPPF_100MB_HD;
1153 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1154 *supported |= OFPPF_100MB_FD;
1156 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1157 *supported |= OFPPF_1GB_HD;
1159 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1160 *supported |= OFPPF_1GB_FD;
1162 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1163 *supported |= OFPPF_10GB_FD;
1165 if (ecmd.supported & SUPPORTED_TP) {
1166 *supported |= OFPPF_COPPER;
1168 if (ecmd.supported & SUPPORTED_FIBRE) {
1169 *supported |= OFPPF_FIBER;
1171 if (ecmd.supported & SUPPORTED_Autoneg) {
1172 *supported |= OFPPF_AUTONEG;
1174 if (ecmd.supported & SUPPORTED_Pause) {
1175 *supported |= OFPPF_PAUSE;
1177 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1178 *supported |= OFPPF_PAUSE_ASYM;
1181 /* Advertised features. */
1183 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1184 *advertised |= OFPPF_10MB_HD;
1186 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1187 *advertised |= OFPPF_10MB_FD;
1189 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1190 *advertised |= OFPPF_100MB_HD;
1192 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1193 *advertised |= OFPPF_100MB_FD;
1195 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1196 *advertised |= OFPPF_1GB_HD;
1198 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1199 *advertised |= OFPPF_1GB_FD;
1201 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1202 *advertised |= OFPPF_10GB_FD;
1204 if (ecmd.advertising & ADVERTISED_TP) {
1205 *advertised |= OFPPF_COPPER;
1207 if (ecmd.advertising & ADVERTISED_FIBRE) {
1208 *advertised |= OFPPF_FIBER;
1210 if (ecmd.advertising & ADVERTISED_Autoneg) {
1211 *advertised |= OFPPF_AUTONEG;
1213 if (ecmd.advertising & ADVERTISED_Pause) {
1214 *advertised |= OFPPF_PAUSE;
1216 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1217 *advertised |= OFPPF_PAUSE_ASYM;
1220 /* Current settings. */
1221 if (ecmd.speed == SPEED_10) {
1222 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1223 } else if (ecmd.speed == SPEED_100) {
1224 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1225 } else if (ecmd.speed == SPEED_1000) {
1226 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1227 } else if (ecmd.speed == SPEED_10000) {
1228 *current = OFPPF_10GB_FD;
1233 if (ecmd.port == PORT_TP) {
1234 *current |= OFPPF_COPPER;
1235 } else if (ecmd.port == PORT_FIBRE) {
1236 *current |= OFPPF_FIBER;
1240 *current |= OFPPF_AUTONEG;
1243 /* Peer advertisements. */
1244 *peer = 0; /* XXX */
1249 /* Set the features advertised by 'netdev' to 'advertise'. */
1251 netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1253 struct ethtool_cmd ecmd;
1256 memset(&ecmd, 0, sizeof ecmd);
1257 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1258 ETHTOOL_GSET, "ETHTOOL_GSET");
1263 ecmd.advertising = 0;
1264 if (advertise & OFPPF_10MB_HD) {
1265 ecmd.advertising |= ADVERTISED_10baseT_Half;
1267 if (advertise & OFPPF_10MB_FD) {
1268 ecmd.advertising |= ADVERTISED_10baseT_Full;
1270 if (advertise & OFPPF_100MB_HD) {
1271 ecmd.advertising |= ADVERTISED_100baseT_Half;
1273 if (advertise & OFPPF_100MB_FD) {
1274 ecmd.advertising |= ADVERTISED_100baseT_Full;
1276 if (advertise & OFPPF_1GB_HD) {
1277 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1279 if (advertise & OFPPF_1GB_FD) {
1280 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1282 if (advertise & OFPPF_10GB_FD) {
1283 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1285 if (advertise & OFPPF_COPPER) {
1286 ecmd.advertising |= ADVERTISED_TP;
1288 if (advertise & OFPPF_FIBER) {
1289 ecmd.advertising |= ADVERTISED_FIBRE;
1291 if (advertise & OFPPF_AUTONEG) {
1292 ecmd.advertising |= ADVERTISED_Autoneg;
1294 if (advertise & OFPPF_PAUSE) {
1295 ecmd.advertising |= ADVERTISED_Pause;
1297 if (advertise & OFPPF_PAUSE_ASYM) {
1298 ecmd.advertising |= ADVERTISED_Asym_Pause;
1300 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1301 ETHTOOL_SSET, "ETHTOOL_SSET");
1304 /* If 'netdev_name' is the name of a VLAN network device (e.g. one created with
1305 * vconfig(8)), sets '*vlan_vid' to the VLAN VID associated with that device
1306 * and returns 0. Otherwise returns a errno value (specifically ENOENT if
1307 * 'netdev_name' is the name of a network device that is not a VLAN device) and
1308 * sets '*vlan_vid' to -1. */
1310 netdev_linux_get_vlan_vid(const struct netdev *netdev, int *vlan_vid)
1312 const char *netdev_name = netdev_get_name(netdev);
1313 struct ds line = DS_EMPTY_INITIALIZER;
1314 FILE *stream = NULL;
1318 COVERAGE_INC(netdev_get_vlan_vid);
1319 fn = xasprintf("/proc/net/vlan/%s", netdev_name);
1320 stream = fopen(fn, "r");
1326 if (ds_get_line(&line, stream)) {
1327 if (ferror(stream)) {
1329 VLOG_ERR_RL(&rl, "error reading \"%s\": %s", fn, strerror(errno));
1332 VLOG_ERR_RL(&rl, "unexpected end of file reading \"%s\"", fn);
1337 if (!sscanf(ds_cstr(&line), "%*s VID: %d", vlan_vid)) {
1339 VLOG_ERR_RL(&rl, "parse error reading \"%s\" line 1: \"%s\"",
1340 fn, ds_cstr(&line));
/* Shell command templates for ingress policing.
 *
 * POLICE_ADD_CMD takes the device name.  POLICE_CONFIG_CMD takes the device
 * name, then the rate in kbit/s and the burst size, both unsigned (hence
 * "%u": the values passed are uint32_t and must not be printed as signed). */
#define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress"
#define POLICE_CONFIG_CMD "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 u32 match ip src 0.0.0.0/0 police rate %ukbit burst %uk mtu 65535 drop flowid :1"
1361 /* Remove ingress policing from 'netdev'. Returns 0 if successful, otherwise a
1362 * positive errno value.
1364 * This function is equivalent to running
1365 * /sbin/tc qdisc del dev %s handle ffff: ingress
1366 * but it is much, much faster.
1369 netdev_linux_remove_policing(struct netdev *netdev)
1371 struct netdev_dev_linux *netdev_dev =
1372 netdev_dev_linux_cast(netdev_get_dev(netdev));
1373 const char *netdev_name = netdev_get_name(netdev);
1375 struct ofpbuf request;
1376 struct tcmsg *tcmsg;
1379 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
1383 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
1384 tcmsg->tcm_parent = TC_H_INGRESS;
1385 nl_msg_put_string(&request, TCA_KIND, "ingress");
1386 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
1388 error = tc_transact(&request, NULL);
1389 if (error && error != ENOENT && error != EINVAL) {
1390 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1391 netdev_name, strerror(error));
1395 netdev_dev->kbits_rate = 0;
1396 netdev_dev->kbits_burst = 0;
1397 netdev_dev->cache_valid |= VALID_POLICING;
1401 /* Attempts to set input rate limiting (policing) policy. */
1403 netdev_linux_set_policing(struct netdev *netdev,
1404 uint32_t kbits_rate, uint32_t kbits_burst)
1406 struct netdev_dev_linux *netdev_dev =
1407 netdev_dev_linux_cast(netdev_get_dev(netdev));
1408 const char *netdev_name = netdev_get_name(netdev);
1411 COVERAGE_INC(netdev_set_policing);
1413 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1414 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1415 : kbits_burst); /* Stick with user-specified value. */
1417 if (netdev_dev->cache_valid & VALID_POLICING
1418 && netdev_dev->kbits_rate == kbits_rate
1419 && netdev_dev->kbits_burst == kbits_burst) {
1420 /* Assume that settings haven't changed since we last set them. */
1424 netdev_linux_remove_policing(netdev);
1426 snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
1427 if (system(command) != 0) {
1428 VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
1432 snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
1433 kbits_rate, kbits_burst);
1434 if (system(command) != 0) {
1435 VLOG_WARN_RL(&rl, "%s: problem configuring policing",
1440 netdev_dev->kbits_rate = kbits_rate;
1441 netdev_dev->kbits_burst = kbits_burst;
1442 netdev_dev->cache_valid |= VALID_POLICING;
1449 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1452 const struct tc_ops **opsp;
1454 for (opsp = tcs; *opsp != NULL; opsp++) {
1455 const struct tc_ops *ops = *opsp;
1456 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1457 svec_add(types, ops->ovs_name);
1463 static const struct tc_ops *
1464 tc_lookup_ovs_name(const char *name)
1466 const struct tc_ops **opsp;
1468 for (opsp = tcs; *opsp != NULL; opsp++) {
1469 const struct tc_ops *ops = *opsp;
1470 if (!strcmp(name, ops->ovs_name)) {
1477 static const struct tc_ops *
1478 tc_lookup_linux_name(const char *name)
1480 const struct tc_ops **opsp;
1482 for (opsp = tcs; *opsp != NULL; opsp++) {
1483 const struct tc_ops *ops = *opsp;
1484 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1491 static struct tc_queue *
1492 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1495 struct netdev_dev_linux *netdev_dev =
1496 netdev_dev_linux_cast(netdev_get_dev(netdev));
1497 struct tc_queue *queue;
1499 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1500 if (queue->queue_id == queue_id) {
/* Returns the queue numbered 'queue_id' on 'netdev', or NULL if there is
 * none.  Computes the hash that tc_find_queue__() needs. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    size_t hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, hash);
}
1514 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1516 struct netdev_qos_capabilities *caps)
1518 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1522 caps->n_queues = ops->n_queues;
1527 netdev_linux_get_qos(const struct netdev *netdev,
1528 const char **typep, struct shash *details)
1530 struct netdev_dev_linux *netdev_dev =
1531 netdev_dev_linux_cast(netdev_get_dev(netdev));
1534 error = tc_query_qdisc(netdev);
1539 *typep = netdev_dev->tc->ops->ovs_name;
1540 return (netdev_dev->tc->ops->qdisc_get
1541 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
1546 netdev_linux_set_qos(struct netdev *netdev,
1547 const char *type, const struct shash *details)
1549 struct netdev_dev_linux *netdev_dev =
1550 netdev_dev_linux_cast(netdev_get_dev(netdev));
1551 const struct tc_ops *new_ops;
1554 new_ops = tc_lookup_ovs_name(type);
1555 if (!new_ops || !new_ops->tc_install) {
1559 error = tc_query_qdisc(netdev);
1564 if (new_ops == netdev_dev->tc->ops) {
1565 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1567 /* Delete existing qdisc. */
1568 error = tc_del_qdisc(netdev);
1572 assert(netdev_dev->tc == NULL);
1574 /* Install new qdisc. */
1575 error = new_ops->tc_install(netdev, details);
1576 assert((error == 0) == (netdev_dev->tc != NULL));
1583 netdev_linux_get_queue(const struct netdev *netdev,
1584 unsigned int queue_id, struct shash *details)
1586 struct netdev_dev_linux *netdev_dev =
1587 netdev_dev_linux_cast(netdev_get_dev(netdev));
1590 error = tc_query_qdisc(netdev);
1594 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1596 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
1602 netdev_linux_set_queue(struct netdev *netdev,
1603 unsigned int queue_id, const struct shash *details)
1605 struct netdev_dev_linux *netdev_dev =
1606 netdev_dev_linux_cast(netdev_get_dev(netdev));
1609 error = tc_query_qdisc(netdev);
1612 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1613 || !netdev_dev->tc->ops->class_set) {
1617 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
1621 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1623 struct netdev_dev_linux *netdev_dev =
1624 netdev_dev_linux_cast(netdev_get_dev(netdev));
1627 error = tc_query_qdisc(netdev);
1630 } else if (!netdev_dev->tc->ops->class_delete) {
1633 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1635 ? netdev_dev->tc->ops->class_delete(netdev, queue)
1641 netdev_linux_get_queue_stats(const struct netdev *netdev,
1642 unsigned int queue_id,
1643 struct netdev_queue_stats *stats)
1645 struct netdev_dev_linux *netdev_dev =
1646 netdev_dev_linux_cast(netdev_get_dev(netdev));
1649 error = tc_query_qdisc(netdev);
1652 } else if (!netdev_dev->tc->ops->class_get_stats) {
1655 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1657 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
1663 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1665 struct ofpbuf request;
1666 struct tcmsg *tcmsg;
1668 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1672 tcmsg->tcm_parent = 0;
1673 nl_dump_start(dump, rtnl_sock, &request);
1674 ofpbuf_uninit(&request);
1679 netdev_linux_dump_queues(const struct netdev *netdev,
1680 netdev_dump_queues_cb *cb, void *aux)
1682 struct netdev_dev_linux *netdev_dev =
1683 netdev_dev_linux_cast(netdev_get_dev(netdev));
1684 struct tc_queue *queue;
1685 struct shash details;
1689 error = tc_query_qdisc(netdev);
1692 } else if (!netdev_dev->tc->ops->class_get) {
1697 shash_init(&details);
1698 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
1699 shash_clear(&details);
1701 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
1703 (*cb)(queue->queue_id, &details, aux);
1708 shash_destroy(&details);
1714 netdev_linux_dump_queue_stats(const struct netdev *netdev,
1715 netdev_dump_queue_stats_cb *cb, void *aux)
1717 struct netdev_dev_linux *netdev_dev =
1718 netdev_dev_linux_cast(netdev_get_dev(netdev));
1719 struct nl_dump dump;
1724 error = tc_query_qdisc(netdev);
1727 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1732 if (!start_queue_dump(netdev, &dump)) {
1735 while (nl_dump_next(&dump, &msg)) {
1736 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1742 error = nl_dump_done(&dump);
1743 return error ? error : last_error;
1747 netdev_linux_get_in4(const struct netdev *netdev_,
1748 struct in_addr *address, struct in_addr *netmask)
1750 struct netdev_dev_linux *netdev_dev =
1751 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1753 if (!(netdev_dev->cache_valid & VALID_IN4)) {
1756 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
1757 SIOCGIFADDR, "SIOCGIFADDR");
1762 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
1763 SIOCGIFNETMASK, "SIOCGIFNETMASK");
1768 netdev_dev->cache_valid |= VALID_IN4;
1770 *address = netdev_dev->address;
1771 *netmask = netdev_dev->netmask;
1772 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
1776 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
1777 struct in_addr netmask)
1779 struct netdev_dev_linux *netdev_dev =
1780 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1783 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
1785 netdev_dev->cache_valid |= VALID_IN4;
1786 netdev_dev->address = address;
1787 netdev_dev->netmask = netmask;
1788 if (address.s_addr != INADDR_ANY) {
1789 error = do_set_addr(netdev_, SIOCSIFNETMASK,
1790 "SIOCSIFNETMASK", netmask);
/* Parses 'line', one line in the format of /proc/net/if_inet6: 32 hex digits
 * of address, four hex fields that are skipped, and an interface name.
 * Stores the address in '*in6' and the name in 'ifname'.  Returns true only
 * if all 16 address bytes and the name were parsed. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *b = in6->s6_addr;
    int n_parsed;

#define X8 "%2"SCNx8
    n_parsed = sscanf(line,
                      " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                      "%*x %*x %*x %*x %16s\n",
                      b, b + 1, b + 2, b + 3,
                      b + 4, b + 5, b + 6, b + 7,
                      b + 8, b + 9, b + 10, b + 11,
                      b + 12, b + 13, b + 14, b + 15,
                      ifname);

    /* 16 address bytes plus the interface name. */
    return n_parsed == 17;
}
1812 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
1813 * 'in6' is non-null) and returns true. Otherwise, returns false. */
1815 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
1817 struct netdev_dev_linux *netdev_dev =
1818 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1819 if (!(netdev_dev->cache_valid & VALID_IN6)) {
1823 netdev_dev->in6 = in6addr_any;
1825 file = fopen("/proc/net/if_inet6", "r");
1827 const char *name = netdev_get_name(netdev_);
1828 while (fgets(line, sizeof line, file)) {
1829 struct in6_addr in6_tmp;
1830 char ifname[16 + 1];
1831 if (parse_if_inet6_line(line, &in6_tmp, ifname)
1832 && !strcmp(name, ifname))
1834 netdev_dev->in6 = in6_tmp;
1840 netdev_dev->cache_valid |= VALID_IN6;
1842 *in6 = netdev_dev->in6;
/* Fills '*sa' with an AF_INET socket address for 'addr' with port 0.  Every
 * byte of '*sa' not occupied by those fields is zeroed. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    memset(sa, 0, sizeof *sa);

    memset(&in4, 0, sizeof in4);
    in4.sin_family = AF_INET;
    in4.sin_port = 0;
    in4.sin_addr = addr;

    memcpy(sa, &in4, sizeof in4);
}
1860 do_set_addr(struct netdev *netdev,
1861 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
1864 strncpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
1865 make_in4_sockaddr(&ifr.ifr_addr, addr);
1867 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
1871 /* Adds 'router' as a default IP gateway. */
1873 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
1875 struct in_addr any = { INADDR_ANY };
1879 memset(&rt, 0, sizeof rt);
1880 make_in4_sockaddr(&rt.rt_dst, any);
1881 make_in4_sockaddr(&rt.rt_gateway, router);
1882 make_in4_sockaddr(&rt.rt_genmask, any);
1883 rt.rt_flags = RTF_UP | RTF_GATEWAY;
1884 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
1886 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
1892 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
1895 static const char fn[] = "/proc/net/route";
1900 *netdev_name = NULL;
1901 stream = fopen(fn, "r");
1902 if (stream == NULL) {
1903 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
1908 while (fgets(line, sizeof line, stream)) {
1911 uint32_t dest, gateway, mask;
1912 int refcnt, metric, mtu;
1913 unsigned int flags, use, window, irtt;
1916 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
1918 iface, &dest, &gateway, &flags, &refcnt,
1919 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
1921 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
1925 if (!(flags & RTF_UP)) {
1926 /* Skip routes that aren't up. */
1930 /* The output of 'dest', 'mask', and 'gateway' were given in
1931 * network byte order, so we don't need need any endian
1932 * conversions here. */
1933 if ((dest & mask) == (host->s_addr & mask)) {
1935 /* The host is directly reachable. */
1936 next_hop->s_addr = 0;
1938 /* To reach the host, we must go through a gateway. */
1939 next_hop->s_addr = gateway;
1941 *netdev_name = xstrdup(iface);
1952 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
1953 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
1954 * returns 0. Otherwise, it returns a positive errno value; in particular,
1955 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
1957 netdev_linux_arp_lookup(const struct netdev *netdev,
1958 uint32_t ip, uint8_t mac[ETH_ADDR_LEN])
1961 struct sockaddr_in sin;
1964 memset(&r, 0, sizeof r);
1965 sin.sin_family = AF_INET;
1966 sin.sin_addr.s_addr = ip;
1968 memcpy(&r.arp_pa, &sin, sizeof sin);
1969 r.arp_ha.sa_family = ARPHRD_ETHER;
1971 strncpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
1972 COVERAGE_INC(netdev_arp_lookup);
1973 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
1975 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
1976 } else if (retval != ENXIO) {
1977 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
1978 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
1984 nd_to_iff_flags(enum netdev_flags nd)
1987 if (nd & NETDEV_UP) {
1990 if (nd & NETDEV_PROMISC) {
1997 iff_to_nd_flags(int iff)
1999 enum netdev_flags nd = 0;
2003 if (iff & IFF_PROMISC) {
2004 nd |= NETDEV_PROMISC;
2010 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2011 enum netdev_flags on, enum netdev_flags *old_flagsp)
2013 int old_flags, new_flags;
2016 error = get_flags(netdev, &old_flags);
2018 *old_flagsp = iff_to_nd_flags(old_flags);
2019 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2020 if (new_flags != old_flags) {
2021 error = set_flags(netdev, new_flags);
2028 poll_notify(struct list *list)
2030 struct netdev_linux_notifier *notifier;
2031 LIST_FOR_EACH (notifier, node, list) {
2032 struct netdev_notifier *n = ¬ifier->notifier;
2038 netdev_linux_poll_cb(const struct rtnetlink_change *change,
2039 void *aux OVS_UNUSED)
2042 struct list *list = shash_find_data(&netdev_linux_notifiers,
2048 struct shash_node *node;
2049 SHASH_FOR_EACH (node, &netdev_linux_notifiers) {
2050 poll_notify(node->data);
2056 netdev_linux_poll_add(struct netdev *netdev,
2057 void (*cb)(struct netdev_notifier *), void *aux,
2058 struct netdev_notifier **notifierp)
2060 const char *netdev_name = netdev_get_name(netdev);
2061 struct netdev_linux_notifier *notifier;
2064 if (shash_is_empty(&netdev_linux_notifiers)) {
2065 int error = rtnetlink_notifier_register(&netdev_linux_poll_notifier,
2066 netdev_linux_poll_cb, NULL);
2072 list = shash_find_data(&netdev_linux_notifiers, netdev_name);
2074 list = xmalloc(sizeof *list);
2076 shash_add(&netdev_linux_notifiers, netdev_name, list);
2079 notifier = xmalloc(sizeof *notifier);
2080 netdev_notifier_init(¬ifier->notifier, netdev, cb, aux);
2081 list_push_back(list, ¬ifier->node);
2082 *notifierp = ¬ifier->notifier;
2087 netdev_linux_poll_remove(struct netdev_notifier *notifier_)
2089 struct netdev_linux_notifier *notifier =
2090 CONTAINER_OF(notifier_, struct netdev_linux_notifier, notifier);
2093 /* Remove 'notifier' from its list. */
2094 list = list_remove(¬ifier->node);
2095 if (list_is_empty(list)) {
2096 /* The list is now empty. Remove it from the hash and free it. */
2097 const char *netdev_name = netdev_get_name(notifier->notifier.netdev);
2098 shash_delete(&netdev_linux_notifiers,
2099 shash_find(&netdev_linux_notifiers, netdev_name));
2104 /* If that was the last notifier, unregister. */
2105 if (shash_is_empty(&netdev_linux_notifiers)) {
2106 rtnetlink_notifier_unregister(&netdev_linux_poll_notifier);
2110 const struct netdev_class netdev_linux_class = {
2117 netdev_linux_create_system,
2118 netdev_linux_destroy,
2119 NULL, /* reconfigure */
2124 netdev_linux_enumerate,
2127 netdev_linux_recv_wait,
2131 netdev_linux_send_wait,
2133 netdev_linux_set_etheraddr,
2134 netdev_linux_get_etheraddr,
2135 netdev_linux_get_mtu,
2136 netdev_linux_get_ifindex,
2137 netdev_linux_get_carrier,
2138 netdev_linux_get_stats,
2139 netdev_vport_set_stats,
2141 netdev_linux_get_features,
2142 netdev_linux_set_advertisements,
2143 netdev_linux_get_vlan_vid,
2145 netdev_linux_set_policing,
2146 netdev_linux_get_qos_types,
2147 netdev_linux_get_qos_capabilities,
2148 netdev_linux_get_qos,
2149 netdev_linux_set_qos,
2150 netdev_linux_get_queue,
2151 netdev_linux_set_queue,
2152 netdev_linux_delete_queue,
2153 netdev_linux_get_queue_stats,
2154 netdev_linux_dump_queues,
2155 netdev_linux_dump_queue_stats,
2157 netdev_linux_get_in4,
2158 netdev_linux_set_in4,
2159 netdev_linux_get_in6,
2160 netdev_linux_add_router,
2161 netdev_linux_get_next_hop,
2162 netdev_linux_arp_lookup,
2164 netdev_linux_update_flags,
2166 netdev_linux_poll_add,
2167 netdev_linux_poll_remove,
2170 const struct netdev_class netdev_tap_class = {
2177 netdev_linux_create_tap,
2178 netdev_linux_destroy,
2179 NULL, /* reconfigure */
2184 NULL, /* enumerate */
2187 netdev_linux_recv_wait,
2191 netdev_linux_send_wait,
2193 netdev_linux_set_etheraddr,
2194 netdev_linux_get_etheraddr,
2195 netdev_linux_get_mtu,
2196 netdev_linux_get_ifindex,
2197 netdev_linux_get_carrier,
2198 netdev_linux_get_stats,
2199 NULL, /* set_stats */
2201 netdev_linux_get_features,
2202 netdev_linux_set_advertisements,
2203 netdev_linux_get_vlan_vid,
2205 netdev_linux_set_policing,
2206 netdev_linux_get_qos_types,
2207 netdev_linux_get_qos_capabilities,
2208 netdev_linux_get_qos,
2209 netdev_linux_set_qos,
2210 netdev_linux_get_queue,
2211 netdev_linux_set_queue,
2212 netdev_linux_delete_queue,
2213 netdev_linux_get_queue_stats,
2214 netdev_linux_dump_queues,
2215 netdev_linux_dump_queue_stats,
2217 netdev_linux_get_in4,
2218 netdev_linux_set_in4,
2219 netdev_linux_get_in6,
2220 netdev_linux_add_router,
2221 netdev_linux_get_next_hop,
2222 netdev_linux_arp_lookup,
2224 netdev_linux_update_flags,
2226 netdev_linux_poll_add,
2227 netdev_linux_poll_remove,
2230 /* HTB traffic control class. */
2232 #define HTB_N_QUEUES 0xf000
2236 unsigned int max_rate; /* In bytes/s. */
2240 struct tc_queue tc_queue;
2241 unsigned int min_rate; /* In bytes/s. */
2242 unsigned int max_rate; /* In bytes/s. */
2243 unsigned int burst; /* In bytes. */
2244 unsigned int priority; /* Lower values are higher priorities. */
2248 htb_get__(const struct netdev *netdev)
2250 struct netdev_dev_linux *netdev_dev =
2251 netdev_dev_linux_cast(netdev_get_dev(netdev));
2252 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
2256 htb_install__(struct netdev *netdev, uint64_t max_rate)
2258 struct netdev_dev_linux *netdev_dev =
2259 netdev_dev_linux_cast(netdev_get_dev(netdev));
2262 htb = xmalloc(sizeof *htb);
2263 tc_init(&htb->tc, &tc_ops_htb);
2264 htb->max_rate = max_rate;
2266 netdev_dev->tc = &htb->tc;
2271 /* Create an HTB qdisc.
2273 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2275 htb_setup_qdisc__(struct netdev *netdev)
2278 struct tc_htb_glob opt;
2279 struct ofpbuf request;
2280 struct tcmsg *tcmsg;
2282 tc_del_qdisc(netdev);
2284 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2285 NLM_F_EXCL | NLM_F_CREATE, &request);
2289 tcmsg->tcm_handle = tc_make_handle(1, 0);
2290 tcmsg->tcm_parent = TC_H_ROOT;
2292 nl_msg_put_string(&request, TCA_KIND, "htb");
2294 memset(&opt, 0, sizeof opt);
2295 opt.rate2quantum = 10;
2299 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2300 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2301 nl_msg_end_nested(&request, opt_offset);
2303 return tc_transact(&request, NULL);
2306 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2307 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2309 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2310 unsigned int parent, struct htb_class *class)
2313 struct tc_htb_opt opt;
2314 struct ofpbuf request;
2315 struct tcmsg *tcmsg;
2319 netdev_get_mtu(netdev, &mtu);
2321 memset(&opt, 0, sizeof opt);
2322 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2323 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2324 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2325 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2326 opt.prio = class->priority;
2328 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2332 tcmsg->tcm_handle = handle;
2333 tcmsg->tcm_parent = parent;
2335 nl_msg_put_string(&request, TCA_KIND, "htb");
2336 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2337 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2338 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2339 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2340 nl_msg_end_nested(&request, opt_offset);
2342 error = tc_transact(&request, NULL);
2344 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2345 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2346 netdev_get_name(netdev),
2347 tc_get_major(handle), tc_get_minor(handle),
2348 tc_get_major(parent), tc_get_minor(parent),
2349 class->min_rate, class->max_rate,
2350 class->burst, class->priority, strerror(error));
2355 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2356 * description of them into 'details'. The description complies with the
2357 * specification given in the vswitch database documentation for linux-htb
2360 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2362 static const struct nl_policy tca_htb_policy[] = {
2363 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2364 .min_len = sizeof(struct tc_htb_opt) },
2367 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2368 const struct tc_htb_opt *htb;
2370 if (!nl_parse_nested(nl_options, tca_htb_policy,
2371 attrs, ARRAY_SIZE(tca_htb_policy))) {
2372 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2376 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2377 class->min_rate = htb->rate.rate;
2378 class->max_rate = htb->ceil.rate;
2379 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2380 class->priority = htb->prio;
2385 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2386 struct htb_class *options,
2387 struct netdev_queue_stats *stats)
2389 struct nlattr *nl_options;
2390 unsigned int handle;
2393 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2394 if (!error && queue_id) {
2395 unsigned int major = tc_get_major(handle);
2396 unsigned int minor = tc_get_minor(handle);
2397 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2398 *queue_id = minor - 1;
2403 if (!error && options) {
2404 error = htb_parse_tca_options__(nl_options, options);
2410 htb_parse_qdisc_details__(struct netdev *netdev,
2411 const struct shash *details, struct htb_class *hc)
2413 const char *max_rate_s;
2415 max_rate_s = shash_find_data(details, "max-rate");
2416 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2417 if (!hc->max_rate) {
2420 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2421 hc->max_rate = netdev_features_to_bps(current) / 8;
2423 hc->min_rate = hc->max_rate;
2429 htb_parse_class_details__(struct netdev *netdev,
2430 const struct shash *details, struct htb_class *hc)
2432 const struct htb *htb = htb_get__(netdev);
2433 const char *min_rate_s = shash_find_data(details, "min-rate");
2434 const char *max_rate_s = shash_find_data(details, "max-rate");
2435 const char *burst_s = shash_find_data(details, "burst");
2436 const char *priority_s = shash_find_data(details, "priority");
2439 /* min-rate. Don't allow a min-rate below 1500 bytes/s. */
2441 /* min-rate is required. */
2444 hc->min_rate = strtoull(min_rate_s, NULL, 10) / 8;
2445 hc->min_rate = MAX(hc->min_rate, 1500);
2446 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2449 hc->max_rate = (max_rate_s
2450 ? strtoull(max_rate_s, NULL, 10) / 8
2452 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2453 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2457 * According to hints in the documentation that I've read, it is important
2458 * that 'burst' be at least as big as the largest frame that might be
2459 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2460 * but having it a bit too small is a problem. Since netdev_get_mtu()
2461 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2462 * the MTU. We actually add 64, instead of 14, as a guard against
2463 * additional headers get tacked on somewhere that we're not aware of. */
2464 netdev_get_mtu(netdev, &mtu);
2465 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2466 hc->burst = MAX(hc->burst, mtu + 64);
2469 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for class 'handle' under 'parent' on 'netdev' and
 * parses the reply into 'options' and 'stats' with htb_parse_tcmsg__().
 * Returns 0 if successful, otherwise the error from the query or from
 * parsing. */
static int
htb_query_class__(const struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error;

    error = tc_query_class(netdev, handle, parent, &reply);
    if (error) {
        return error;
    }

    error = htb_parse_tcmsg__(reply, NULL, options, stats);
    ofpbuf_delete(reply);
    return error;
}
2491 htb_tc_install(struct netdev *netdev, const struct shash *details)
2495 error = htb_setup_qdisc__(netdev);
2497 struct htb_class hc;
2499 htb_parse_qdisc_details__(netdev, details, &hc);
2500 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2501 tc_make_handle(1, 0), &hc);
2503 htb_install__(netdev, hc.max_rate);
2509 static struct htb_class *
2510 htb_class_cast__(const struct tc_queue *queue)
2512 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2516 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2517 const struct htb_class *hc)
2519 struct htb *htb = htb_get__(netdev);
2520 size_t hash = hash_int(queue_id, 0);
2521 struct tc_queue *queue;
2522 struct htb_class *hcp;
2524 queue = tc_find_queue__(netdev, queue_id, hash);
2526 hcp = htb_class_cast__(queue);
2528 hcp = xmalloc(sizeof *hcp);
2529 queue = &hcp->tc_queue;
2530 queue->queue_id = queue_id;
2531 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2534 hcp->min_rate = hc->min_rate;
2535 hcp->max_rate = hc->max_rate;
2536 hcp->burst = hc->burst;
2537 hcp->priority = hc->priority;
/* Builds the cached state for an HTB qdisc on 'netdev' from what the kernel
 * reports: class 1:fffe supplies the max rate handed to htb_install__(), and
 * every class in the queue dump that htb_parse_tcmsg__() accepts is cached
 * with htb_update_queue__().  'nlmsg' is unused. */
2541 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2544 struct nl_dump dump;
2545 struct htb_class hc;
2548 /* Get qdisc options. */
2550 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2551 htb = htb_install__(netdev, hc.max_rate);
2554 if (!start_queue_dump(netdev, &dump)) {
2557 while (nl_dump_next(&dump, &msg)) {
2558 unsigned int queue_id;
/* A zero result means the message parsed as an HTB class; cache it. */
2560 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2561 htb_update_queue__(netdev, queue_id, &hc);
2564 nl_dump_done(&dump);
/* Tears down the queue cache of the struct htb that embeds 'tc': every
 * htb_class is unlinked from 'htb->tc.queues'.  The _SAFE iteration macro is
 * used because entries are removed while the map is being walked. */
2570 htb_tc_destroy(struct tc *tc)
2572 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2573 struct htb_class *hc, *next;
2575 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2576 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Reports the HTB qdisc configuration of 'netdev' by adding a "max-rate"
 * entry to 'details'.  The value is a newly allocated decimal string holding
 * 8 * htb->max_rate (presumably a bytes/s to bits/s conversion -- confirm
 * units). */
2584 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2586 const struct htb *htb = htb_get__(netdev);
2587 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
/* Reconfigures the HTB qdisc on 'netdev' from 'details': parses them with
 * htb_parse_qdisc_details__(), re-applies class 1:fffe (parent 1:0) with the
 * result, and stores the new max_rate in the cached struct htb. */
2592 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2594 struct htb_class hc;
2597 htb_parse_qdisc_details__(netdev, details, &hc);
2598 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2599 tc_make_handle(1, 0), &hc);
2601 htb_get__(netdev)->max_rate = hc.max_rate;
/* Adds the cached settings of the HTB class behind 'queue' to 'details' as
 * newly allocated decimal strings.  Rates and burst are multiplied by 8
 * before formatting; "max-rate" is added only when it differs from
 * "min-rate".  'netdev' is unused. */
2607 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2608 const struct tc_queue *queue, struct shash *details)
2610 const struct htb_class *hc = htb_class_cast__(queue);
2612 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2613 if (hc->min_rate != hc->max_rate) {
2614 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2616 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2618 shash_add(details, "priority", xasprintf("%u", hc->priority));
/* Configures HTB queue 'queue_id' on 'netdev' from 'details': parses them
 * with htb_parse_class_details__(), applies the result to kernel class
 * 1:(queue_id + 1) under parent 1:fffe, and updates the cached class. */
2624 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2625 const struct shash *details)
2627 struct htb_class hc;
2630 error = htb_parse_class_details__(netdev, details, &hc);
2635 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2636 tc_make_handle(1, 0xfffe), &hc);
2641 htb_update_queue__(netdev, queue_id, &hc);
/* Deletes kernel class 1:(queue_id + 1), which backs 'queue', from 'netdev'
 * and unlinks the cached htb_class from the qdisc's queue map. */
2646 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2648 struct htb_class *hc = htb_class_cast__(queue);
2649 struct htb *htb = htb_get__(netdev);
2652 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2654 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches statistics for 'queue' into '*stats' by querying kernel class
 * 1:(queue_id + 1) with parent 1:fffe.  Class options are not requested
 * (NULL is passed for them).  Returns htb_query_class__()'s result. */
2661 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2662 struct netdev_queue_stats *stats)
2664 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2665 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one class message 'nlmsg' from a statistics dump.  Parses the
 * class handle and statistics with tc_parse_class() and, for handles 1:1
 * through 1:HTB_N_QUEUES, invokes 'cb' with the zero-based queue number
 * (minor - 1), the statistics, and 'aux'.  'netdev' is unused. */
2669 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2670 const struct ofpbuf *nlmsg,
2671 netdev_dump_queue_stats_cb *cb, void *aux)
2673 struct netdev_queue_stats stats;
2674 unsigned int handle, major, minor;
2677 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2682 major = tc_get_major(handle);
2683 minor = tc_get_minor(handle);
2684 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2685 (*cb)(minor - 1, &stats, aux);
2690 static const struct tc_ops tc_ops_htb = {
2691 "htb", /* linux_name */
2692 "linux-htb", /* ovs_name */
2693 HTB_N_QUEUES, /* n_queues */
2702 htb_class_get_stats,
2703 htb_class_dump_stats
2706 /* "linux-hfsc" traffic control class. */
2708 #define HFSC_N_QUEUES 0xf000
2716 struct tc_queue tc_queue;
/* Returns the struct hfsc that embeds, as its 'tc' member, the struct tc
 * currently attached to 'netdev''s device. */
2721 static struct hfsc *
2722 hfsc_get__(const struct netdev *netdev)
2724 struct netdev_dev_linux *netdev_dev;
2725 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2726 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
/* Returns the struct hfsc_class whose 'tc_queue' member is 'queue'. */
2729 static struct hfsc_class *
2730 hfsc_class_cast__(const struct tc_queue *queue)
2732 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates a struct hfsc, initializes its 'tc' member with tc_ops_hfsc,
 * stores 'max_rate' in it, and attaches it to 'netdev''s device as the
 * device's current tc. */
2735 static struct hfsc *
2736 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2738 struct netdev_dev_linux * netdev_dev;
2741 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2742 hfsc = xmalloc(sizeof *hfsc);
2743 tc_init(&hfsc->tc, &tc_ops_hfsc);
2744 hfsc->max_rate = max_rate;
2745 netdev_dev->tc = &hfsc->tc;
/* Records the rates in 'hc' in the cached HFSC class for 'queue_id' on
 * 'netdev'.  The cache is the qdisc's 'tc.queues' hash map, in which entries
 * are hashed with hash_int(queue_id, 0). */
2751 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2752 const struct hfsc_class *hc)
2756 struct hfsc_class *hcp;
2757 struct tc_queue *queue;
2759 hfsc = hfsc_get__(netdev);
2760 hash = hash_int(queue_id, 0);
2762 queue = tc_find_queue__(netdev, queue_id, hash);
/* Existing entry: reuse the hfsc_class that embeds it. */
2764 hcp = hfsc_class_cast__(queue);
/* New entry: allocate it and register it in the queue map.
 * NOTE(review): presumably reached only when the lookup above found
 * nothing -- confirm against the enclosing condition. */
2766 hcp = xmalloc(sizeof *hcp);
2767 queue = &hcp->tc_queue;
2768 queue->queue_id = queue_id;
2769 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
/* Copy the caller's rates into the cached class. */
2772 hcp->min_rate = hc->min_rate;
2773 hcp->max_rate = hc->max_rate;
/* Extracts HFSC class settings from the nested attribute 'nl_options' into
 * '*class'.
 *
 * The RSC, FSC and USC service curves are read from the attribute.  Each
 * check below logs a rate-limited warning when it trips:
 *
 *   - every curve must be linear (m1 == 0 and d == 0);
 *   - RSC and FSC must have the same m2;
 *   - RSC's m2 must not exceed USC's m2.
 *
 * The class's min_rate is taken from FSC's m2 and its max_rate from USC's
 * m2. */
2777 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2779 const struct tc_service_curve *rsc, *fsc, *usc;
/* Each curve attribute must be at least as large as a
 * struct tc_service_curve. */
2780 static const struct nl_policy tca_hfsc_policy[] = {
2782 .type = NL_A_UNSPEC,
2784 .min_len = sizeof(struct tc_service_curve),
2787 .type = NL_A_UNSPEC,
2789 .min_len = sizeof(struct tc_service_curve),
2792 .type = NL_A_UNSPEC,
2794 .min_len = sizeof(struct tc_service_curve),
2797 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
2799 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
2800 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
2801 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
2805 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
2806 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
2807 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
/* A nonzero m1 or d makes a curve two-piece rather than linear. */
2809 if (rsc->m1 != 0 || rsc->d != 0 ||
2810 fsc->m1 != 0 || fsc->d != 0 ||
2811 usc->m1 != 0 || usc->d != 0) {
2812 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2813 "Non-linear service curves are not supported.");
2817 if (rsc->m2 != fsc->m2) {
2818 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2819 "Real-time service curves are not supported ");
2823 if (rsc->m2 > usc->m2) {
2824 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2825 "Min-rate service curve is greater than "
2826 "the max-rate service curve.");
2830 class->min_rate = fsc->m2;
2831 class->max_rate = usc->m2;
/* Parses the class message 'tcmsg'.  tc_parse_class() supplies the class
 * handle, the options attribute and the statistics (into '*stats').  For
 * handles 1:1 through 1:HFSC_N_QUEUES, '*queue_id' receives the zero-based
 * queue number (minor - 1).  The options attribute is decoded into
 * '*options' with hfsc_parse_tca_options__(). */
2836 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2837 struct hfsc_class *options,
2838 struct netdev_queue_stats *stats)
2841 unsigned int handle;
2842 struct nlattr *nl_options;
2844 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2850 unsigned int major, minor;
2852 major = tc_get_major(handle);
2853 minor = tc_get_minor(handle);
2854 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
2855 *queue_id = minor - 1;
2862 error = hfsc_parse_tca_options__(nl_options, options);
/* Queries the kernel, via tc_query_class(), for the class 'handle' with parent
 * 'parent' on 'netdev', parses the reply with hfsc_parse_tcmsg__() into
 * '*options' and '*stats', and then frees the reply buffer. */
2869 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
2870 unsigned int parent, struct hfsc_class *options,
2871 struct netdev_queue_stats *stats)
2874 struct ofpbuf *reply;
2876 error = tc_query_class(netdev, handle, parent, &reply);
/* No queue ID is requested from the parser (NULL second argument). */
2881 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
2882 ofpbuf_delete(reply);
/* Derives the qdisc-level class settings for 'netdev' from 'details' and
 * stores them in '*class', with min_rate and max_rate set to the same value.
 *
 * "max-rate", when present, is parsed as a decimal number and divided by 8;
 * otherwise the rate starts at 0.  The rate can also come from the device's
 * current features (netdev_features_to_bps() divided by 8).
 * NOTE(review): presumably that fallback applies only when no usable
 * "max-rate" was supplied -- confirm against the enclosing condition. */
2887 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
2888 struct hfsc_class *class)
2891 const char *max_rate_s;
2893 max_rate_s = shash_find_data(details, "max-rate");
2894 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2899 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2900 max_rate = netdev_features_to_bps(current) / 8;
2903 class->min_rate = max_rate;
2904 class->max_rate = max_rate;
/* Parses the per-queue 'details' for 'netdev' into '*class'.
 *
 * "min-rate" is parsed as a decimal number, divided by 8, raised to at least
 * 1500, and then capped at the qdisc's max_rate.  "max-rate", when present,
 * is parsed the same way, raised to at least the resulting min_rate, and
 * capped at the qdisc's max_rate. */
2908 hfsc_parse_class_details__(struct netdev *netdev,
2909 const struct shash *details,
2910 struct hfsc_class * class)
2912 const struct hfsc *hfsc;
2913 uint32_t min_rate, max_rate;
2914 const char *min_rate_s, *max_rate_s;
2916 hfsc = hfsc_get__(netdev);
2917 min_rate_s = shash_find_data(details, "min-rate");
2918 max_rate_s = shash_find_data(details, "max-rate");
2924 min_rate = strtoull(min_rate_s, NULL, 10) / 8;
2925 min_rate = MAX(min_rate, 1500);
2926 min_rate = MIN(min_rate, hfsc->max_rate);
2928 max_rate = (max_rate_s
2929 ? strtoull(max_rate_s, NULL, 10) / 8
2931 max_rate = MAX(max_rate, min_rate);
2932 max_rate = MIN(max_rate, hfsc->max_rate);
2934 class->min_rate = min_rate;
2935 class->max_rate = max_rate;
2940 /* Creates an HFSC qdisc with handle 1:0 at the root of 'netdev', after
 * first removing the existing root qdisc with tc_del_qdisc().
 *
2942 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1".
 *
 * Returns the result of the netlink transaction. */
2944 hfsc_setup_qdisc__(struct netdev * netdev)
2946 struct tcmsg *tcmsg;
2947 struct ofpbuf request;
2948 struct tc_hfsc_qopt opt;
2950 tc_del_qdisc(netdev);
/* NLM_F_EXCL | NLM_F_CREATE: create the qdisc, failing if one already
 * exists. */
2952 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2953 NLM_F_EXCL | NLM_F_CREATE, &request);
2959 tcmsg->tcm_handle = tc_make_handle(1, 0);
2960 tcmsg->tcm_parent = TC_H_ROOT;
2962 memset(&opt, 0, sizeof opt);
2965 nl_msg_put_string(&request, TCA_KIND, "hfsc");
2966 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
2968 return tc_transact(&request, NULL);
2971 /* Creates or replaces the HFSC class 'handle' under 'parent' on 'netdev',
 * using the rates in 'class'.
 *
2973 * Equivalent to "tc class replace dev <dev> parent <parent> classid <handle>
2974 * hfsc sc rate <min_rate> ul rate <max_rate>".  (The request carries
 * NLM_F_CREATE without NLM_F_EXCL, so an existing class is replaced rather
 * than rejected.)
 *
 * A failed transaction is reported with a rate-limited warning. */
2976 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
2977 unsigned int parent, struct hfsc_class *class)
2981 struct tcmsg *tcmsg;
2982 struct ofpbuf request;
2983 struct tc_service_curve min, max;
2985 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2991 tcmsg->tcm_handle = handle;
2992 tcmsg->tcm_parent = parent;
2996 min.m2 = class->min_rate;
3000 max.m2 = class->max_rate;
3002 nl_msg_put_string(&request, TCA_KIND, "hfsc");
/* The real-time (RSC) and link-share (FSC) curves both use the minimum
 * rate; the upper-limit (USC) curve uses the maximum rate. */
3003 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3004 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3005 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3006 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3007 nl_msg_end_nested(&request, opt_offset);
3009 error = tc_transact(&request, NULL);
3011 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3012 "min-rate %ubps, max-rate %ubps (%s)",
3013 netdev_get_name(netdev),
3014 tc_get_major(handle), tc_get_minor(handle),
3015 tc_get_major(parent), tc_get_minor(parent),
3016 class->min_rate, class->max_rate, strerror(error));
/* Installs HFSC as the qdisc on 'netdev'.  Creates the qdisc with
 * hfsc_setup_qdisc__(), builds class settings from 'details' with
 * hfsc_parse_qdisc_details__(), applies them to class 1:fffe under parent
 * 1:0, and records 'class.max_rate' through hfsc_install__(). */
3023 hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3026 struct hfsc_class class;
3028 error = hfsc_setup_qdisc__(netdev);
3034 hfsc_parse_qdisc_details__(netdev, details, &class);
3035 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3036 tc_make_handle(1, 0), &class);
3042 hfsc_install__(netdev, class.max_rate);
/* Builds the cached state for an HFSC qdisc on 'netdev' from what the kernel
 * reports: class 1:fffe supplies the max rate handed to hfsc_install__(),
 * and every class in the queue dump that hfsc_parse_tcmsg__() accepts is
 * cached with hfsc_update_queue__().  'nlmsg' is unused. */
3047 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3051 struct nl_dump dump;
3052 struct hfsc_class hc;
3055 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3056 hfsc = hfsc_install__(netdev, hc.max_rate);
3058 if (!start_queue_dump(netdev, &dump)) {
3062 while (nl_dump_next(&dump, &msg)) {
3063 unsigned int queue_id;
/* A zero result means the message parsed as an HFSC class; cache it. */
3065 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3066 hfsc_update_queue__(netdev, queue_id, &hc);
3070 nl_dump_done(&dump);
/* Tears down the queue cache of the struct hfsc that embeds 'tc': every
 * hfsc_class is unlinked from 'hfsc->tc.queues'.  The _SAFE iteration macro
 * is used because entries are removed while the map is being walked. */
3075 hfsc_tc_destroy(struct tc *tc)
3078 struct hfsc_class *hc, *next;
3080 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3082 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3083 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Reports the HFSC qdisc configuration of 'netdev' by adding a "max-rate"
 * entry to 'details'.  The value is a newly allocated decimal string holding
 * 8 * hfsc->max_rate (presumably a bytes/s to bits/s conversion -- confirm
 * units). */
3092 hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3094 const struct hfsc *hfsc;
3095 hfsc = hfsc_get__(netdev);
3096 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
/* Reconfigures the HFSC qdisc on 'netdev' from 'details': parses them with
 * hfsc_parse_qdisc_details__(), re-applies class 1:fffe (parent 1:0) with
 * the result, and stores the new max_rate in the cached struct hfsc. */
3101 hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3104 struct hfsc_class class;
3106 hfsc_parse_qdisc_details__(netdev, details, &class);
3107 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3108 tc_make_handle(1, 0), &class);
3111 hfsc_get__(netdev)->max_rate = class.max_rate;
/* Adds the cached rates of the HFSC class behind 'queue' to 'details' as
 * newly allocated decimal strings, each multiplied by 8 before formatting.
 * "max-rate" is added only when it differs from "min-rate".  'netdev' is
 * unused. */
3118 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3119 const struct tc_queue *queue, struct shash *details)
3121 const struct hfsc_class *hc;
3123 hc = hfsc_class_cast__(queue);
3124 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3125 if (hc->min_rate != hc->max_rate) {
3126 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
/* Configures HFSC queue 'queue_id' on 'netdev' from 'details': parses them
 * with hfsc_parse_class_details__(), applies the result to kernel class
 * 1:(queue_id + 1) under parent 1:fffe, and updates the cached class. */
3132 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3133 const struct shash *details)
3136 struct hfsc_class class;
3138 error = hfsc_parse_class_details__(netdev, details, &class);
3143 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3144 tc_make_handle(1, 0xfffe), &class);
3149 hfsc_update_queue__(netdev, queue_id, &class);
/* Deletes kernel class 1:(queue_id + 1), which backs 'queue', from 'netdev'
 * and unlinks the cached hfsc_class from the qdisc's queue map. */
3154 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3158 struct hfsc_class *hc;
3160 hc = hfsc_class_cast__(queue);
3161 hfsc = hfsc_get__(netdev);
3163 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3165 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches statistics for 'queue' into '*stats' by querying kernel class
 * 1:(queue_id + 1) with parent 1:fffe.  Class options are not requested
 * (NULL is passed for them).  Returns hfsc_query_class__()'s result. */
3172 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3173 struct netdev_queue_stats *stats)
3175 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3176 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one class message 'nlmsg' from a statistics dump.  Parses the
 * class handle and statistics with tc_parse_class() and, for handles 1:1
 * through 1:HFSC_N_QUEUES, invokes 'cb' with the zero-based queue number
 * (minor - 1), the statistics, and 'aux'.  'netdev' is unused. */
3180 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3181 const struct ofpbuf *nlmsg,
3182 netdev_dump_queue_stats_cb *cb, void *aux)
3184 struct netdev_queue_stats stats;
3185 unsigned int handle, major, minor;
3188 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3193 major = tc_get_major(handle);
3194 minor = tc_get_minor(handle);
3195 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3196 (*cb)(minor - 1, &stats, aux);
3201 static const struct tc_ops tc_ops_hfsc = {
3202 "hfsc", /* linux_name */
3203 "linux-hfsc", /* ovs_name */
3204 HFSC_N_QUEUES, /* n_queues */
3205 hfsc_tc_install, /* tc_install */
3206 hfsc_tc_load, /* tc_load */
3207 hfsc_tc_destroy, /* tc_destroy */
3208 hfsc_qdisc_get, /* qdisc_get */
3209 hfsc_qdisc_set, /* qdisc_set */
3210 hfsc_class_get, /* class_get */
3211 hfsc_class_set, /* class_set */
3212 hfsc_class_delete, /* class_delete */
3213 hfsc_class_get_stats, /* class_get_stats */
3214 hfsc_class_dump_stats /* class_dump_stats */
3217 /* "linux-default" traffic control class.
3219 * This class represents the default, unnamed Linux qdisc. It corresponds to
3220 * the "" (empty string) QoS type in the OVS database. */
/* Attaches to 'netdev''s device a struct tc initialized with tc_ops_default.
 *
 * 'tc' has static storage duration, so the pointer persists across calls and
 * the same object can be handed to every device.
 * NOTE(review): presumably the allocation below happens only on first use --
 * confirm against the enclosing condition. */
3223 default_install__(struct netdev *netdev)
3225 struct netdev_dev_linux *netdev_dev =
3226 netdev_dev_linux_cast(netdev_get_dev(netdev));
3227 static struct tc *tc;
3230 tc = xmalloc(sizeof *tc);
3231 tc_init(tc, &tc_ops_default);
3233 netdev_dev->tc = tc;
/* tc_install hook for the default qdisc: delegates to default_install__().
 * 'details' is unused. */
3237 default_tc_install(struct netdev *netdev,
3238 const struct shash *details OVS_UNUSED)
3240 default_install__(netdev);
/* tc_load hook for the default qdisc: delegates to default_install__().
 * 'nlmsg' is unused. */
3245 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3247 default_install__(netdev);
3251 static const struct tc_ops tc_ops_default = {
3252 NULL, /* linux_name */
3257 NULL, /* tc_destroy */
3258 NULL, /* qdisc_get */
3259 NULL, /* qdisc_set */
3260 NULL, /* class_get */
3261 NULL, /* class_set */
3262 NULL, /* class_delete */
3263 NULL, /* class_get_stats */
3264 NULL /* class_dump_stats */
3267 /* "linux-other" traffic control class.
/* tc_load hook for qdiscs handled as "linux-other": attaches to 'netdev''s
 * device a struct tc initialized with tc_ops_other.  'nlmsg' is unused.
 *
 * 'tc' has static storage duration, so the pointer persists across calls and
 * the same object can be handed to every device.
 * NOTE(review): presumably the allocation below happens only on first use --
 * confirm against the enclosing condition. */
3272 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3274 struct netdev_dev_linux *netdev_dev =
3275 netdev_dev_linux_cast(netdev_get_dev(netdev));
3276 static struct tc *tc;
3279 tc = xmalloc(sizeof *tc);
3280 tc_init(tc, &tc_ops_other);
3282 netdev_dev->tc = tc;
3286 static const struct tc_ops tc_ops_other = {
3287 NULL, /* linux_name */
3288 "linux-other", /* ovs_name */
3290 NULL, /* tc_install */
3292 NULL, /* tc_destroy */
3293 NULL, /* qdisc_get */
3294 NULL, /* qdisc_set */
3295 NULL, /* class_get */
3296 NULL, /* class_set */
3297 NULL, /* class_delete */
3298 NULL, /* class_get_stats */
3299 NULL /* class_dump_stats */
3302 /* Traffic control. */
3304 /* Number of kernel "tc" ticks per second. */
3305 static double ticks_per_s;
3307 /* Number of kernel "jiffies" per second. This is used for the purpose of
3308 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3309 * one jiffy's worth of data.
3311 * There are two possibilities here:
3313 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3314 * approximate range of 100 to 1024. That means that we really need to
3315 * make sure that the qdisc can buffer that much data.
3317 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3318 * has finely granular timers and there's no need to fudge additional room
3319 * for buffers. (There's no extra effort needed to implement that: the
3320 * large 'buffer_hz' is used as a divisor, so practically any number will
3321 * come out as 0 in the division. Small integer results in the case of
3322 * really high dividends won't have any real effect anyhow.)
3324 static unsigned int buffer_hz;
3326 /* Returns tc handle 'major':'minor', built with TC_H_MAKE() from 'major'
 * shifted left by 16 bits and 'minor'. */
3328 tc_make_handle(unsigned int major, unsigned int minor)
3330 return TC_H_MAKE(major << 16, minor);
3333 /* Returns the major number from 'handle': the TC_H_MAJ() part shifted right
 * by 16 bits. */
3335 tc_get_major(unsigned int handle)
3337 return TC_H_MAJ(handle) >> 16;
3340 /* Returns the minor number from 'handle', as extracted by TC_H_MIN(). */
3342 tc_get_minor(unsigned int handle)
3344 return TC_H_MIN(handle);
/* Initializes 'request' as a traffic-control netlink message of the given
 * 'type' for 'netdev': a netlink header with flags NLM_F_REQUEST | 'flags',
 * followed by a zeroed struct tcmsg whose family is AF_UNSPEC and whose
 * interface index is the one get_ifindex() reports for 'netdev'.
 *
 * The handle and parent fields of the tcmsg are left for the caller to set,
 * as the notes at the end of the function say. */
3347 static struct tcmsg *
3348 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3349 struct ofpbuf *request)
3351 struct tcmsg *tcmsg;
3355 error = get_ifindex(netdev, &ifindex);
3360 ofpbuf_init(request, 512);
3361 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3362 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3363 tcmsg->tcm_family = AF_UNSPEC;
3364 tcmsg->tcm_ifindex = ifindex;
3365 /* Caller should fill in tcmsg->tcm_handle. */
3366 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on rtnl_sock with nl_sock_transact(), passing 'replyp'
 * through for the reply, and then uninitializes 'request' so that callers do
 * not have to clean it up themselves. */
3372 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3374 int error = nl_sock_transact(rtnl_sock, request, replyp);
3375 ofpbuf_uninit(request);
3382 /* The values in psched are not individually very meaningful, but they are
3383 * important. The tables below show some values seen in the wild.
3387 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3388 * (Before that, there are hints that it was 1000000000.)
3390 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3394 * -----------------------------------
3395 * [1] 000c8000 000f4240 000f4240 00000064
3396 * [2] 000003e8 00000400 000f4240 3b9aca00
3397 * [3] 000003e8 00000400 000f4240 3b9aca00
3398 * [4] 000003e8 00000400 000f4240 00000064
3399 * [5] 000003e8 00000040 000f4240 3b9aca00
3400 * [6] 000003e8 00000040 000f4240 000000f9
3402 * a b c d ticks_per_s buffer_hz
3403 * ------- --------- ---------- ------------- ----------- -------------
3404 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3405 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3406 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3407 * [4] 1,000 1,024 1,000,000 100 976,562 100
3408 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3409 * [6] 1,000 64 1,000,000 249 15,625,000 249
3411 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3412 * [2] 2.6.26-1-686-bigmem from Debian lenny
3413 * [3] 2.6.26-2-sparc64 from Debian lenny
3414 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3415 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3416 * [6] 2.6.34 from kernel.org on KVM
3418 static const char fn[] = "/proc/net/psched";
3419 unsigned int a, b, c, d;
3425 stream = fopen(fn, "r");
3427 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3431 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3432 VLOG_WARN("%s: read failed", fn);
3436 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3440 VLOG_WARN("%s: invalid scheduler parameters", fn);
3444 ticks_per_s = (double) a * c / b;
3448 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3451 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3454 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3455 * rate of 'rate' bytes per second.
 *
 * NOTE(review): 'rate * ticks' is evaluated in unsigned int before the
 * division by the double 'ticks_per_s', so a large product wraps around
 * before it is divided. */
3457 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3462 return (rate * ticks) / ticks_per_s;
3465 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3466 * rate of 'rate' bytes per second.  Returns 0 if 'rate' is 0.
 *
 * 'ticks_per_s' is converted to an integer (dropping any fraction) before the
 * multiplication, which is carried out in unsigned long long. */
3468 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3473 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3476 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3477 * a transmission rate of 'rate' bytes per second, computed as the integer
 * quotient 'rate' / 'buffer_hz'. */
3479 tc_buffer_per_jiffy(unsigned int rate)
3484 return rate / buffer_hz;
3487 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3488 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3489 * extracts 'msg''s TCA_OPTIONS attribute into '*options' if it is present or
3490 * stores NULL into it if it is absent.
 *
3492 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
 * 'msg'.
 *
3495 * Returns 0 if successful, otherwise a positive errno value. */
3497 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3498 struct nlattr **options)
/* TCA_KIND is mandatory; TCA_OPTIONS may be missing. */
3500 static const struct nl_policy tca_policy[] = {
3501 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3502 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3504 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
/* The attributes start right after the netlink header and the
 * struct tcmsg. */
3506 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3507 tca_policy, ta, ARRAY_SIZE(ta))) {
3508 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3513 *kind = nl_attr_get_string(ta[TCA_KIND]);
3517 *options = ta[TCA_OPTIONS];
3532 /* Given Netlink 'msg' that describes a class, extracts the class's handle
3533 * (its full class ID, as found in the message's struct tcmsg) into
3534 * '*handlep', its TCA_OPTIONS attribute into '*options', and its queue
3535 * statistics into '*stats'. Any of the output arguments may be null.
 *
 * Only tx_bytes, tx_packets and tx_errors are taken from the message; the
 * last of these comes from the queue's drop counter.
 *
3537 * Returns 0 if successful, otherwise a positive errno value. */
3539 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3540 struct nlattr **options, struct netdev_queue_stats *stats)
/* Both TCA_OPTIONS and TCA_STATS2 are mandatory in a class message. */
3542 static const struct nl_policy tca_policy[] = {
3543 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3544 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3546 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3548 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3549 tca_policy, ta, ARRAY_SIZE(ta))) {
3550 VLOG_WARN_RL(&rl, "failed to parse class message");
/* The struct tcmsg sits directly after the netlink header. */
3555 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3556 *handlep = tc->tcm_handle;
3560 *options = ta[TCA_OPTIONS];
3564 const struct gnet_stats_queue *gsq;
3565 struct gnet_stats_basic gsb;
3567 static const struct nl_policy stats_policy[] = {
3568 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3569 .min_len = sizeof gsb },
3570 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3571 .min_len = sizeof *gsq },
3573 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3575 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3576 sa, ARRAY_SIZE(sa))) {
3577 VLOG_WARN_RL(&rl, "failed to parse class stats");
3581 /* Alignment issues screw up the length of struct gnet_stats_basic on
3582 * some arch/bitsize combinations. Newer versions of Linux have a
3583 * struct gnet_stats_basic_packed, but we can't depend on that. The
3584 * easiest thing to do is just to make a copy. */
3585 memset(&gsb, 0, sizeof gsb);
3586 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3587 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3588 stats->tx_bytes = gsb.bytes;
3589 stats->tx_packets = gsb.packets;
/* Dropped packets are reported as transmit errors. */
3591 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3592 stats->tx_errors = gsq->drops;
/* NOTE(review): presumably the failure path -- hand back all-zero
 * statistics rather than leaving '*stats' unset; confirm. */
3602 memset(stats, 0, sizeof *stats);
3607 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
/* Sends an RTM_GETTCLASS request (with NLM_F_ECHO) for the class 'handle'
 * with parent 'parent' on 'netdev', passing 'replyp' to tc_transact() to
 * receive the reply.  A failed query is reported with a rate-limited warning
 * that names the device, the class and its parent. */
3610 tc_query_class(const struct netdev *netdev,
3611 unsigned int handle, unsigned int parent,
3612 struct ofpbuf **replyp)
3614 struct ofpbuf request;
3615 struct tcmsg *tcmsg;
3618 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3622 tcmsg->tcm_handle = handle;
3623 tcmsg->tcm_parent = parent;
3625 error = tc_transact(&request, replyp);
3627 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3628 netdev_get_name(netdev),
3629 tc_get_major(handle), tc_get_minor(handle),
3630 tc_get_major(parent), tc_get_minor(parent),
3636 /* Deletes the class 'handle' from 'netdev' by sending an RTM_DELTCLASS
 * request with a parent of 0.  A failed deletion is reported with a
 * rate-limited warning.
 *
 * Equivalent to "tc class del dev <name> handle <handle>". */
3638 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3640 struct ofpbuf request;
3641 struct tcmsg *tcmsg;
3644 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3648 tcmsg->tcm_handle = handle;
3649 tcmsg->tcm_parent = 0;
3651 error = tc_transact(&request, NULL);
3653 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3654 netdev_get_name(netdev),
3655 tc_get_major(handle), tc_get_minor(handle),
3661 /* Removes the root qdisc (handle 1:0) from 'netdev' by sending an
 * RTM_DELQDISC request.  When the removal succeeds and the device has cached
 * tc state, that state is destroyed through its ops' tc_destroy hook (if the
 * hook is set) and the device's 'tc' pointer is cleared.
 *
 * Equivalent to "tc qdisc del dev <name> root". */
3663 tc_del_qdisc(struct netdev *netdev)
3665 struct netdev_dev_linux *netdev_dev =
3666 netdev_dev_linux_cast(netdev_get_dev(netdev));
3667 struct ofpbuf request;
3668 struct tcmsg *tcmsg;
3671 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
3675 tcmsg->tcm_handle = tc_make_handle(1, 0);
3676 tcmsg->tcm_parent = TC_H_ROOT;
3678 error = tc_transact(&request, NULL);
3679 if (error == EINVAL) {
3680 /* EINVAL probably means that the default qdisc was in use, in which
3681 * case we've accomplished our purpose. */
3684 if (!error && netdev_dev->tc) {
3685 if (netdev_dev->tc->ops->tc_destroy) {
3686 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3688 netdev_dev->tc = NULL;
3693 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3694 * kernel to determine what they are. Returns 0 if successful, otherwise a
3695 * positive errno value.
 *
 * The qdisc kind reported by the kernel is mapped to a tc_ops with
 * tc_lookup_linux_name().  tc_ops_other is used for unrecognized kinds and
 * for query errors other than ENOENT; tc_ops_default is used for ENOENT.
 * The chosen ops' tc_load hook then instantiates the state. */
3697 tc_query_qdisc(const struct netdev *netdev)
3699 struct netdev_dev_linux *netdev_dev =
3700 netdev_dev_linux_cast(netdev_get_dev(netdev));
3701 struct ofpbuf request, *qdisc;
3702 const struct tc_ops *ops;
3703 struct tcmsg *tcmsg;
3707 if (netdev_dev->tc) {
3711 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3712 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3713 * 2.6.35 without that fix backported to it.
3715 * To avoid the OOPS, we must not make a request that would attempt to dump
3716 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3717 * few others. There are a few ways that I can see to do this, but most of
3718 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3719 * technique chosen here is to assume that any non-default qdisc that we
3720 * create will have a class with handle 1:0. The built-in qdiscs only have
3721 * a class with handle 0:0.
3723 * We could check for Linux 2.6.35+ and use a more straightforward method
3725 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
3729 tcmsg->tcm_handle = tc_make_handle(1, 0);
3730 tcmsg->tcm_parent = 0;
3732 /* Figure out what tc class to instantiate. */
3733 error = tc_transact(&request, &qdisc);
3737 error = tc_parse_qdisc(qdisc, &kind, NULL);
3739 ops = &tc_ops_other;
3741 ops = tc_lookup_linux_name(kind);
3743 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3744 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3746 ops = &tc_ops_other;
3749 } else if (error == ENOENT) {
3750 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3751 * other entity that doesn't have a handle 1:0. We will assume
3752 * that it's the system default qdisc. */
3753 ops = &tc_ops_default;
3756 /* Who knows? Maybe the device got deleted. */
3757 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3758 netdev_get_name(netdev), strerror(error));
3759 ops = &tc_ops_other;
3762 /* Instantiate it. */
/* The cast drops 'const' because tc_load takes a non-const netdev.  A
 * tc_load hook must leave netdev_dev->tc set exactly when it reports
 * success; the assertion enforces that contract. */
3763 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
3764 assert((load_error == 0) == (netdev_dev->tc != NULL));
3765 ofpbuf_delete(qdisc);
/* An error from the query takes precedence over one from loading. */
3767 return error ? error : load_error;
3770 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3771 approximate the time to transmit packets of various lengths. For an MTU of
3772 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3773 represents two possible packet lengths; for an MTU of 513 through 1024, four
3774 possible lengths; and so on.
3776 Returns, for the specified 'mtu', the number of bits that packet lengths
3777 need to be shifted right to fit within such a 256-entry table. */
3779 tc_calc_cell_log(unsigned int mtu)
/* NOTE(review): presumably a fallback used when the caller passes no MTU --
 * confirm against the enclosing condition. */
3784 mtu = ETH_PAYLOAD_MAX;
/* Allow for the Ethernet and VLAN headers on top of the MTU. */
3786 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
/* Count iterations until the size drops below 256. */
3788 for (cell_log = 0; mtu >= 256; cell_log++) {
3795 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
/* Fills in '*rate' for 'Bps' and 'mtu': the structure is zeroed, 'cell_log'
 * is derived from 'mtu' with tc_calc_cell_log(), and 'mpu' is set to
 * ETH_TOTAL_MIN.  The 'overhead' and 'cell_align' members are deliberately
 * left to the memset, as the commented-out assignments below explain. */
3798 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
3800 memset(rate, 0, sizeof *rate);
3801 rate->cell_log = tc_calc_cell_log(mtu);
3802 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3803 /* rate->cell_align = 0; */ /* distro headers. */
3804 rate->mpu = ETH_TOTAL_MIN;
3808 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
3809 * attribute of the specified "type".
 *
 * The attribute is TC_RTAB_SIZE bytes long.  Entry 'i' holds the number of
 * ticks needed, at 'rate->rate', to send a packet of (i + 1) << cell_log
 * bytes, with sizes below 'rate->mpu' rounded up to 'rate->mpu'.
 *
3811 * See tc_calc_cell_log() above for a description of "rtab"s. */
3813 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
3818 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
3819 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
3820 unsigned packet_size = (i + 1) << rate->cell_log;
3821 if (packet_size < rate->mpu) {
3822 packet_size = rate->mpu;
3824 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
3828 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
3829 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
3830 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
/* Returns, in ticks at a rate of 'Bps' bytes per second, the larger of
 * 'burst_bytes' and a minimum burst of tc_buffer_per_jiffy(Bps) + 'mtu'
 * bytes. */
3833 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
3835 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
3836 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
3840 /* Utility functions. */
/* Fills '*stats' with the link statistics of the interface whose index is
 * 'ifindex', obtained by sending an RTM_GETLINK request on rtnl_sock and
 * reading the IFLA_STATS attribute of the reply.  The reply is freed on every
 * path visible here, and a reply without IFLA_STATS is reported with a
 * rate-limited warning. */
3843 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
3845 /* Policy for RTNLGRP_LINK messages.
3847 * There are *many* more fields in these messages, but currently we only
3848 * care about these fields. */
3849 static const struct nl_policy rtnlgrp_link_policy[] = {
3850 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
3851 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
3852 .min_len = sizeof(struct rtnl_link_stats) },
3855 struct ofpbuf request;
3856 struct ofpbuf *reply;
3857 struct ifinfomsg *ifi;
3858 const struct rtnl_link_stats *rtnl_stats;
3859 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
/* Build and send the RTM_GETLINK request for 'ifindex'. */
3862 ofpbuf_init(&request, 0);
3863 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
3864 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
3865 ifi->ifi_family = PF_UNSPEC;
3866 ifi->ifi_index = ifindex;
3867 error = nl_sock_transact(rtnl_sock, &request, &reply);
3868 ofpbuf_uninit(&request);
/* The attributes start right after the netlink header and the
 * struct ifinfomsg. */
3873 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
3874 rtnlgrp_link_policy,
3875 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
3876 ofpbuf_delete(reply);
/* IFLA_STATS is optional in the policy, so check for it explicitly. */
3880 if (!attrs[IFLA_STATS]) {
3881 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
3882 ofpbuf_delete(reply);
/* Copy each kernel counter into the same-named netdev_stats member. */
3886 rtnl_stats = nl_attr_get(attrs[IFLA_STATS]);
3887 stats->rx_packets = rtnl_stats->rx_packets;
3888 stats->tx_packets = rtnl_stats->tx_packets;
3889 stats->rx_bytes = rtnl_stats->rx_bytes;
3890 stats->tx_bytes = rtnl_stats->tx_bytes;
3891 stats->rx_errors = rtnl_stats->rx_errors;
3892 stats->tx_errors = rtnl_stats->tx_errors;
3893 stats->rx_dropped = rtnl_stats->rx_dropped;
3894 stats->tx_dropped = rtnl_stats->tx_dropped;
3895 stats->multicast = rtnl_stats->multicast;
3896 stats->collisions = rtnl_stats->collisions;
3897 stats->rx_length_errors = rtnl_stats->rx_length_errors;
3898 stats->rx_over_errors = rtnl_stats->rx_over_errors;
3899 stats->rx_crc_errors = rtnl_stats->rx_crc_errors;
3900 stats->rx_frame_errors = rtnl_stats->rx_frame_errors;
3901 stats->rx_fifo_errors = rtnl_stats->rx_fifo_errors;
3902 stats->rx_missed_errors = rtnl_stats->rx_missed_errors;
3903 stats->tx_aborted_errors = rtnl_stats->tx_aborted_errors;
3904 stats->tx_carrier_errors = rtnl_stats->tx_carrier_errors;
3905 stats->tx_fifo_errors = rtnl_stats->tx_fifo_errors;
3906 stats->tx_heartbeat_errors = rtnl_stats->tx_heartbeat_errors;
3907 stats->tx_window_errors = rtnl_stats->tx_window_errors;
3909 ofpbuf_delete(reply);
/* Fills '*stats' for the device called 'netdev_name' by reading the text file
 * /proc/net/dev one line at a time.
 *
 * NOTE(review): several lines of this function are not part of this excerpt
 * (local declarations, the start of the scan call and most of its arguments,
 * the return statements) -- confirm details against the full file. */
3915 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
3917 static const char fn[] = "/proc/net/dev";
3922 stream = fopen(fn, "r");
/* A failed open is reported with a rate-limited warning that includes
 * strerror(errno). */
3924 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
3929 while (fgets(line, sizeof line, stream)) {
/* X64 is the scanf conversion for one uint64_t counter.  The format below has
 * two groups of seven such counters; the "%*u" that closes each group reads
 * one unsigned field and discards it.  The comparison against 15 implies one
 * further conversion on a format line not shown here, presumably the device
 * name stored in 'devname' -- confirm. */
3932 #define X64 "%"SCNu64
3935 X64 X64 X64 X64 X64 X64 X64 "%*u"
3936 X64 X64 X64 X64 X64 X64 X64 "%*u",
3942 &stats->rx_fifo_errors,
3943 &stats->rx_frame_errors,
3949 &stats->tx_fifo_errors,
3951 &stats->tx_carrier_errors) != 15) {
3952 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
/* The conversion targets are fields of '*stats' itself, so each scanned line
 * overwrites them before the device name is compared below.
 * NOTE(review): check whether callers rely on '*stats' being left untouched
 * when no line matches 'netdev_name'. */
3953 } else if (!strcmp(devname, netdev_name)) {
/* None of these counters is among the visible scan targets; for the matching
 * device they are set to UINT64_MAX (presumably the "not available" marker
 * -- confirm against the definition of struct netdev_stats). */
3954 stats->rx_length_errors = UINT64_MAX;
3955 stats->rx_over_errors = UINT64_MAX;
3956 stats->rx_crc_errors = UINT64_MAX;
3957 stats->rx_missed_errors = UINT64_MAX;
3958 stats->tx_aborted_errors = UINT64_MAX;
3959 stats->tx_heartbeat_errors = UINT64_MAX;
3960 stats->tx_window_errors = UINT64_MAX;
/* Rate-limited warning naming the device for which no statistics were
 * obtained from the file. */
3966 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Reads the interface flags of 'netdev' with the SIOCGIFFLAGS request, issued
 * through netdev_linux_do_ioctl() under the device's name, and stores
 * ifr.ifr_flags in '*flags'.  The result of the ioctl helper is kept in
 * 'error'.
 *
 * NOTE(review): the declarations of 'ifr' and 'error', the rest of the helper
 * call and the return statement are not part of this excerpt. */
3972 get_flags(const struct netdev *netdev, int *flags)
3977 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
3979 *flags = ifr.ifr_flags;
/* Sets the interface flags of 'netdev' to 'flags': the value is placed in
 * ifr.ifr_flags and submitted with the SIOCSIFFLAGS request through
 * netdev_linux_do_ioctl(), whose result is returned to the caller.
 *
 * NOTE(review): the declaration of 'ifr' and the end of the helper call are
 * not part of this excerpt. */
3984 set_flags(struct netdev *netdev, int flags)
3988 ifr.ifr_flags = flags;
3989 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
/* Asks the kernel for the interface index of 'netdev_name' with the
 * SIOCGIFINDEX ioctl on 'af_inet_sock' and returns ifr.ifr_ifindex.  Each call
 * increments the netdev_get_ifindex coverage counter.  An ioctl failure is
 * logged with a rate-limited warning; the value returned in that case is on a
 * line not visible in this excerpt. */
3994 do_get_ifindex(const char *netdev_name)
/* NOTE(review): strncpy() leaves ifr_name without a terminating NUL when
 * 'netdev_name' is sizeof ifr.ifr_name bytes or longer, and no zeroing of
 * 'ifr' is visible before this call -- consider a copy that always
 * terminates. */
3998 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
3999 COVERAGE_INC(netdev_get_ifindex);
4000 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4001 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4002 netdev_name, strerror(errno));
4005 return ifr.ifr_ifindex;
/* Stores the interface index of 'netdev_' in '*ifindexp'.  The index is cached
 * in the device's netdev_dev_linux structure: do_get_ifindex() is called only
 * while the VALID_IFINDEX bit is clear in 'cache_valid', and the bit is set
 * once the result has been recorded. */
4009 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4011 struct netdev_dev_linux *netdev_dev =
4012 netdev_dev_linux_cast(netdev_get_dev(netdev_));
/* Cache miss: look the index up by device name. */
4014 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4015 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
/* NOTE(review): the lines between the lookup and the cache update are not
 * part of this excerpt; check there how a failed lookup is kept out of the
 * cache. */
4019 netdev_dev->cache_valid |= VALID_IFINDEX;
4020 netdev_dev->ifindex = ifindex;
/* Hand out the cached value. */
4022 *ifindexp = netdev_dev->ifindex;
/* Reads the hardware address of the device named 'netdev_name' with the
 * SIOCGIFHWADDR ioctl on 'af_inet_sock' and copies its first ETH_ADDR_LEN
 * bytes into 'ea'.  Each call increments the netdev_get_hwaddr coverage
 * counter.
 *
 * An ioctl failure is logged at error level.  An address family other than
 * AF_UNSPEC or ARPHRD_ETHER is logged as a warning.
 *
 * NOTE(review): the return statements and the declarations of 'ifr' and
 * 'hwaddr_family' are not part of this excerpt. */
4027 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4032 memset(&ifr, 0, sizeof ifr);
/* NOTE(review): strncpy() does not NUL-terminate ifr_name when 'netdev_name'
 * is sizeof ifr.ifr_name bytes or longer. */
4033 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4034 COVERAGE_INC(netdev_get_hwaddr);
4035 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4036 VLOG_ERR("ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4037 netdev_name, strerror(errno));
/* The family reported by the kernel is checked before the address bytes are
 * copied out. */
4040 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4041 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4042 VLOG_WARN("%s device has unknown hardware address family %d",
4043 netdev_name, hwaddr_family);
4045 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware address of the device named 'netdev_name' to the
 * ETH_ADDR_LEN bytes in 'mac', tagging the request with address family
 * 'hwaddr_family', using the SIOCSIFHWADDR ioctl on 'af_inet_sock'.  Each call
 * increments the netdev_set_hwaddr coverage counter; an ioctl failure is
 * logged at error level.
 *
 * NOTE(review): the return statements and the declaration of 'ifr' are not
 * part of this excerpt. */
4050 set_etheraddr(const char *netdev_name, int hwaddr_family,
4051 const uint8_t mac[ETH_ADDR_LEN])
/* Start from a zeroed request so that fields not set below are zero. */
4055 memset(&ifr, 0, sizeof ifr);
/* NOTE(review): strncpy() does not NUL-terminate ifr_name when 'netdev_name'
 * is sizeof ifr.ifr_name bytes or longer. */
4056 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4057 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4058 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4059 COVERAGE_INC(netdev_set_hwaddr);
4060 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4061 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4062 netdev_name, strerror(errno));
/* Issues an ethtool request for the device named 'name': 'ecmd' is attached to
 * a zeroed ifreq through ifr_data and submitted with the SIOCETHTOOL ioctl on
 * 'af_inet_sock'.  Each call increments the netdev_ethtool coverage counter.
 * 'cmd_name' is used only in the log message.
 *
 * A failure whose errno is not EOPNOTSUPP is logged with a rate-limited
 * warning; EOPNOTSUPP is deliberately not logged.
 *
 * NOTE(review): no use of 'cmd' appears in the visible lines, and the return
 * statements are not part of this excerpt -- confirm against the full
 * file. */
4069 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4070 int cmd, const char *cmd_name)
4074 memset(&ifr, 0, sizeof ifr);
/* NOTE(review): strncpy() does not NUL-terminate ifr_name when 'name' is
 * sizeof ifr.ifr_name bytes or longer. */
4075 strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
/* The kernel reads and writes the ethtool structure through this pointer. */
4076 ifr.ifr_data = (caddr_t) ecmd;
4079 COVERAGE_INC(netdev_ethtool);
4080 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4083 if (errno != EOPNOTSUPP) {
4084 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4085 "failed: %s", cmd_name, name, strerror(errno));
4087 /* The device doesn't support this operation. That's pretty
4088 * common, so there's no point in logging anything. */
/* Shared helper for ifreq-based ioctls: writes 'name' into ifr->ifr_name and
 * issues request 'cmd' on 'af_inet_sock' with the caller-supplied 'ifr', so
 * any other fields the caller filled in are passed through as they are.
 * 'cmd_name' is the request's printable name for the log message.
 *
 * A failing ioctl (return value -1) is logged at debug level, rate-limited.
 *
 * NOTE(review): the end of the log call and the return statements are not
 * part of this excerpt. */
4095 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4096 const char *cmd_name)
/* NOTE(review): strncpy() does not NUL-terminate ifr_name when 'name' is
 * sizeof ifr->ifr_name bytes or longer. */
4098 strncpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4099 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4100 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
/* Retrieves an IPv4 address associated with 'netdev' into '*ip' by running
 * the ifreq ioctl 'cmd' (printable name 'cmd_name') through
 * netdev_linux_do_ioctl().  Which address is fetched depends on the request
 * the caller passes in 'cmd'.
 *
 * NOTE(review): the declarations of 'ifr' and 'error', the condition guarding
 * the copy into '*ip' and the return statement are not part of this
 * excerpt. */
4108 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4109 int cmd, const char *cmd_name)
/* Mark the request's address as IPv4 before the ioctl is issued. */
4114 ifr.ifr_addr.sa_family = AF_INET;
4115 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
/* ifr_addr is viewed as a sockaddr_in so that the address field can be copied
 * out. */
4117 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4118 *ip = sin->sin_addr;