2 * Copyright (c) 2009, 2010, 2011 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
21 #include <arpa/inet.h>
23 #include <linux/gen_stats.h>
24 #include <linux/if_tun.h>
26 #include <linux/types.h>
27 #include <linux/ethtool.h>
28 #include <linux/mii.h>
29 #include <linux/pkt_sched.h>
30 #include <linux/rtnetlink.h>
31 #include <linux/sockios.h>
32 #include <linux/version.h>
33 #include <sys/types.h>
34 #include <sys/ioctl.h>
35 #include <sys/socket.h>
36 #include <netpacket/packet.h>
37 #include <net/ethernet.h>
39 #include <linux/if_tunnel.h>
40 #include <net/if_arp.h>
41 #include <net/if_packet.h>
42 #include <net/route.h>
43 #include <netinet/in.h>
50 #include "dpif-linux.h"
51 #include "dynamic-string.h"
52 #include "fatal-signal.h"
55 #include "netdev-provider.h"
56 #include "netdev-vport.h"
58 #include "netlink-socket.h"
60 #include "openflow/openflow.h"
62 #include "poll-loop.h"
63 #include "rtnetlink.h"
64 #include "rtnetlink-link.h"
65 #include "socket-util.h"
70 VLOG_DEFINE_THIS_MODULE(netdev_linux);
72 COVERAGE_DEFINE(netdev_get_vlan_vid);
73 COVERAGE_DEFINE(netdev_set_policing);
74 COVERAGE_DEFINE(netdev_arp_lookup);
75 COVERAGE_DEFINE(netdev_get_ifindex);
76 COVERAGE_DEFINE(netdev_get_hwaddr);
77 COVERAGE_DEFINE(netdev_set_hwaddr);
78 COVERAGE_DEFINE(netdev_ethtool);
80 /* These were introduced in Linux 2.6.14, so they might be missing if we have
82 #ifndef ADVERTISED_Pause
83 #define ADVERTISED_Pause (1 << 13)
85 #ifndef ADVERTISED_Asym_Pause
86 #define ADVERTISED_Asym_Pause (1 << 14)
89 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
92 #define TC_RTAB_SIZE 1024
95 static struct rtnetlink_notifier netdev_linux_cache_notifier;
96 static int cache_notifier_refcount;
99 VALID_IFINDEX = 1 << 0,
100 VALID_ETHERADDR = 1 << 1,
104 VALID_CARRIER = 1 << 5,
105 VALID_IS_PSEUDO = 1 << 6, /* Represents is_internal and is_tap. */
106 VALID_POLICING = 1 << 7,
107 VALID_HAVE_VPORT_STATS = 1 << 8
115 /* Traffic control. */
117 /* An instance of a traffic control class. Always associated with a particular
120 * Each TC implementation subclasses this with whatever additional data it
123 const struct tc_ops *ops;
124 struct hmap queues; /* Contains "struct tc_queue"s.
125 * Read by generic TC layer.
126 * Written only by TC implementation. */
129 /* One traffic control queue.
131 * Each TC implementation subclasses this with whatever additional data it
134 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
135 unsigned int queue_id; /* OpenFlow queue ID. */
138 /* A particular kind of traffic control. Each implementation generally maps to
139 * one particular Linux qdisc class.
141 * The functions below return 0 if successful or a positive errno value on
142 * failure, except where otherwise noted. All of them must be provided, except
143 * where otherwise noted. */
145 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
146 * This is null for tc_ops_default and tc_ops_other, for which there are no
147 * appropriate values. */
148 const char *linux_name;
150 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
151 const char *ovs_name;
153 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
154 * queues. The queues are numbered 0 through n_queues - 1. */
155 unsigned int n_queues;
157 /* Called to install this TC class on 'netdev'. The implementation should
158 * make the Netlink calls required to set up 'netdev' with the right qdisc
159 * and configure it according to 'details'. The implementation may assume
160 * that the current qdisc is the default; that is, there is no need for it
161 * to delete the current qdisc before installing itself.
163 * The contents of 'details' should be documented as valid for 'ovs_name'
164 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
165 * (which is built as ovs-vswitchd.conf.db(8)).
167 * This function must return 0 if and only if it sets 'netdev->tc' to an
168 * initialized 'struct tc'.
170 * (This function is null for tc_ops_other, which cannot be installed. For
171 * other TC classes it should always be nonnull.) */
172 int (*tc_install)(struct netdev *netdev, const struct shash *details);
174 /* Called when the netdev code determines (through a Netlink query) that
175 * this TC class's qdisc is installed on 'netdev', but we didn't install
176 * it ourselves and so don't know any of the details.
178 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
179 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
180 * implementation should parse the other attributes of 'nlmsg' as
181 * necessary to determine its configuration. If necessary it should also
182 * use Netlink queries to determine the configuration of queues on
185 * This function must return 0 if and only if it sets 'netdev->tc' to an
186 * initialized 'struct tc'. */
187 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
189 /* Destroys the data structures allocated by the implementation as part of
190 * 'tc'. (This includes destroying 'tc->queues' by calling
193 * The implementation should not need to perform any Netlink calls. If
194 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
195 * (But it may not be desirable.)
197 * This function may be null if 'tc' is trivial. */
198 void (*tc_destroy)(struct tc *tc);
200 /* Retrieves details of 'netdev->tc' configuration into 'details'.
202 * The implementation should not need to perform any Netlink calls, because
203 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
204 * cached the configuration.
206 * The contents of 'details' should be documented as valid for 'ovs_name'
207 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
208 * (which is built as ovs-vswitchd.conf.db(8)).
210 * This function may be null if 'tc' is not configurable.
212 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
214 /* Reconfigures 'netdev->tc' according to 'details', performing any
215 * required Netlink calls to complete the reconfiguration.
217 * The contents of 'details' should be documented as valid for 'ovs_name'
218 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
219 * (which is built as ovs-vswitchd.conf.db(8)).
221 * This function may be null if 'tc' is not configurable.
223 int (*qdisc_set)(struct netdev *, const struct shash *details);
225 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
226 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
228 * The contents of 'details' should be documented as valid for 'ovs_name'
229 * in the "other_config" column in the "Queue" table in
230 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
232 * The implementation should not need to perform any Netlink calls, because
233 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
234 * cached the queue configuration.
236 * This function may be null if 'tc' does not have queues ('n_queues' is
238 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
239 struct shash *details);
241 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
242 * 'details', performing any required Netlink calls to complete the
243 * reconfiguration. The caller ensures that 'queue_id' is less than
246 * The contents of 'details' should be documented as valid for 'ovs_name'
247 * in the "other_config" column in the "Queue" table in
248 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
250 * This function may be null if 'tc' does not have queues or its queues are
251 * not configurable. */
252 int (*class_set)(struct netdev *, unsigned int queue_id,
253 const struct shash *details);
255 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
256 * tc_queue's within 'netdev->tc->queues'.
258 * This function may be null if 'tc' does not have queues or its queues
259 * cannot be deleted. */
260 int (*class_delete)(struct netdev *, struct tc_queue *queue);
262 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
263 * 'struct tc_queue's within 'netdev->tc->queues'.
265 * On success, initializes '*stats'.
267 * This function may be null if 'tc' does not have queues or if it cannot
268 * report queue statistics. */
269 int (*class_get_stats)(const struct netdev *netdev,
270 const struct tc_queue *queue,
271 struct netdev_queue_stats *stats);
273 /* Extracts queue stats from 'nlmsg', which is a response to a
274 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
276 * This function may be null if 'tc' does not have queues or if it cannot
277 * report queue statistics. */
278 int (*class_dump_stats)(const struct netdev *netdev,
279 const struct ofpbuf *nlmsg,
280 netdev_dump_queue_stats_cb *cb, void *aux);
284 tc_init(struct tc *tc, const struct tc_ops *ops)
287 hmap_init(&tc->queues);
291 tc_destroy(struct tc *tc)
293 hmap_destroy(&tc->queues);
296 static const struct tc_ops tc_ops_htb;
297 static const struct tc_ops tc_ops_hfsc;
298 static const struct tc_ops tc_ops_default;
299 static const struct tc_ops tc_ops_other;
301 static const struct tc_ops *tcs[] = {
302 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
303 &tc_ops_hfsc, /* Hierarchical fair service curve. */
304 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
305 &tc_ops_other, /* Some other qdisc. */
309 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
310 static unsigned int tc_get_major(unsigned int handle);
311 static unsigned int tc_get_minor(unsigned int handle);
313 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
314 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
315 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
317 static struct tcmsg *tc_make_request(const struct netdev *, int type,
318 unsigned int flags, struct ofpbuf *);
319 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
321 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
322 struct nlattr **options);
323 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
324 struct nlattr **options,
325 struct netdev_queue_stats *);
326 static int tc_query_class(const struct netdev *,
327 unsigned int handle, unsigned int parent,
328 struct ofpbuf **replyp);
329 static int tc_delete_class(const struct netdev *, unsigned int handle);
331 static int tc_del_qdisc(struct netdev *netdev);
332 static int tc_query_qdisc(const struct netdev *netdev);
334 static int tc_calc_cell_log(unsigned int mtu);
335 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
336 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
337 const struct tc_ratespec *rate);
338 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
340 struct netdev_dev_linux {
341 struct netdev_dev netdev_dev;
343 struct shash_node *shash_node;
344 unsigned int cache_valid;
346 /* The following are figured out "on demand" only. They are only valid
347 * when the corresponding VALID_* bit in 'cache_valid' is set. */
349 uint8_t etheraddr[ETH_ADDR_LEN];
350 struct in_addr address, netmask;
354 bool is_internal; /* Is this an openvswitch internal device? */
355 bool is_tap; /* Is this a tuntap device? */
356 uint32_t kbits_rate; /* Policing data. */
357 uint32_t kbits_burst;
358 bool have_vport_stats;
362 struct tap_state tap;
366 struct netdev_linux {
367 struct netdev netdev;
371 /* Sockets used for ioctl operations. */
372 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
373 static int af_packet_sock = -1; /* AF_PACKET, SOCK_RAW. */
375 /* A Netlink routing socket that is not subscribed to any multicast groups. */
376 static struct nl_sock *rtnl_sock;
378 struct netdev_linux_notifier {
379 struct netdev_notifier notifier;
383 static struct shash netdev_linux_notifiers =
384 SHASH_INITIALIZER(&netdev_linux_notifiers);
385 static struct rtnetlink_notifier netdev_linux_poll_notifier;
387 /* This is set pretty low because we probably won't learn anything from the
388 * additional log messages. */
389 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
391 static int netdev_linux_init(void);
393 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
394 int cmd, const char *cmd_name);
395 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
396 const char *cmd_name);
397 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
398 int cmd, const char *cmd_name);
399 static int get_flags(const struct netdev *, int *flagsp);
400 static int set_flags(struct netdev *, int flags);
401 static int do_get_ifindex(const char *netdev_name);
402 static int get_ifindex(const struct netdev *, int *ifindexp);
403 static int do_set_addr(struct netdev *netdev,
404 int ioctl_nr, const char *ioctl_name,
405 struct in_addr addr);
406 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
407 static int set_etheraddr(const char *netdev_name, int hwaddr_family,
408 const uint8_t[ETH_ADDR_LEN]);
409 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
410 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
413 is_netdev_linux_class(const struct netdev_class *netdev_class)
415 return netdev_class->init == netdev_linux_init;
418 static struct netdev_dev_linux *
419 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
421 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
422 assert(is_netdev_linux_class(netdev_class));
424 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
427 static struct netdev_linux *
428 netdev_linux_cast(const struct netdev *netdev)
430 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
431 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
432 assert(is_netdev_linux_class(netdev_class));
434 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
438 netdev_linux_init(void)
440 static int status = -1;
442 /* Create AF_INET socket. */
443 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
444 status = af_inet_sock >= 0 ? 0 : errno;
446 VLOG_ERR("failed to create inet socket: %s", strerror(status));
448 /* Create AF_PACKET socket. */
449 af_packet_sock = socket(AF_PACKET, SOCK_RAW, 0);
450 status = af_packet_sock >= 0 ? 0 : errno;
452 VLOG_ERR("failed to create packet socket: %s",
457 /* Create rtnetlink socket. */
459 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
461 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
470 netdev_linux_run(void)
472 rtnetlink_link_notifier_run();
476 netdev_linux_wait(void)
478 rtnetlink_link_notifier_wait();
482 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
483 void *aux OVS_UNUSED)
485 struct netdev_dev_linux *dev;
487 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
489 const struct netdev_class *netdev_class =
490 netdev_dev_get_class(base_dev);
492 if (is_netdev_linux_class(netdev_class)) {
493 dev = netdev_dev_linux_cast(base_dev);
494 dev->cache_valid = 0;
498 struct shash device_shash;
499 struct shash_node *node;
501 shash_init(&device_shash);
502 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
503 SHASH_FOR_EACH (node, &device_shash) {
505 dev->cache_valid = 0;
507 shash_destroy(&device_shash);
511 /* Creates system and internal devices. */
513 netdev_linux_create(const struct netdev_class *class,
514 const char *name, const struct shash *args,
515 struct netdev_dev **netdev_devp)
517 struct netdev_dev_linux *netdev_dev;
520 if (!shash_is_empty(args)) {
521 VLOG_WARN("%s: arguments for %s devices should be empty",
525 if (!cache_notifier_refcount) {
526 error = rtnetlink_link_notifier_register(&netdev_linux_cache_notifier,
527 netdev_linux_cache_cb, NULL);
532 cache_notifier_refcount++;
534 netdev_dev = xzalloc(sizeof *netdev_dev);
535 netdev_dev_init(&netdev_dev->netdev_dev, name, args, class);
537 *netdev_devp = &netdev_dev->netdev_dev;
541 /* For most types of netdevs we open the device for each call of
542 * netdev_open(). However, this is not the case with tap devices,
543 * since it is only possible to open the device once. In this
544 * situation we share a single file descriptor, and consequently
545 * buffers, across all readers. Therefore once data is read it will
546 * be unavailable to other reads for tap devices. */
548 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
549 const char *name, const struct shash *args,
550 struct netdev_dev **netdev_devp)
552 struct netdev_dev_linux *netdev_dev;
553 struct tap_state *state;
554 static const char tap_dev[] = "/dev/net/tun";
558 if (!shash_is_empty(args)) {
559 VLOG_WARN("%s: arguments for TAP devices should be empty", name);
562 netdev_dev = xzalloc(sizeof *netdev_dev);
563 state = &netdev_dev->state.tap;
565 /* Open tap device. */
566 state->fd = open(tap_dev, O_RDWR);
569 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
573 /* Create tap device. */
574 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
575 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
576 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
577 VLOG_WARN("%s: creating tap device failed: %s", name,
583 /* Make non-blocking. */
584 error = set_nonblocking(state->fd);
589 netdev_dev_init(&netdev_dev->netdev_dev, name, args, &netdev_tap_class);
590 *netdev_devp = &netdev_dev->netdev_dev;
599 destroy_tap(struct netdev_dev_linux *netdev_dev)
601 struct tap_state *state = &netdev_dev->state.tap;
603 if (state->fd >= 0) {
608 /* Destroys the netdev device 'netdev_dev_'. */
610 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
612 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
613 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
615 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
616 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
619 if (class == &netdev_linux_class || class == &netdev_internal_class) {
620 cache_notifier_refcount--;
622 if (!cache_notifier_refcount) {
623 rtnetlink_link_notifier_unregister(&netdev_linux_cache_notifier);
625 } else if (class == &netdev_tap_class) {
626 destroy_tap(netdev_dev);
635 netdev_linux_open(struct netdev_dev *netdev_dev_, int ethertype,
636 struct netdev **netdevp)
638 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
639 struct netdev_linux *netdev;
640 enum netdev_flags flags;
643 /* Allocate network device. */
644 netdev = xzalloc(sizeof *netdev);
646 netdev_init(&netdev->netdev, netdev_dev_);
648 /* Verify that the device really exists, by attempting to read its flags.
649 * (The flags might be cached, in which case this won't actually do an
652 * Don't do this for "internal" netdevs, though, because those have to be
653 * created as netdev objects before they exist in the kernel, because
654 * creating them in the kernel happens by passing a netdev object to
655 * dpif_port_add(). */
656 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
657 error = netdev_get_flags(&netdev->netdev, &flags);
658 if (error == ENODEV) {
663 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
664 !netdev_dev->state.tap.opened) {
666 /* We assume that the first user of the tap device is the primary user
667 * and give them the tap FD. Subsequent users probably just expect
668 * this to be a system device so open it normally to avoid send/receive
669 * directions appearing to be reversed. */
670 netdev->fd = netdev_dev->state.tap.fd;
671 netdev_dev->state.tap.opened = true;
672 } else if (ethertype != NETDEV_ETH_TYPE_NONE) {
673 struct sockaddr_ll sll;
677 /* Create file descriptor. */
678 protocol = (ethertype == NETDEV_ETH_TYPE_ANY ? ETH_P_ALL
679 : ethertype == NETDEV_ETH_TYPE_802_2 ? ETH_P_802_2
681 netdev->fd = socket(PF_PACKET, SOCK_RAW, htons(protocol));
682 if (netdev->fd < 0) {
687 /* Set non-blocking mode. */
688 error = set_nonblocking(netdev->fd);
693 /* Get ethernet device index. */
694 error = get_ifindex(&netdev->netdev, &ifindex);
699 /* Bind to specific ethernet device. */
700 memset(&sll, 0, sizeof sll);
701 sll.sll_family = AF_PACKET;
702 sll.sll_ifindex = ifindex;
704 (struct sockaddr *) &sll, sizeof sll) < 0) {
706 VLOG_ERR("bind to %s failed: %s", netdev_dev_get_name(netdev_dev_),
711 /* Between the socket() and bind() calls above, the socket receives all
712 * packets of the requested type on all system interfaces. We do not
713 * want to receive that data, but there is no way to avoid it. So we
714 * must now drain out the receive queue. */
715 error = drain_rcvbuf(netdev->fd);
721 *netdevp = &netdev->netdev;
725 netdev_uninit(&netdev->netdev, true);
729 /* Closes and destroys 'netdev'. */
731 netdev_linux_close(struct netdev *netdev_)
733 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* Any non-negative value is a live descriptor (the receive and wait paths
 * treat only 'fd < 0' as "no descriptor"), so descriptor 0 must be closed
 * too.  Devices of type "tap" are excluded by the strcmp() test. */
735 if (netdev->fd >= 0 && strcmp(netdev_get_type(netdev_), "tap")) {
741 /* Initializes 'sset' with a list of the names of all known network devices. */
743 netdev_linux_enumerate(struct sset *sset)
745 struct if_nameindex *names;
747 names = if_nameindex();
751 for (i = 0; names[i].if_name != NULL; i++) {
752 sset_add(sset, names[i].if_name);
754 if_freenameindex(names);
757 VLOG_WARN("could not obtain list of network device names: %s",
764 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
766 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
768 if (netdev->fd < 0) {
769 /* Device was opened with NETDEV_ETH_TYPE_NONE. */
774 ssize_t retval = read(netdev->fd, data, size);
777 } else if (errno != EINTR) {
778 if (errno != EAGAIN) {
/* Argument order must follow the format string: device name first,
 * then the error text. */
779 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
780 netdev_get_name(netdev_), strerror(errno));
787 /* Registers with the poll loop to wake up from the next call to poll_block()
788 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
790 netdev_linux_recv_wait(struct netdev *netdev_)
792 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
793 if (netdev->fd >= 0) {
794 poll_fd_wait(netdev->fd, POLLIN);
798 /* Discards all packets waiting to be received from 'netdev'. */
800 netdev_linux_drain(struct netdev *netdev_)
802 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
803 if (netdev->fd < 0) {
805 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
807 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
808 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
812 drain_fd(netdev->fd, ifr.ifr_qlen);
815 return drain_rcvbuf(netdev->fd);
819 /* Sends 'data' on 'netdev'. Returns 0 if successful, otherwise a positive
820 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
821 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
822 * the packet is too big or too small to transmit on the device.
824 * The caller retains ownership of 'data' in all cases.
826 * The kernel maintains a packet transmission queue, so the caller is not
827 * expected to do additional queuing of packets. */
829 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
831 struct sockaddr_ll sll;
837 error = get_ifindex(netdev_, &ifindex);
842 /* We don't bother setting most fields in sockaddr_ll because the kernel
843 * ignores them for SOCK_RAW. */
844 memset(&sll, 0, sizeof sll);
845 sll.sll_family = AF_PACKET;
846 sll.sll_ifindex = ifindex;
848 iov.iov_base = (void *) data;
852 msg.msg_namelen = sizeof sll;
855 msg.msg_control = NULL;
856 msg.msg_controllen = 0;
860 ssize_t retval = sendmsg(af_packet_sock, &msg, 0);
862 /* The Linux AF_PACKET implementation never blocks waiting for room
863 * for packets, instead returning ENOBUFS. Translate this into
864 * EAGAIN for the caller. */
865 if (errno == ENOBUFS) {
867 } else if (errno == EINTR) {
869 } else if (errno != EAGAIN) {
870 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
871 netdev_get_name(netdev_), strerror(errno));
874 } else if (retval != size) {
875 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
876 "%zu) on %s", retval, size, netdev_get_name(netdev_));
884 /* Registers with the poll loop to wake up from the next call to poll_block()
885 * when the packet transmission queue has sufficient room to transmit a packet
886 * with netdev_send().
888 * The kernel maintains a packet transmission queue, so the client is not
889 * expected to do additional queuing of packets. Thus, this function is
890 * unlikely to ever be used. It is included for completeness. */
892 netdev_linux_send_wait(struct netdev *netdev_)
894 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
895 if (netdev->fd < 0) {
897 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
898 poll_fd_wait(netdev->fd, POLLOUT);
900 /* TAP device always accepts packets.*/
901 poll_immediate_wake();
905 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
906 * otherwise a positive errno value. */
908 netdev_linux_set_etheraddr(struct netdev *netdev_,
909 const uint8_t mac[ETH_ADDR_LEN])
911 struct netdev_dev_linux *netdev_dev =
912 netdev_dev_linux_cast(netdev_get_dev(netdev_));
915 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
916 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
917 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
919 netdev_dev->cache_valid |= VALID_ETHERADDR;
920 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
928 /* Copies 'netdev''s MAC address into 'mac'. If the address is not yet cached
929 * it is first fetched with get_etheraddr() and cached. */
931 netdev_linux_get_etheraddr(const struct netdev *netdev_,
932 uint8_t mac[ETH_ADDR_LEN])
934 struct netdev_dev_linux *netdev_dev =
935 netdev_dev_linux_cast(netdev_get_dev(netdev_));
936 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
937 int error = get_etheraddr(netdev_get_name(netdev_),
938 netdev_dev->etheraddr);
942 netdev_dev->cache_valid |= VALID_ETHERADDR;
944 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
948 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
949 * in bytes, not including the hardware header; thus, this is typically 1500
950 * bytes for Ethernet devices. */
952 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
954 struct netdev_dev_linux *netdev_dev =
955 netdev_dev_linux_cast(netdev_get_dev(netdev_));
956 if (!(netdev_dev->cache_valid & VALID_MTU)) {
960 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
961 SIOCGIFMTU, "SIOCGIFMTU");
965 netdev_dev->mtu = ifr.ifr_mtu;
966 netdev_dev->cache_valid |= VALID_MTU;
968 *mtup = netdev_dev->mtu;
972 /* Returns the ifindex of 'netdev', if successful, as a positive number.
973 * On failure, returns a negative errno value. */
975 netdev_linux_get_ifindex(const struct netdev *netdev)
979 error = get_ifindex(netdev, &ifindex);
980 return error ? -error : ifindex;
984 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
986 struct netdev_dev_linux *netdev_dev =
987 netdev_dev_linux_cast(netdev_get_dev(netdev_));
992 if (!(netdev_dev->cache_valid & VALID_CARRIER)) {
996 fn = xasprintf("/sys/class/net/%s/carrier",
997 netdev_get_name(netdev_));
998 fd = open(fn, O_RDONLY);
1001 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
1005 retval = read(fd, line, sizeof line);
1008 if (error == EINVAL) {
1009 /* This is the normal return value when we try to check carrier
1010 * if the network device is not up. */
1012 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
1015 } else if (retval == 0) {
1017 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
1021 if (line[0] != '0' && line[0] != '1') {
1023 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)",
1027 netdev_dev->carrier = line[0] != '0';
1028 netdev_dev->cache_valid |= VALID_CARRIER;
1030 *carrier = netdev_dev->carrier;
1042 netdev_linux_do_miimon(const struct netdev *netdev, int cmd,
1043 const char *cmd_name, struct mii_ioctl_data *data)
1048 memset(&ifr, 0, sizeof ifr);
1049 memcpy(&ifr.ifr_data, data, sizeof *data);
1050 error = netdev_linux_do_ioctl(netdev_get_name(netdev),
1051 &ifr, cmd, cmd_name);
1052 memcpy(data, &ifr.ifr_data, sizeof *data);
1058 netdev_linux_get_miimon(const struct netdev *netdev, bool *miimon)
1060 const char *name = netdev_get_name(netdev);
1061 struct mii_ioctl_data data;
1066 memset(&data, 0, sizeof data);
1067 error = netdev_linux_do_miimon(netdev, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1069 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1070 data.reg_num = MII_BMSR;
1071 error = netdev_linux_do_miimon(netdev, SIOCGMIIREG, "SIOCGMIIREG",
1075 *miimon = !!(data.val_out & BMSR_LSTATUS);
1077 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1080 struct ethtool_cmd ecmd;
1082 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1085 memset(&ecmd, 0, sizeof ecmd);
1086 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1089 struct ethtool_value eval;
1091 memcpy(&eval, &ecmd, sizeof eval);
1092 *miimon = !!eval.data;
1094 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1101 /* Check whether we can use RTM_GETLINK to get network device statistics.
1102 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1105 check_for_working_netlink_stats(void)
1107 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1108 * preferable, so if that works, we'll use it. */
1109 int ifindex = do_get_ifindex("lo");
1111 VLOG_WARN("failed to get ifindex for lo, "
1112 "obtaining netdev stats from proc");
1115 struct netdev_stats stats;
1116 int error = get_stats_via_netlink(ifindex, &stats);
1118 VLOG_DBG("obtaining netdev stats via rtnetlink");
1121 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1122 "via proc (you are probably running a pre-2.6.19 "
1123 "kernel)", strerror(error));
1129 /* Brings the 'is_internal' and 'is_tap' members of 'netdev_dev' up-to-date. */
1131 netdev_linux_update_is_pseudo(struct netdev_dev_linux *netdev_dev)
1133 if (!(netdev_dev->cache_valid & VALID_IS_PSEUDO)) {
1134 const char *name = netdev_dev_get_name(&netdev_dev->netdev_dev);
1135 const char *type = netdev_dev_get_type(&netdev_dev->netdev_dev);
1137 netdev_dev->is_tap = !strcmp(type, "tap");
1138 netdev_dev->is_internal = (!netdev_dev->is_tap
1139 && dpif_linux_is_internal_device(name));
1140 netdev_dev->cache_valid |= VALID_IS_PSEUDO;
1145 swap_uint64(uint64_t *a, uint64_t *b)
1152 /* Retrieves current device stats for 'netdev'.
 *
 * The vport layer (netdev_vport_get_stats()) is consulted when it has worked
 * before or has not been tried yet for this device.  When it cannot supply
 * stats, they come from rtnetlink or from proc, depending on what
 * check_for_working_netlink_stats() reported the first time it was called. */
1154 netdev_linux_get_stats(const struct netdev *netdev_,
1155 struct netdev_stats *stats)
1157 struct netdev_dev_linux *netdev_dev =
1158 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1159 static int use_netlink_stats = -1; /* Negative until the probe below has run. */
1162 if (netdev_dev->have_vport_stats || /* Try vport stats if they worked before or are untested. */
1163 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1165 error = netdev_vport_get_stats(netdev_, stats);
1166 netdev_dev->have_vport_stats = !error; /* Remember whether the vport layer can supply stats. */
1167 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
1170 if (!netdev_dev->have_vport_stats) {
1171 if (use_netlink_stats < 0) {
1172 use_netlink_stats = check_for_working_netlink_stats();
1174 if (use_netlink_stats) {
1177 error = get_ifindex(netdev_, &ifindex);
1179 error = get_stats_via_netlink(ifindex, stats);
1182 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1186 /* If this port is an internal port then the transmit and receive stats
1187 * will appear to be swapped relative to the other ports since we are the
1188 * one sending the data, not a remote computer. For consistency, we swap
1189 * them back here. This does not apply if we are getting stats from the
1190 * vport layer because it always tracks stats from the perspective of the
1192 netdev_linux_update_is_pseudo(netdev_dev);
1193 if (!error && !netdev_dev->have_vport_stats &&
1194 (netdev_dev->is_internal || netdev_dev->is_tap)) {
1195 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1196 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1197 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1198 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1199 stats->rx_length_errors = 0;
1200 stats->rx_over_errors = 0;
1201 stats->rx_crc_errors = 0;
1202 stats->rx_frame_errors = 0;
1203 stats->rx_fifo_errors = 0;
1204 stats->rx_missed_errors = 0;
1205 stats->tx_aborted_errors = 0;
1206 stats->tx_carrier_errors = 0;
1207 stats->tx_fifo_errors = 0;
1208 stats->tx_heartbeat_errors = 0;
1209 stats->tx_window_errors = 0;
1215 /* Stores the features supported by 'netdev' into each of '*current',
1216 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1217 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
1218 * successful, otherwise a positive errno value.
 *
 * All values are translated from the reply to a single ETHTOOL_GSET
 * request. */
1220 netdev_linux_get_features(const struct netdev *netdev,
1221 uint32_t *current, uint32_t *advertised,
1222 uint32_t *supported, uint32_t *peer)
1224 struct ethtool_cmd ecmd;
1227 memset(&ecmd, 0, sizeof ecmd);
1228 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1229 ETHTOOL_GSET, "ETHTOOL_GSET");
1234 /* Supported features: each SUPPORTED_* bit maps to one OFPPF_* bit. */
1236 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1237 *supported |= OFPPF_10MB_HD;
1239 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1240 *supported |= OFPPF_10MB_FD;
1242 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1243 *supported |= OFPPF_100MB_HD;
1245 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1246 *supported |= OFPPF_100MB_FD;
1248 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1249 *supported |= OFPPF_1GB_HD;
1251 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1252 *supported |= OFPPF_1GB_FD;
1254 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1255 *supported |= OFPPF_10GB_FD;
1257 if (ecmd.supported & SUPPORTED_TP) {
1258 *supported |= OFPPF_COPPER;
1260 if (ecmd.supported & SUPPORTED_FIBRE) {
1261 *supported |= OFPPF_FIBER;
1263 if (ecmd.supported & SUPPORTED_Autoneg) {
1264 *supported |= OFPPF_AUTONEG;
1266 if (ecmd.supported & SUPPORTED_Pause) {
1267 *supported |= OFPPF_PAUSE;
1269 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1270 *supported |= OFPPF_PAUSE_ASYM;
1273 /* Advertised features: each ADVERTISED_* bit maps to one OFPPF_* bit. */
1275 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1276 *advertised |= OFPPF_10MB_HD;
1278 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1279 *advertised |= OFPPF_10MB_FD;
1281 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1282 *advertised |= OFPPF_100MB_HD;
1284 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1285 *advertised |= OFPPF_100MB_FD;
1287 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1288 *advertised |= OFPPF_1GB_HD;
1290 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1291 *advertised |= OFPPF_1GB_FD;
1293 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1294 *advertised |= OFPPF_10GB_FD;
1296 if (ecmd.advertising & ADVERTISED_TP) {
1297 *advertised |= OFPPF_COPPER;
1299 if (ecmd.advertising & ADVERTISED_FIBRE) {
1300 *advertised |= OFPPF_FIBER;
1302 if (ecmd.advertising & ADVERTISED_Autoneg) {
1303 *advertised |= OFPPF_AUTONEG;
1305 if (ecmd.advertising & ADVERTISED_Pause) {
1306 *advertised |= OFPPF_PAUSE;
1308 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1309 *advertised |= OFPPF_PAUSE_ASYM;
1312 /* Current settings: speed and duplex first, then the port type. */
1313 if (ecmd.speed == SPEED_10) {
1314 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD; /* Zero 'duplex' means half duplex. */
1315 } else if (ecmd.speed == SPEED_100) {
1316 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1317 } else if (ecmd.speed == SPEED_1000) {
1318 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1319 } else if (ecmd.speed == SPEED_10000) {
1320 *current = OFPPF_10GB_FD; /* 10G is reported as full duplex regardless of 'duplex'. */
1325 if (ecmd.port == PORT_TP) {
1326 *current |= OFPPF_COPPER;
1327 } else if (ecmd.port == PORT_FIBRE) {
1328 *current |= OFPPF_FIBER;
1332 *current |= OFPPF_AUTONEG;
1335 /* Peer advertisements. */
1336 *peer = 0; /* XXX */
1341 /* Set the features advertised by 'netdev' to 'advertise'.
 *
 * 'advertise' is a bitmap of OFPPF_* bits.  The current settings are read with
 * ETHTOOL_GSET, the advertising mask is rebuilt from scratch from 'advertise',
 * and the result of the ETHTOOL_SSET request is returned. */
1343 netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1345 struct ethtool_cmd ecmd;
1348 memset(&ecmd, 0, sizeof ecmd);
1349 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1350 ETHTOOL_GSET, "ETHTOOL_GSET");
1355 ecmd.advertising = 0; /* Discard the old mask; only bits named in 'advertise' are set below. */
1356 if (advertise & OFPPF_10MB_HD) {
1357 ecmd.advertising |= ADVERTISED_10baseT_Half;
1359 if (advertise & OFPPF_10MB_FD) {
1360 ecmd.advertising |= ADVERTISED_10baseT_Full;
1362 if (advertise & OFPPF_100MB_HD) {
1363 ecmd.advertising |= ADVERTISED_100baseT_Half;
1365 if (advertise & OFPPF_100MB_FD) {
1366 ecmd.advertising |= ADVERTISED_100baseT_Full;
1368 if (advertise & OFPPF_1GB_HD) {
1369 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1371 if (advertise & OFPPF_1GB_FD) {
1372 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1374 if (advertise & OFPPF_10GB_FD) {
1375 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1377 if (advertise & OFPPF_COPPER) {
1378 ecmd.advertising |= ADVERTISED_TP;
1380 if (advertise & OFPPF_FIBER) {
1381 ecmd.advertising |= ADVERTISED_FIBRE;
1383 if (advertise & OFPPF_AUTONEG) {
1384 ecmd.advertising |= ADVERTISED_Autoneg;
1386 if (advertise & OFPPF_PAUSE) {
1387 ecmd.advertising |= ADVERTISED_Pause;
1389 if (advertise & OFPPF_PAUSE_ASYM) {
1390 ecmd.advertising |= ADVERTISED_Asym_Pause;
1392 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1393 ETHTOOL_SSET, "ETHTOOL_SSET");
1396 /* If 'netdev_name' is the name of a VLAN network device (e.g. one created with
1397 * vconfig(8)), sets '*vlan_vid' to the VLAN VID associated with that device
1398 * and returns 0. Otherwise returns an errno value (specifically ENOENT if
1399 * 'netdev_name' is the name of a network device that is not a VLAN device) and
1400 * sets '*vlan_vid' to -1.
 *
 * The VID is parsed from the first line of /proc/net/vlan/<netdev_name>. */
1402 netdev_linux_get_vlan_vid(const struct netdev *netdev, int *vlan_vid)
1404 const char *netdev_name = netdev_get_name(netdev);
1405 struct ds line = DS_EMPTY_INITIALIZER;
1406 FILE *stream = NULL;
1410 COVERAGE_INC(netdev_get_vlan_vid);
1411 fn = xasprintf("/proc/net/vlan/%s", netdev_name);
1412 stream = fopen(fn, "r");
1418 if (ds_get_line(&line, stream)) {
1419 if (ferror(stream)) {
1421 VLOG_ERR_RL(&rl, "error reading \"%s\": %s", fn, strerror(errno));
1424 VLOG_ERR_RL(&rl, "unexpected end of file reading \"%s\"", fn);
/* sscanf() returns EOF, not 0, when the input runs out before the first
 * conversion, so require exactly one successful conversion instead of testing
 * the result for zero. */
1429 if (sscanf(ds_cstr(&line), "%*s VID: %d", vlan_vid) != 1) {
1431 VLOG_ERR_RL(&rl, "parse error reading \"%s\" line 1: \"%s\"",
1432 fn, ds_cstr(&line));
/* Shell command templates used by netdev_linux_set_policing().  POLICE_ADD_CMD
 * takes the device name.  POLICE_CONFIG_CMD takes the device name followed by
 * the rate and the burst; both are passed as uint32_t, so they are formatted
 * with %u to keep values above INT_MAX from being printed as negative. */
1450 #define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress"
1451 #define POLICE_CONFIG_CMD "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 u32 match ip src 0.0.0.0/0 police rate %ukbit burst %uk mtu 65535 drop flowid :1"
/* Outline of the visible steps: an RTM_DELQDISC request for the "ingress"
 * qdisc (handle ffff:, parent TC_H_INGRESS) is built and sent with
 * tc_transact(); failures other than ENOENT and EINVAL are logged; and a rate
 * and burst of 0 are stored in 'netdev_dev' with VALID_POLICING set. */
1453 /* Remove ingress policing from 'netdev'. Returns 0 if successful, otherwise a
1454 * positive errno value.
1456 * This function is equivalent to running
1457 * /sbin/tc qdisc del dev %s handle ffff: ingress
1458 * but it is much, much faster.
1461 netdev_linux_remove_policing(struct netdev *netdev)
1463 struct netdev_dev_linux *netdev_dev =
1464 netdev_dev_linux_cast(netdev_get_dev(netdev));
1465 const char *netdev_name = netdev_get_name(netdev);
1467 struct ofpbuf request;
1468 struct tcmsg *tcmsg;
1471 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
1475 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
1476 tcmsg->tcm_parent = TC_H_INGRESS;
1477 nl_msg_put_string(&request, TCA_KIND, "ingress");
1478 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
1480 error = tc_transact(&request, NULL);
1481 if (error && error != ENOENT && error != EINVAL) {
1482 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1483 netdev_name, strerror(error));
1487 netdev_dev->kbits_rate = 0;
1488 netdev_dev->kbits_burst = 0;
1489 netdev_dev->cache_valid |= VALID_POLICING;
1493 /* Attempts to set input rate limiting (policing) policy.
 *
 * The rate and burst last applied are cached in 'netdev_dev', so a repeated
 * request with the same values is recognized without touching the device.
 * Otherwise existing policing is removed and the new policy is installed by
 * running the POLICE_ADD_CMD and POLICE_CONFIG_CMD shell commands.
 *
 * NOTE(review): the device name is interpolated into command lines that are
 * run through system(); confirm that no caller can supply a name containing
 * shell metacharacters. */
1495 netdev_linux_set_policing(struct netdev *netdev,
1496 uint32_t kbits_rate, uint32_t kbits_burst)
1498 struct netdev_dev_linux *netdev_dev =
1499 netdev_dev_linux_cast(netdev_get_dev(netdev));
1500 const char *netdev_name = netdev_get_name(netdev);
1503 COVERAGE_INC(netdev_set_policing);
1505 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1506 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1507 : kbits_burst); /* Stick with user-specified value. */
1509 if (netdev_dev->cache_valid & VALID_POLICING
1510 && netdev_dev->kbits_rate == kbits_rate
1511 && netdev_dev->kbits_burst == kbits_burst) {
1512 /* Assume that settings haven't changed since we last set them. */
1516 netdev_linux_remove_policing(netdev); /* Start from a clean slate before adding the new policy. */
1518 snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
1519 if (system(command) != 0) {
1520 VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
1524 snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
1525 kbits_rate, kbits_burst);
1526 if (system(command) != 0) {
1527 VLOG_WARN_RL(&rl, "%s: problem configuring policing",
1532 netdev_dev->kbits_rate = kbits_rate; /* Cache what was applied for the comparison above. */
1533 netdev_dev->kbits_burst = kbits_burst;
1534 netdev_dev->cache_valid |= VALID_POLICING;
/* Adds to 'types' the OVS name of each entry in 'tcs' that provides a
 * 'tc_install' function and has a nonempty 'ovs_name'. */
1541 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1544 const struct tc_ops **opsp;
1546 for (opsp = tcs; *opsp != NULL; opsp++) { /* 'tcs' ends with a null pointer. */
1547 const struct tc_ops *ops = *opsp;
1548 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1549 sset_add(types, ops->ovs_name);
/* Searches 'tcs' for the entry whose 'ovs_name' equals 'name'.
 *
 * NOTE(review): the return statements are not visible here; presumably the
 * matching entry is returned, or NULL when nothing matches. */
1555 static const struct tc_ops *
1556 tc_lookup_ovs_name(const char *name)
1558 const struct tc_ops **opsp;
1560 for (opsp = tcs; *opsp != NULL; opsp++) {
1561 const struct tc_ops *ops = *opsp;
1562 if (!strcmp(name, ops->ovs_name)) {
/* Searches 'tcs' for the entry whose 'linux_name' equals 'name'.  Entries with
 * a null 'linux_name' never match.
 *
 * NOTE(review): the return statements are not visible here; presumably the
 * matching entry is returned, or NULL when nothing matches. */
1569 static const struct tc_ops *
1570 tc_lookup_linux_name(const char *name)
1572 const struct tc_ops **opsp;
1574 for (opsp = tcs; *opsp != NULL; opsp++) {
1575 const struct tc_ops *ops = *opsp;
1576 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Searches the bucket selected by 'hash' in the queue map of the tc state of
 * 'netdev' for a queue whose id is 'queue_id'.
 *
 * NOTE(review): the return statements are not visible here; presumably the
 * matching queue is returned, or NULL when there is none. */
1583 static struct tc_queue *
1584 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1587 struct netdev_dev_linux *netdev_dev =
1588 netdev_dev_linux_cast(netdev_get_dev(netdev));
1589 struct tc_queue *queue;
1591 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1592 if (queue->queue_id == queue_id) {
/* Looks up 'queue_id' on 'netdev' through tc_find_queue__(), using
 * hash_int(queue_id, 0) as the hash, and returns whatever that returns. */
1599 static struct tc_queue *
1600 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1602 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Looks up the QoS type named 'type' with tc_lookup_ovs_name() and copies the
 * 'n_queues' of that tc implementation into 'caps'.  'netdev' is unused. */
1606 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1608 struct netdev_qos_capabilities *caps)
1610 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1614 caps->n_queues = ops->n_queues;
/* Queries the qdisc on 'netdev' with tc_query_qdisc(), stores the OVS name of
 * the tc implementation in use in '*typep', and, if that implementation has a
 * 'qdisc_get' function, calls it to fill in 'details'. */
1619 netdev_linux_get_qos(const struct netdev *netdev,
1620 const char **typep, struct shash *details)
1622 struct netdev_dev_linux *netdev_dev =
1623 netdev_dev_linux_cast(netdev_get_dev(netdev));
1626 error = tc_query_qdisc(netdev);
1631 *typep = netdev_dev->tc->ops->ovs_name;
1632 return (netdev_dev->tc->ops->qdisc_get
1633 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
/* Changes the QoS configuration of 'netdev' to the type named 'type' with the
 * given 'details'.
 *
 * When 'type' resolves to the tc implementation already in use, only its
 * 'qdisc_set' function (if any) is called.  Otherwise the existing qdisc is
 * deleted and the new implementation's 'tc_install' function is called. */
1638 netdev_linux_set_qos(struct netdev *netdev,
1639 const char *type, const struct shash *details)
1641 struct netdev_dev_linux *netdev_dev =
1642 netdev_dev_linux_cast(netdev_get_dev(netdev));
1643 const struct tc_ops *new_ops;
1646 new_ops = tc_lookup_ovs_name(type);
1647 if (!new_ops || !new_ops->tc_install) { /* Unknown type, or one that cannot be installed. */
1651 error = tc_query_qdisc(netdev);
1656 if (new_ops == netdev_dev->tc->ops) {
1657 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1659 /* Delete existing qdisc. */
1660 error = tc_del_qdisc(netdev);
1664 assert(netdev_dev->tc == NULL);
1666 /* Install new qdisc. */
1667 error = new_ops->tc_install(netdev, details);
1668 assert((error == 0) == (netdev_dev->tc != NULL)); /* 'tc' must be set exactly when installation succeeded. */
/* Queries the qdisc on 'netdev', looks up 'queue_id' with tc_find_queue(), and
 * passes the queue to the tc implementation's 'class_get' function to fill in
 * 'details'. */
1675 netdev_linux_get_queue(const struct netdev *netdev,
1676 unsigned int queue_id, struct shash *details)
1678 struct netdev_dev_linux *netdev_dev =
1679 netdev_dev_linux_cast(netdev_get_dev(netdev));
1682 error = tc_query_qdisc(netdev);
1686 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1688 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
/* Configures queue 'queue_id' on 'netdev' from 'details' by calling the tc
 * implementation's 'class_set' function.  The call is not made when 'queue_id'
 * is not below the implementation's 'n_queues' or when the implementation has
 * no 'class_set' function. */
1694 netdev_linux_set_queue(struct netdev *netdev,
1695 unsigned int queue_id, const struct shash *details)
1697 struct netdev_dev_linux *netdev_dev =
1698 netdev_dev_linux_cast(netdev_get_dev(netdev));
1701 error = tc_query_qdisc(netdev);
1704 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1705 || !netdev_dev->tc->ops->class_set) {
1709 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
/* Deletes queue 'queue_id' from 'netdev' by looking it up with tc_find_queue()
 * and passing it to the tc implementation's 'class_delete' function.  A
 * separate branch handles implementations without 'class_delete'. */
1713 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1715 struct netdev_dev_linux *netdev_dev =
1716 netdev_dev_linux_cast(netdev_get_dev(netdev));
1719 error = tc_query_qdisc(netdev);
1722 } else if (!netdev_dev->tc->ops->class_delete) {
1725 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1727 ? netdev_dev->tc->ops->class_delete(netdev, queue)
/* Retrieves statistics for queue 'queue_id' on 'netdev' into 'stats' by
 * looking the queue up with tc_find_queue() and passing it to the tc
 * implementation's 'class_get_stats' function.  A separate branch handles
 * implementations without 'class_get_stats'. */
1733 netdev_linux_get_queue_stats(const struct netdev *netdev,
1734 unsigned int queue_id,
1735 struct netdev_queue_stats *stats)
1737 struct netdev_dev_linux *netdev_dev =
1738 netdev_dev_linux_cast(netdev_get_dev(netdev));
1741 error = tc_query_qdisc(netdev);
1744 } else if (!netdev_dev->tc->ops->class_get_stats) {
1747 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1749 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
/* Starts a Netlink dump of the traffic classes on 'netdev' into 'dump': an
 * RTM_GETTCLASS request with 'tcm_parent' 0 is built, the dump is started on
 * 'rtnl_sock', and the request buffer is released. */
1755 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1757 struct ofpbuf request;
1758 struct tcmsg *tcmsg;
1760 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1764 tcmsg->tcm_parent = 0;
1765 nl_dump_start(dump, rtnl_sock, &request);
1766 ofpbuf_uninit(&request); /* The request buffer is no longer needed once the dump has started. */
/* Iterates over every queue in the tc queue map of 'netdev'.  For each queue
 * the tc implementation's 'class_get' function fills a temporary 'details'
 * map, and 'cb' is invoked with the queue id, those details, and 'aux'. */
1771 netdev_linux_dump_queues(const struct netdev *netdev,
1772 netdev_dump_queues_cb *cb, void *aux)
1774 struct netdev_dev_linux *netdev_dev =
1775 netdev_dev_linux_cast(netdev_get_dev(netdev));
1776 struct tc_queue *queue;
1777 struct shash details;
1781 error = tc_query_qdisc(netdev);
1784 } else if (!netdev_dev->tc->ops->class_get) {
1789 shash_init(&details);
1790 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
1791 shash_clear(&details); /* Reuse one map for every queue. */
1793 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
1795 (*cb)(queue->queue_id, &details, aux);
1800 shash_destroy(&details);
/* Dumps the traffic classes on 'netdev' over Netlink and hands each reply
 * message to the tc implementation's 'class_dump_stats' function together with
 * 'cb' and 'aux'.  An error from finishing the dump takes precedence over
 * 'last_error' in the return value. */
1806 netdev_linux_dump_queue_stats(const struct netdev *netdev,
1807 netdev_dump_queue_stats_cb *cb, void *aux)
1809 struct netdev_dev_linux *netdev_dev =
1810 netdev_dev_linux_cast(netdev_get_dev(netdev));
1811 struct nl_dump dump;
1816 error = tc_query_qdisc(netdev);
1819 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1824 if (!start_queue_dump(netdev, &dump)) {
1827 while (nl_dump_next(&dump, &msg)) {
1828 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1834 error = nl_dump_done(&dump);
1835 return error ? error : last_error;
/* Stores the IPv4 address and netmask of 'netdev_' in '*address' and
 * '*netmask'.
 *
 * When VALID_IN4 is not set in the device's cache, both values are fetched
 * with SIOCGIFADDR and SIOCGIFNETMASK and cached in 'netdev_dev'.  The final
 * return value is EADDRNOTAVAIL when the address is INADDR_ANY and 0
 * otherwise. */
1839 netdev_linux_get_in4(const struct netdev *netdev_,
1840 struct in_addr *address, struct in_addr *netmask)
1842 struct netdev_dev_linux *netdev_dev =
1843 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1845 if (!(netdev_dev->cache_valid & VALID_IN4)) {
1848 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
1849 SIOCGIFADDR, "SIOCGIFADDR");
1854 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
1855 SIOCGIFNETMASK, "SIOCGIFNETMASK");
1860 netdev_dev->cache_valid |= VALID_IN4;
1862 *address = netdev_dev->address;
1863 *netmask = netdev_dev->netmask;
1864 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
/* Assigns IPv4 'address' and 'netmask' to 'netdev_'.
 *
 * The address is set with SIOCSIFADDR and both values are stored in the
 * device's cache with VALID_IN4 set.  The netmask is set with SIOCSIFNETMASK
 * only when 'address' is not INADDR_ANY. */
1868 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
1869 struct in_addr netmask)
1871 struct netdev_dev_linux *netdev_dev =
1872 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1875 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
1877 netdev_dev->cache_valid |= VALID_IN4;
1878 netdev_dev->address = address;
1879 netdev_dev->netmask = netmask;
1880 if (address.s_addr != INADDR_ANY) {
1881 error = do_set_addr(netdev_, SIOCSIFNETMASK,
1882 "SIOCSIFNETMASK", netmask);
/* Parses 'line' into the 16 bytes of 'in6' and the interface name 'ifname'.
 *
 * The format consists of 16 two-digit hexadecimal bytes, four hexadecimal
 * fields that are skipped, and an interface name of at most 16 characters.
 * netdev_linux_get_in6() applies it to lines of /proc/net/if_inet6. */
1889 parse_if_inet6_line(const char *line,
1890 struct in6_addr *in6, char ifname[16 + 1])
1892 uint8_t *s6 = in6->s6_addr;
1893 #define X8 "%2"SCNx8
1895 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
1896 "%*x %*x %*x %*x %16s\n",
1897 &s6[0], &s6[1], &s6[2], &s6[3],
1898 &s6[4], &s6[5], &s6[6], &s6[7],
1899 &s6[8], &s6[9], &s6[10], &s6[11],
1900 &s6[12], &s6[13], &s6[14], &s6[15],
1904 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
1905 * 'in6' is non-null) and returns true. Otherwise, returns false.
 *
 * The address is looked up in /proc/net/if_inet6 by interface name and cached
 * in 'netdev_dev' under VALID_IN6; it is in6addr_any unless a matching line is
 * found. */
1907 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
1909 struct netdev_dev_linux *netdev_dev =
1910 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1911 if (!(netdev_dev->cache_valid & VALID_IN6)) {
1915 netdev_dev->in6 = in6addr_any; /* Default when no address is found. */
1917 file = fopen("/proc/net/if_inet6", "r");
1919 const char *name = netdev_get_name(netdev_);
1920 while (fgets(line, sizeof line, file)) {
1921 struct in6_addr in6_tmp;
1922 char ifname[16 + 1];
1923 if (parse_if_inet6_line(line, &in6_tmp, ifname)
1924 && !strcmp(name, ifname))
1926 netdev_dev->in6 = in6_tmp;
1932 netdev_dev->cache_valid |= VALID_IN6;
1934 *in6 = netdev_dev->in6;
/* Fills '*sa' with an AF_INET socket address for 'addr'.  '*sa' is zeroed
 * first and a zero-initialized 'struct sockaddr_in' carrying the family and
 * address is then copied over it. */
1939 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
1941 struct sockaddr_in sin;
1942 memset(&sin, 0, sizeof sin);
1943 sin.sin_family = AF_INET;
1944 sin.sin_addr = addr;
1947 memset(sa, 0, sizeof *sa);
1948 memcpy(sa, &sin, sizeof sin);
/* Issues the ioctl 'ioctl_nr' for 'netdev' through netdev_linux_do_ioctl(),
 * using an interface request that names the device and carries 'addr' as an
 * IPv4 socket address, and returns the result of that call. */
1952 do_set_addr(struct netdev *netdev,
1953 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
1956 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
1957 make_in4_sockaddr(&ifr.ifr_addr, addr);
1959 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
1963 /* Adds 'router' as a default IP gateway.
 *
 * The route is added with the SIOCADDRT ioctl on 'af_inet_sock', using
 * INADDR_ANY for both the destination and the mask.  'netdev' is unused. */
1965 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
1967 struct in_addr any = { INADDR_ANY };
1971 memset(&rt, 0, sizeof rt);
1972 make_in4_sockaddr(&rt.rt_dst, any);
1973 make_in4_sockaddr(&rt.rt_gateway, router);
1974 make_in4_sockaddr(&rt.rt_genmask, any);
1975 rt.rt_flags = RTF_UP | RTF_GATEWAY;
1976 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
1978 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
/* Looks up the route to 'host' in /proc/net/route.
 *
 * Routes that are not up are skipped.  For a route whose destination matches
 * 'host' under the route's mask, '*next_hop' is set to 0 when the host is
 * directly reachable or to the gateway address otherwise, and '*netdev_name'
 * is set to an xstrdup() copy of the interface name.  '*netdev_name' is set to
 * NULL before the file is opened. */
1984 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
1987 static const char fn[] = "/proc/net/route";
1992 *netdev_name = NULL;
1993 stream = fopen(fn, "r");
1994 if (stream == NULL) {
1995 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2000 while (fgets(line, sizeof line, stream)) {
2003 uint32_t dest, gateway, mask;
2004 int refcnt, metric, mtu;
2005 unsigned int flags, use, window, irtt;
2008 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2010 iface, &dest, &gateway, &flags, &refcnt,
2011 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2013 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2017 if (!(flags & RTF_UP)) {
2018 /* Skip routes that aren't up. */
2022 /* The values of 'dest', 'mask', and 'gateway' were given in
2023 * network byte order, so we don't need any endian
2024 * conversions here. */
2025 if ((dest & mask) == (host->s_addr & mask)) {
2027 /* The host is directly reachable. */
2028 next_hop->s_addr = 0;
2030 /* To reach the host, we must go through a gateway. */
2031 next_hop->s_addr = gateway;
2033 *netdev_name = xstrdup(iface);
/* Queries driver information for 'netdev' with ETHTOOL_GDRVINFO and adds
 * "driver_name", "driver_version", and "firmware_version" entries to 'sh'.
 * Each value is a copy made with xstrdup(). */
2045 netdev_linux_get_status(const struct netdev *netdev, struct shash *sh)
2047 struct ethtool_drvinfo drvinfo;
2050 memset(&drvinfo, 0, sizeof drvinfo);
2051 error = netdev_linux_do_ethtool(netdev_get_name(netdev),
2052 (struct ethtool_cmd *)&drvinfo,
2054 "ETHTOOL_GDRVINFO");
2056 shash_add(sh, "driver_name", xstrdup(drvinfo.driver));
2057 shash_add(sh, "driver_version", xstrdup(drvinfo.version));
2058 shash_add(sh, "firmware_version", xstrdup(drvinfo.fw_version));
2064 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2065 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2066 * returns 0. Otherwise, it returns a positive errno value; in particular,
2067 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'.
 *
 * The lookup uses the SIOCGARP ioctl on 'af_inet_sock'.  Failures other than
 * ENXIO are logged. */
2069 netdev_linux_arp_lookup(const struct netdev *netdev,
2070 uint32_t ip, uint8_t mac[ETH_ADDR_LEN])
2073 struct sockaddr_in sin;
2076 memset(&r, 0, sizeof r);
2077 memset(&sin, 0, sizeof sin);
2078 sin.sin_family = AF_INET;
2079 sin.sin_addr.s_addr = ip;
2081 memcpy(&r.arp_pa, &sin, sizeof sin);
2082 r.arp_ha.sa_family = ARPHRD_ETHER;
2084 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2085 COVERAGE_INC(netdev_arp_lookup);
2086 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2088 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2089 } else if (retval != ENXIO) {
2090 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2091 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
/* Translates the netdev flags in 'nd'; NETDEV_UP and NETDEV_PROMISC are the
 * bits examined.
 *
 * NOTE(review): the statements that build the result are not visible here;
 * presumably the corresponding IFF_* bits are set and returned. */
2097 nd_to_iff_flags(enum netdev_flags nd)
2100 if (nd & NETDEV_UP) {
2103 if (nd & NETDEV_PROMISC) {
/* Translates the interface flags in 'iff' into 'enum netdev_flags' bits,
 * starting from an empty set.  IFF_PROMISC maps to NETDEV_PROMISC. */
2110 iff_to_nd_flags(int iff)
2112 enum netdev_flags nd = 0;
2116 if (iff & IFF_PROMISC) {
2117 nd |= NETDEV_PROMISC;
/* Clears the flags in 'off' and sets the flags in 'on' for 'netdev'.
 *
 * The current interface flags are read with get_flags() and their netdev
 * equivalent is stored in '*old_flagsp'.  set_flags() is called only when the
 * resulting flags differ from the current ones. */
2123 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2124 enum netdev_flags on, enum netdev_flags *old_flagsp)
2126 int old_flags, new_flags;
2129 error = get_flags(netdev, &old_flags);
2131 *old_flagsp = iff_to_nd_flags(old_flags);
2132 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2133 if (new_flags != old_flags) { /* Avoid a needless write when nothing changes. */
2134 error = set_flags(netdev, new_flags);
/* Walks every 'struct netdev_linux_notifier' on 'list' and takes the address
 * of the 'struct netdev_notifier' embedded in each one.
 *
 * NOTE(review): the statement that uses 'n' is not visible here; presumably
 * it invokes the notifier's callback. */
2141 poll_notify(struct list *list)
2143 struct netdev_linux_notifier *notifier;
2144 LIST_FOR_EACH (notifier, node, list) {
2145 struct netdev_notifier *n = &notifier->notifier;
/* Callback for rtnetlink link changes, registered by netdev_linux_poll_add().
 *
 * A notifier list is looked up in 'netdev_linux_notifiers', and another path
 * calls poll_notify() on every list in that map.  'aux' is unused.
 *
 * NOTE(review): the conditions selecting between the two paths are not visible
 * here; presumably the lookup is keyed by the changed device's name. */
2151 netdev_linux_poll_cb(const struct rtnetlink_link_change *change,
2152 void *aux OVS_UNUSED)
2155 struct list *list = shash_find_data(&netdev_linux_notifiers,
2161 struct shash_node *node;
2162 SHASH_FOR_EACH (node, &netdev_linux_notifiers) {
2163 poll_notify(node->data);
/* Registers 'cb' (with 'aux') as a change notifier for 'netdev' and stores the
 * new notifier in '*notifierp'.
 *
 * When no notifiers exist yet, netdev_linux_poll_cb() is first registered with
 * the rtnetlink link notifier.  Notifiers are kept in per-device lists stored
 * in 'netdev_linux_notifiers' under the device name; the list is allocated
 * when the device has none. */
2169 netdev_linux_poll_add(struct netdev *netdev,
2170 void (*cb)(struct netdev_notifier *), void *aux,
2171 struct netdev_notifier **notifierp)
2173 const char *netdev_name = netdev_get_name(netdev);
2174 struct netdev_linux_notifier *notifier;
2177 if (shash_is_empty(&netdev_linux_notifiers)) {
2179 error = rtnetlink_link_notifier_register(&netdev_linux_poll_notifier,
2180 netdev_linux_poll_cb, NULL);
2186 list = shash_find_data(&netdev_linux_notifiers, netdev_name);
2188 list = xmalloc(sizeof *list);
2190 shash_add(&netdev_linux_notifiers, netdev_name, list);
2193 notifier = xmalloc(sizeof *notifier);
2194 netdev_notifier_init(&notifier->notifier, netdev, cb, aux);
2195 list_push_back(list, &notifier->node);
2196 *notifierp = &notifier->notifier;
/* Unregisters 'notifier_', which must be embedded in a
 * 'struct netdev_linux_notifier'.
 *
 * The notifier is unlinked from its per-device list.  A list that becomes
 * empty is removed from 'netdev_linux_notifiers', and when that map becomes
 * empty the rtnetlink link notifier is unregistered as well. */
2201 netdev_linux_poll_remove(struct netdev_notifier *notifier_)
2203 struct netdev_linux_notifier *notifier =
2204 CONTAINER_OF(notifier_, struct netdev_linux_notifier, notifier);
2207 /* Remove 'notifier' from its list. */
2208 list = list_remove(&notifier->node);
2209 if (list_is_empty(list)) {
2210 /* The list is now empty. Remove it from the hash and free it. */
2211 const char *netdev_name = netdev_get_name(notifier->notifier.netdev);
2212 shash_delete(&netdev_linux_notifiers,
2213 shash_find(&netdev_linux_notifiers, netdev_name));
2218 /* If that was the last notifier, unregister. */
2219 if (shash_is_empty(&netdev_linux_notifiers)) {
2220 rtnetlink_link_notifier_unregister(&netdev_linux_poll_notifier);
/* Template for the 'struct netdev_class' definitions that follow.  The entries
 * listed here are shared by every class built from the macro; NAME, CREATE,
 * ENUMERATE, and SET_STATS are the parts that vary between classes. */
2224 #define NETDEV_LINUX_CLASS(NAME, CREATE, ENUMERATE, SET_STATS) \
2228 netdev_linux_init, \
2230 netdev_linux_wait, \
2233 netdev_linux_destroy, \
2234 NULL, /* set_config */ \
2236 netdev_linux_open, \
2237 netdev_linux_close, \
2241 netdev_linux_recv, \
2242 netdev_linux_recv_wait, \
2243 netdev_linux_drain, \
2245 netdev_linux_send, \
2246 netdev_linux_send_wait, \
2248 netdev_linux_set_etheraddr, \
2249 netdev_linux_get_etheraddr, \
2250 netdev_linux_get_mtu, \
2251 netdev_linux_get_ifindex, \
2252 netdev_linux_get_carrier, \
2253 netdev_linux_get_miimon, \
2254 netdev_linux_get_stats, \
2257 netdev_linux_get_features, \
2258 netdev_linux_set_advertisements, \
2259 netdev_linux_get_vlan_vid, \
2261 netdev_linux_set_policing, \
2262 netdev_linux_get_qos_types, \
2263 netdev_linux_get_qos_capabilities, \
2264 netdev_linux_get_qos, \
2265 netdev_linux_set_qos, \
2266 netdev_linux_get_queue, \
2267 netdev_linux_set_queue, \
2268 netdev_linux_delete_queue, \
2269 netdev_linux_get_queue_stats, \
2270 netdev_linux_dump_queues, \
2271 netdev_linux_dump_queue_stats, \
2273 netdev_linux_get_in4, \
2274 netdev_linux_set_in4, \
2275 netdev_linux_get_in6, \
2276 netdev_linux_add_router, \
2277 netdev_linux_get_next_hop, \
2278 netdev_linux_get_status, \
2279 netdev_linux_arp_lookup, \
2281 netdev_linux_update_flags, \
2283 netdev_linux_poll_add, \
2284 netdev_linux_poll_remove \
/* Netdev class created with netdev_linux_create() and enumerated with
 * netdev_linux_enumerate(); it has no 'set_stats' function. */
2287 const struct netdev_class netdev_linux_class =
2290 netdev_linux_create,
2291 netdev_linux_enumerate,
2292 NULL); /* set_stats */
/* Netdev class created with netdev_linux_create_tap(); it has neither an
 * 'enumerate' nor a 'set_stats' function. */
2294 const struct netdev_class netdev_tap_class =
2297 netdev_linux_create_tap,
2298 NULL, /* enumerate */
2299 NULL); /* set_stats */
/* Netdev class created with netdev_linux_create(); it has no 'enumerate'
 * function and uses netdev_vport_set_stats() as its 'set_stats' function. */
2301 const struct netdev_class netdev_internal_class =
2304 netdev_linux_create,
2305 NULL, /* enumerate */
2306 netdev_vport_set_stats);
2308 /* HTB traffic control class.
 *
 * The definitions below hold the HTB state kept per device ('struct htb', see
 * htb_install__()) and per queue ('struct htb_class', see
 * htb_class_cast__()). */
2310 #define HTB_N_QUEUES 0xf000 /* Largest class minor number that htb_parse_tcmsg__() maps to a queue id. */
2314 unsigned int max_rate; /* In bytes/s. */
2318 struct tc_queue tc_queue; /* Embedded so that htb_class_cast__() can recover the enclosing struct. */
2319 unsigned int min_rate; /* In bytes/s. */
2320 unsigned int max_rate; /* In bytes/s. */
2321 unsigned int burst; /* In bytes. */
2322 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the 'struct htb' whose 'tc' member is the tc state currently
 * attached to 'netdev'.  The conversion uses CONTAINER_OF, so it is only
 * meaningful while the device's tc state is an HTB one. */
2326 htb_get__(const struct netdev *netdev)
2328 struct netdev_dev_linux *netdev_dev =
2329 netdev_dev_linux_cast(netdev_get_dev(netdev));
2330 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
/* Allocates a 'struct htb', initializes its 'tc' member with 'tc_ops_htb',
 * records 'max_rate' in it, and makes it the tc state of 'netdev'. */
2334 htb_install__(struct netdev *netdev, uint64_t max_rate)
2336 struct netdev_dev_linux *netdev_dev =
2337 netdev_dev_linux_cast(netdev_get_dev(netdev));
2340 htb = xmalloc(sizeof *htb);
2341 tc_init(&htb->tc, &tc_ops_htb);
2342 htb->max_rate = max_rate; /* NOTE(review): narrows uint64_t to the 'unsigned int' member; confirm the range. */
2344 netdev_dev->tc = &htb->tc;
2347 /* Create an HTB qdisc.
2349 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1".
 *
 * Any existing qdisc is deleted first.  The new qdisc is requested with an
 * RTM_NEWQDISC message and the result of tc_transact() is returned. */
2351 htb_setup_qdisc__(struct netdev *netdev)
2354 struct tc_htb_glob opt;
2355 struct ofpbuf request;
2356 struct tcmsg *tcmsg;
2358 tc_del_qdisc(netdev); /* The result is ignored. */
2360 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2361 NLM_F_EXCL | NLM_F_CREATE, &request);
2365 tcmsg->tcm_handle = tc_make_handle(1, 0);
2366 tcmsg->tcm_parent = TC_H_ROOT;
2368 nl_msg_put_string(&request, TCA_KIND, "htb");
2370 memset(&opt, 0, sizeof opt);
2371 opt.rate2quantum = 10;
2375 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2376 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2377 nl_msg_end_nested(&request, opt_offset);
2379 return tc_transact(&request, NULL);
2382 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2383 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>".
 *
 * The rate, ceiling, buffers, and priority come from 'class'; the device MTU
 * is needed to fill in the rate tables and buffer sizes.  A failed transaction
 * is logged together with all of the class parameters. */
2385 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2386 unsigned int parent, struct htb_class *class)
2389 struct tc_htb_opt opt;
2390 struct ofpbuf request;
2391 struct tcmsg *tcmsg;
2395 netdev_get_mtu(netdev, &mtu);
2396 if (mtu == INT_MAX) { /* INT_MAX is treated as meaning the device lacks an MTU. */
2397 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2398 netdev_get_name(netdev));
2402 memset(&opt, 0, sizeof opt);
2403 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2404 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2405 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2406 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2407 opt.prio = class->priority;
2409 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2413 tcmsg->tcm_handle = handle;
2414 tcmsg->tcm_parent = parent;
2416 nl_msg_put_string(&request, TCA_KIND, "htb");
2417 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2418 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2419 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2420 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2421 nl_msg_end_nested(&request, opt_offset);
2423 error = tc_transact(&request, NULL);
2425 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2426 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2427 netdev_get_name(netdev),
2428 tc_get_major(handle), tc_get_minor(handle),
2429 tc_get_major(parent), tc_get_minor(parent),
2430 class->min_rate, class->max_rate,
2431 class->burst, class->priority, strerror(error));
/* NOTE(review): the comment below speaks of 'options' and 'details', but the
 * parameters are named 'nl_options' and 'class'.  The visible code requires a
 * TCA_HTB_PARMS attribute of at least sizeof(struct tc_htb_opt) bytes and
 * copies its rate, ceiling, burst (converted from ticks), and priority into
 * '*class'. */
2436 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2437 * description of them into 'details'. The description complies with the
2438 * specification given in the vswitch database documentation for linux-htb
2441 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2443 static const struct nl_policy tca_htb_policy[] = {
2444 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2445 .min_len = sizeof(struct tc_htb_opt) },
2448 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2449 const struct tc_htb_opt *htb;
2451 if (!nl_parse_nested(nl_options, tca_htb_policy,
2452 attrs, ARRAY_SIZE(tca_htb_policy))) {
2453 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2457 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2458 class->min_rate = htb->rate.rate;
2459 class->max_rate = htb->ceil.rate;
2460 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2461 class->priority = htb->prio;
/* Parses the traffic class message 'tcmsg' with tc_parse_class(), which also
 * receives 'stats'.
 *
 * If 'queue_id' is nonnull, a class handle with major number 1 and a minor
 * number from 1 through HTB_N_QUEUES yields queue id "minor - 1".  If
 * 'options' is nonnull, the HTB options in the message are parsed into it with
 * htb_parse_tca_options__(). */
2466 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2467 struct htb_class *options,
2468 struct netdev_queue_stats *stats)
2470 struct nlattr *nl_options;
2471 unsigned int handle;
2474 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2475 if (!error && queue_id) {
2476 unsigned int major = tc_get_major(handle);
2477 unsigned int minor = tc_get_minor(handle);
2478 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2479 *queue_id = minor - 1; /* Queue ids are zero-based; class minors start at 1. */
2484 if (!error && options) {
2485 error = htb_parse_tca_options__(nl_options, options);
/* Fills in the rates of 'hc' from the qdisc-level 'details' for 'netdev'.
 *
 * 'hc->max_rate' is the "max-rate" entry of 'details' divided by 8.  When that
 * entry is missing or yields zero, the rate is derived instead from the
 * device's current features through netdev_features_to_bps(), again divided
 * by 8.  'hc->min_rate' is set equal to 'hc->max_rate'. */
2491 htb_parse_qdisc_details__(struct netdev *netdev,
2492 const struct shash *details, struct htb_class *hc)
2494 const char *max_rate_s;
2496 max_rate_s = shash_find_data(details, "max-rate");
2497 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2498 if (!hc->max_rate) {
2501 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2502 hc->max_rate = netdev_features_to_bps(current) / 8;
2504 hc->min_rate = hc->max_rate;
/* Fills 'hc' from the per-class 'details' for 'netdev'.
 *
 * "min-rate", "max-rate", and "burst" are divided by 8 after parsing;
 * "priority" is used as given.  Missing "min-rate", "burst", and "priority"
 * entries count as 0 before clamping.  The minimum rate is clamped to at least
 * the MTU and at most the qdisc's maximum rate; the maximum rate is clamped
 * between the minimum rate and the qdisc's maximum rate; and the burst is at
 * least the MTU plus 64.  A device whose MTU is reported as INT_MAX is logged
 * as lacking an MTU. */
2510 htb_parse_class_details__(struct netdev *netdev,
2511 const struct shash *details, struct htb_class *hc)
2513 const struct htb *htb = htb_get__(netdev);
2514 const char *min_rate_s = shash_find_data(details, "min-rate");
2515 const char *max_rate_s = shash_find_data(details, "max-rate");
2516 const char *burst_s = shash_find_data(details, "burst");
2517 const char *priority_s = shash_find_data(details, "priority");
2520 netdev_get_mtu(netdev, &mtu);
2521 if (mtu == INT_MAX) {
2522 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2523 netdev_get_name(netdev));
2527 /* HTB requires at least an mtu sized min-rate to send any traffic even
2528 * on uncongested links. */
2529 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2530 hc->min_rate = MAX(hc->min_rate, mtu);
2531 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2534 hc->max_rate = (max_rate_s
2535 ? strtoull(max_rate_s, NULL, 10) / 8
2537 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2538 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2542 * According to hints in the documentation that I've read, it is important
2543 * that 'burst' be at least as big as the largest frame that might be
2544 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2545 * but having it a bit too small is a problem. Since netdev_get_mtu()
2546 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2547 * the MTU. We actually add 64, instead of 14, as a guard against
2548 * additional headers get tacked on somewhere that we're not aware of. */
2549 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2550 hc->burst = MAX(hc->burst, mtu + 64);
2553 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Asks the kernel, through tc_query_class(), for the class identified by
 * 'handle' and 'parent' on 'netdev', then decodes the reply with
 * htb_parse_tcmsg__() into '*options' and '*stats' (no queue ID is
 * requested).  The reply buffer is released with ofpbuf_delete() after it
 * has been parsed. */
2559 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2560 unsigned int parent, struct htb_class *options,
2561 struct netdev_queue_stats *stats)
2563 struct ofpbuf *reply;
2566 error = tc_query_class(netdev, handle, parent, &reply);
2568 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2569 ofpbuf_delete(reply);
/* Installs HTB on 'netdev' using the settings in 'details'.
 *
 * Calls htb_setup_qdisc__(), derives the top-level class parameters with
 * htb_parse_qdisc_details__(), applies them to class 1:fffe under parent
 * 1:0 with htb_setup_class__(), and records the result with
 * htb_install__() using the parsed 'max_rate'. */
2575 htb_tc_install(struct netdev *netdev, const struct shash *details)
2579 error = htb_setup_qdisc__(netdev);
2581 struct htb_class hc;
2583 htb_parse_qdisc_details__(netdev, details, &hc);
2584 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2585 tc_make_handle(1, 0), &hc);
2587 htb_install__(netdev, hc.max_rate);
/* Given 'queue', a pointer to the 'tc_queue' member embedded in a struct
 * htb_class, returns a pointer to that enclosing struct htb_class. */
2593 static struct htb_class *
2594 htb_class_cast__(const struct tc_queue *queue)
2596 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Copies the parameters in 'hc' (min_rate, max_rate, burst, priority) into
 * the local record for queue 'queue_id' on 'netdev'.
 *
 * The record is looked up with tc_find_queue__().  The allocation path
 * visible below creates a new struct htb_class with xmalloc(), stores
 * 'queue_id' in it, and inserts it into the HTB queue map under
 * hash_int(queue_id, 0). */
2600 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2601 const struct htb_class *hc)
2603 struct htb *htb = htb_get__(netdev);
2604 size_t hash = hash_int(queue_id, 0);
2605 struct tc_queue *queue;
2606 struct htb_class *hcp;
2608 queue = tc_find_queue__(netdev, queue_id, hash);
2610 hcp = htb_class_cast__(queue);
2612 hcp = xmalloc(sizeof *hcp);
2613 queue = &hcp->tc_queue;
2614 queue->queue_id = queue_id;
2615 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2618 hcp->min_rate = hc->min_rate;
2619 hcp->max_rate = hc->max_rate;
2620 hcp->burst = hc->burst;
2621 hcp->priority = hc->priority;
/* Loads the HTB configuration of 'netdev' from the kernel into this
 * module's own state.  'nlmsg' is unused.
 *
 * Class 1:fffe is queried for the qdisc-level 'max_rate', which is handed to
 * htb_install__().  Then each class message produced by the queue dump that
 * htb_parse_tcmsg__() accepts is stored with htb_update_queue__().
 *
 * NOTE(review): the status of htb_query_class__() is ignored, so
 * 'hc.max_rate' has to hold a safe value before that call -- confirm it is
 * initialized on a line not shown here. */
2625 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2628 struct nl_dump dump;
2629 struct htb_class hc;
2631 /* Get qdisc options. */
2633 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2634 htb_install__(netdev, hc.max_rate);
2637 if (!start_queue_dump(netdev, &dump)) {
2640 while (nl_dump_next(&dump, &msg)) {
2641 unsigned int queue_id;
2643 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2644 htb_update_queue__(netdev, queue_id, &hc);
2647 nl_dump_done(&dump);
/* Tears down the per-queue records of the HTB instance that contains 'tc':
 * iterates over the queue map with HMAP_FOR_EACH_SAFE and unlinks each
 * struct htb_class from it. */
2653 htb_tc_destroy(struct tc *tc)
2655 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2656 struct htb_class *hc, *next;
2658 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2659 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Reports the HTB qdisc configuration of 'netdev' in 'details': adds the key
 * "max-rate" whose value is a decimal string, allocated by xasprintf(),
 * equal to 8 times the stored 'max_rate'. */
2667 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2669 const struct htb *htb = htb_get__(netdev);
2670 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
/* Reconfigures the HTB qdisc of 'netdev' from 'details': re-derives the
 * top-level class parameters with htb_parse_qdisc_details__(), reapplies
 * class 1:fffe under parent 1:0 with htb_setup_class__(), and stores the new
 * 'max_rate' in the device's HTB state. */
2675 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2677 struct htb_class hc;
2680 htb_parse_qdisc_details__(netdev, details, &hc);
2681 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2682 tc_make_handle(1, 0), &hc);
2684 htb_get__(netdev)->max_rate = hc.max_rate;
/* Reports the parameters of HTB 'queue' in 'details'.  'netdev' is unused.
 *
 * Keys that the visible statements can add, each with a string value
 * allocated by xasprintf():
 *
 *   - "min-rate": 8 * min_rate.
 *
 *   - "max-rate": 8 * max_rate, only when it differs from min_rate.
 *
 *   - "burst": 8 * burst.
 *
 *   - "priority": the class priority. */
2690 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2691 const struct tc_queue *queue, struct shash *details)
2693 const struct htb_class *hc = htb_class_cast__(queue);
2695 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2696 if (hc->min_rate != hc->max_rate) {
2697 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2699 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2701 shash_add(details, "priority", xasprintf("%u", hc->priority));
/* Creates or updates HTB queue 'queue_id' on 'netdev' from 'details':
 * parses the class parameters with htb_parse_class_details__(), applies them
 * to kernel class 1:<queue_id + 1> under parent 1:fffe with
 * htb_setup_class__(), and mirrors them into the local queue record with
 * htb_update_queue__(). */
2707 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2708 const struct shash *details)
2710 struct htb_class hc;
2713 error = htb_parse_class_details__(netdev, details, &hc);
2718 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2719 tc_make_handle(1, 0xfffe), &hc);
2724 htb_update_queue__(netdev, queue_id, &hc);
/* Deletes the kernel class 1:<queue_id + 1> that backs 'queue' on 'netdev'
 * with tc_delete_class(), and unlinks the queue's record from the HTB queue
 * map. */
2729 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2731 struct htb_class *hc = htb_class_cast__(queue);
2732 struct htb *htb = htb_get__(netdev);
2735 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2737 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Retrieves statistics for 'queue' on 'netdev' into '*stats' by querying
 * kernel class 1:<queue_id + 1> with parent 1:fffe; class options are not
 * requested.  Returns the value returned by htb_query_class__(). */
2744 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2745 struct netdev_queue_stats *stats)
2747 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2748 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one class message 'nlmsg' from a queue dump.  'netdev' is unused.
 *
 * The handle and statistics are extracted with tc_parse_class() (the options
 * attribute is not requested).  For a handle of the form 1:<minor> with
 * 0 < minor <= HTB_N_QUEUES, 'cb' is invoked with queue ID minor - 1, the
 * statistics, and 'aux'. */
2752 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2753 const struct ofpbuf *nlmsg,
2754 netdev_dump_queue_stats_cb *cb, void *aux)
2756 struct netdev_queue_stats stats;
2757 unsigned int handle, major, minor;
2760 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2765 major = tc_get_major(handle);
2766 minor = tc_get_minor(handle);
2767 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2768 (*cb)(minor - 1, &stats, aux);
/* Operations table for the HTB traffic-control class: Linux qdisc name
 * "htb", OVS name "linux-htb", HTB_N_QUEUES queues.  The last two slots
 * shown are the per-queue statistics callbacks. */
2773 static const struct tc_ops tc_ops_htb = {
2774 "htb", /* linux_name */
2775 "linux-htb", /* ovs_name */
2776 HTB_N_QUEUES, /* n_queues */
2785 htb_class_get_stats,
2786 htb_class_dump_stats
2789 /* "linux-hfsc" traffic control class. */
/* Largest class minor number treated as an HFSC queue: the parsing code in
 * this file maps class 1:<minor>, 0 < minor <= HFSC_N_QUEUES, to queue ID
 * minor - 1.  Also used as the 'n_queues' entry of tc_ops_hfsc. */
2791 #define HFSC_N_QUEUES 0xf000
2799 struct tc_queue tc_queue;
/* Returns the HFSC state attached to 'netdev': takes the 'tc' pointer stored
 * in the device's struct netdev_dev_linux and converts it to the enclosing
 * struct hfsc with CONTAINER_OF.
 *
 * NOTE(review): presumably only meaningful while the device's 'tc' really is
 * the 'tc' member of a struct hfsc -- confirm with callers. */
2804 static struct hfsc *
2805 hfsc_get__(const struct netdev *netdev)
2807 struct netdev_dev_linux *netdev_dev;
2808 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2809 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
/* Given 'queue', a pointer to the 'tc_queue' member embedded in a struct
 * hfsc_class, returns a pointer to that enclosing struct hfsc_class. */
2812 static struct hfsc_class *
2813 hfsc_class_cast__(const struct tc_queue *queue)
2815 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates a struct hfsc with xmalloc(), initializes its 'tc' member with
 * tc_init() and tc_ops_hfsc, records 'max_rate' in it, and attaches it to
 * 'netdev''s device as the current 'tc'. */
2819 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2821 struct netdev_dev_linux * netdev_dev;
2824 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2825 hfsc = xmalloc(sizeof *hfsc);
2826 tc_init(&hfsc->tc, &tc_ops_hfsc);
2827 hfsc->max_rate = max_rate;
2828 netdev_dev->tc = &hfsc->tc;
/* Copies 'hc->min_rate' and 'hc->max_rate' into the local record for queue
 * 'queue_id' on 'netdev'.
 *
 * The record is looked up with tc_find_queue__().  The allocation path
 * visible below creates a new struct hfsc_class with xmalloc(), stores
 * 'queue_id' in it, and inserts it into the HFSC queue map under
 * hash_int(queue_id, 0). */
2832 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2833 const struct hfsc_class *hc)
2837 struct hfsc_class *hcp;
2838 struct tc_queue *queue;
2840 hfsc = hfsc_get__(netdev);
2841 hash = hash_int(queue_id, 0);
2843 queue = tc_find_queue__(netdev, queue_id, hash);
2845 hcp = hfsc_class_cast__(queue);
2847 hcp = xmalloc(sizeof *hcp);
2848 queue = &hcp->tc_queue;
2849 queue->queue_id = queue_id;
2850 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
2853 hcp->min_rate = hc->min_rate;
2854 hcp->max_rate = hc->max_rate;
/* Decodes the nested HFSC class options 'nl_options' into 'class'.
 *
 * Each policy entry visible below is an NL_A_UNSPEC attribute at least
 * sizeof(struct tc_service_curve) bytes long.  The TCA_HFSC_RSC,
 * TCA_HFSC_FSC and TCA_HFSC_USC curves are read, and a rate-limited warning
 * is logged when:
 *
 *   - the nested attributes cannot be parsed;
 *
 *   - any of the three curves has a nonzero 'm1' or 'd';
 *
 *   - the RSC and FSC curves differ in 'm2';
 *
 *   - the RSC curve's 'm2' is greater than the USC curve's 'm2'.
 *
 * The final visible statements take 'class->min_rate' from the FSC curve's
 * 'm2' and 'class->max_rate' from the USC curve's 'm2'. */
2858 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2860 const struct tc_service_curve *rsc, *fsc, *usc;
2861 static const struct nl_policy tca_hfsc_policy[] = {
2863 .type = NL_A_UNSPEC,
2865 .min_len = sizeof(struct tc_service_curve),
2868 .type = NL_A_UNSPEC,
2870 .min_len = sizeof(struct tc_service_curve),
2873 .type = NL_A_UNSPEC,
2875 .min_len = sizeof(struct tc_service_curve),
2878 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
2880 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
2881 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
2882 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
2886 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
2887 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
2888 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
2890 if (rsc->m1 != 0 || rsc->d != 0 ||
2891 fsc->m1 != 0 || fsc->d != 0 ||
2892 usc->m1 != 0 || usc->d != 0) {
2893 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2894 "Non-linear service curves are not supported.");
2898 if (rsc->m2 != fsc->m2) {
2899 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2900 "Real-time service curves are not supported ");
2904 if (rsc->m2 > usc->m2) {
2905 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2906 "Min-rate service curve is greater than "
2907 "the max-rate service curve.");
2911 class->min_rate = fsc->m2;
2912 class->max_rate = usc->m2;
/* Parses the Netlink class message 'tcmsg' with tc_parse_class(), which also
 * receives 'stats'.
 *
 * A handle of the form 1:<minor> with 0 < minor <= HFSC_N_QUEUES is
 * translated to '*queue_id' = minor - 1, and the class's options attribute
 * is decoded into '*options' by hfsc_parse_tca_options__().
 *
 * NOTE(review): callers in this file pass NULL for 'queue_id' or for
 * 'options', so each of those steps is presumably guarded by a null check on
 * lines not shown here -- confirm. */
2917 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2918 struct hfsc_class *options,
2919 struct netdev_queue_stats *stats)
2922 unsigned int handle;
2923 struct nlattr *nl_options;
2925 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2931 unsigned int major, minor;
2933 major = tc_get_major(handle);
2934 minor = tc_get_minor(handle);
2935 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
/* Class minors start at 1 while queue IDs start at 0. */
2936 *queue_id = minor - 1;
2943 error = hfsc_parse_tca_options__(nl_options, options);
/* Asks the kernel, through tc_query_class(), for the class identified by
 * 'handle' and 'parent' on 'netdev', then decodes the reply with
 * hfsc_parse_tcmsg__() into '*options' and '*stats' (no queue ID is
 * requested).  The reply buffer is released with ofpbuf_delete() after it
 * has been parsed. */
2950 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
2951 unsigned int parent, struct hfsc_class *options,
2952 struct netdev_queue_stats *stats)
2955 struct ofpbuf *reply;
2957 error = tc_query_class(netdev, handle, parent, &reply);
2962 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
2963 ofpbuf_delete(reply);
/* Fills in 'class' for the HFSC qdisc's top-level class from 'details'.
 *
 * "max-rate" is parsed as a decimal number and divided by 8; 0 is used when
 * the key is absent.  A rate derived from the device's current features via
 * netdev_features_to_bps() is also computed below -- presumably only as the
 * fallback when no usable "max-rate" was supplied, as in the HTB
 * counterpart; confirm.  Both 'class->min_rate' and 'class->max_rate' are
 * set to the resulting rate. */
2968 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
2969 struct hfsc_class *class)
2972 const char *max_rate_s;
2974 max_rate_s = shash_find_data(details, "max-rate");
2975 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
/* NOTE(review): the second argument on the next line looks like a
 * mis-encoded "&current" (an HTML-entity artifact); 'current' is what the
 * following statement reads.  Confirm against the pristine source. */
2980 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2981 max_rate = netdev_features_to_bps(current) / 8;
2984 class->min_rate = max_rate;
2985 class->max_rate = max_rate;
/* Fills in 'class' for an HFSC class from the "min-rate" and "max-rate" keys
 * of 'details', each parsed as a decimal number and divided by 8.
 *
 * Clamping performed by the visible statements:
 *
 *   - min_rate is raised to at least 1 and capped at the qdisc's
 *     'max_rate';
 *
 *   - max_rate is raised to at least min_rate and capped at the qdisc's
 *     'max_rate'. */
2989 hfsc_parse_class_details__(struct netdev *netdev,
2990 const struct shash *details,
2991 struct hfsc_class * class)
2993 const struct hfsc *hfsc;
2994 uint32_t min_rate, max_rate;
2995 const char *min_rate_s, *max_rate_s;
2997 hfsc = hfsc_get__(netdev);
2998 min_rate_s = shash_find_data(details, "min-rate");
2999 max_rate_s = shash_find_data(details, "max-rate");
3001 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3002 min_rate = MAX(min_rate, 1);
3003 min_rate = MIN(min_rate, hfsc->max_rate);
3005 max_rate = (max_rate_s
3006 ? strtoull(max_rate_s, NULL, 10) / 8
3008 max_rate = MAX(max_rate, min_rate);
3009 max_rate = MIN(max_rate, hfsc->max_rate);
3011 class->min_rate = min_rate;
3012 class->max_rate = max_rate;
3017 /* Create an HFSC qdisc.
3019 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
/* Any existing qdisc is first removed with tc_del_qdisc().  The
 * RTM_NEWQDISC request is made with NLM_F_EXCL | NLM_F_CREATE, handle 1:0
 * and parent TC_H_ROOT; it carries TCA_KIND "hfsc" and, as TCA_OPTIONS, a
 * struct tc_hfsc_qopt that is cleared with memset() before use.  Returns
 * the value returned by tc_transact(). */
3021 hfsc_setup_qdisc__(struct netdev * netdev)
3023 struct tcmsg *tcmsg;
3024 struct ofpbuf request;
3025 struct tc_hfsc_qopt opt;
3027 tc_del_qdisc(netdev);
3029 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3030 NLM_F_EXCL | NLM_F_CREATE, &request);
3036 tcmsg->tcm_handle = tc_make_handle(1, 0);
3037 tcmsg->tcm_parent = TC_H_ROOT;
3039 memset(&opt, 0, sizeof opt);
3042 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3043 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3045 return tc_transact(&request, NULL);
3048 /* Create an HFSC class.
3050 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3051 * sc rate <min_rate> ul rate <max_rate>" */
/* The RTM_NEWTCLASS request is made with NLM_F_CREATE.  Its nested
 * TCA_OPTIONS carry the same 'min' curve for both TCA_HFSC_RSC and
 * TCA_HFSC_FSC (with 'm2' = class->min_rate) and the 'max' curve for
 * TCA_HFSC_USC (with 'm2' = class->max_rate).  A rate-limited warning with
 * the handle, parent and both rates is logged by the visible VLOG_WARN_RL
 * call.
 *
 * NOTE(review): the warning labels the rates "bps"; the rates stored in
 * struct hfsc_class are the user-supplied values divided by 8, so the unit
 * printed may not be what a reader expects -- confirm. */
3053 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3054 unsigned int parent, struct hfsc_class *class)
3058 struct tcmsg *tcmsg;
3059 struct ofpbuf request;
3060 struct tc_service_curve min, max;
3062 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3068 tcmsg->tcm_handle = handle;
3069 tcmsg->tcm_parent = parent;
3073 min.m2 = class->min_rate;
3077 max.m2 = class->max_rate;
3079 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3080 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3081 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3082 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3083 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3084 nl_msg_end_nested(&request, opt_offset);
3086 error = tc_transact(&request, NULL);
3088 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3089 "min-rate %ubps, max-rate %ubps (%s)",
3090 netdev_get_name(netdev),
3091 tc_get_major(handle), tc_get_minor(handle),
3092 tc_get_major(parent), tc_get_minor(parent),
3093 class->min_rate, class->max_rate, strerror(error));
/* Installs HFSC on 'netdev' using the settings in 'details'.
 *
 * Calls hfsc_setup_qdisc__(), derives the top-level class parameters with
 * hfsc_parse_qdisc_details__(), applies them to class 1:fffe under parent
 * 1:0 with hfsc_setup_class__(), and records the result with
 * hfsc_install__() using the parsed 'max_rate'. */
3100 hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3103 struct hfsc_class class;
3105 error = hfsc_setup_qdisc__(netdev);
3111 hfsc_parse_qdisc_details__(netdev, details, &class);
3112 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3113 tc_make_handle(1, 0), &class);
3119 hfsc_install__(netdev, class.max_rate);
/* Loads the HFSC configuration of 'netdev' from the kernel into this
 * module's own state.  'nlmsg' is unused.
 *
 * Class 1:fffe is queried for the qdisc-level 'max_rate', which is handed to
 * hfsc_install__().  Then each class message produced by the queue dump that
 * hfsc_parse_tcmsg__() accepts is stored with hfsc_update_queue__().
 *
 * NOTE(review): the status of hfsc_query_class__() is ignored, so
 * 'hc.max_rate' has to hold a safe value before that call -- confirm it is
 * initialized on a line not shown here. */
3124 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3127 struct nl_dump dump;
3128 struct hfsc_class hc;
3131 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3132 hfsc_install__(netdev, hc.max_rate);
3134 if (!start_queue_dump(netdev, &dump)) {
3138 while (nl_dump_next(&dump, &msg)) {
3139 unsigned int queue_id;
3141 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3142 hfsc_update_queue__(netdev, queue_id, &hc);
3146 nl_dump_done(&dump);
/* Tears down the per-queue records of the HFSC instance that contains 'tc':
 * iterates over the queue map with HMAP_FOR_EACH_SAFE and unlinks each
 * struct hfsc_class from it. */
3151 hfsc_tc_destroy(struct tc *tc)
3154 struct hfsc_class *hc, *next;
3156 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3158 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3159 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Reports the HFSC qdisc configuration of 'netdev' in 'details': adds the
 * key "max-rate" whose value is a decimal string, allocated by xasprintf(),
 * equal to 8 times the stored 'max_rate'. */
3168 hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3170 const struct hfsc *hfsc;
3171 hfsc = hfsc_get__(netdev);
3172 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
/* Reconfigures the HFSC qdisc of 'netdev' from 'details': re-derives the
 * top-level class parameters with hfsc_parse_qdisc_details__(), reapplies
 * class 1:fffe under parent 1:0 with hfsc_setup_class__(), and stores the
 * new 'max_rate' in the device's HFSC state. */
3177 hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3180 struct hfsc_class class;
3182 hfsc_parse_qdisc_details__(netdev, details, &class);
3183 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3184 tc_make_handle(1, 0), &class);
3187 hfsc_get__(netdev)->max_rate = class.max_rate;
/* Reports the parameters of HFSC 'queue' in 'details'.  'netdev' is unused.
 *
 * "min-rate" is added as 8 * min_rate; "max-rate" is added as 8 * max_rate
 * only when it differs from min_rate.  Both values are decimal strings
 * allocated by xasprintf(). */
3194 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3195 const struct tc_queue *queue, struct shash *details)
3197 const struct hfsc_class *hc;
3199 hc = hfsc_class_cast__(queue);
3200 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3201 if (hc->min_rate != hc->max_rate) {
3202 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
/* Creates or updates HFSC queue 'queue_id' on 'netdev' from 'details':
 * parses the class parameters with hfsc_parse_class_details__(), applies
 * them to kernel class 1:<queue_id + 1> under parent 1:fffe with
 * hfsc_setup_class__(), and mirrors them into the local queue record with
 * hfsc_update_queue__(). */
3208 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3209 const struct shash *details)
3212 struct hfsc_class class;
3214 error = hfsc_parse_class_details__(netdev, details, &class);
3219 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3220 tc_make_handle(1, 0xfffe), &class);
3225 hfsc_update_queue__(netdev, queue_id, &class);
/* Deletes the kernel class 1:<queue_id + 1> that backs 'queue' on 'netdev'
 * with tc_delete_class(), and unlinks the queue's record from the HFSC
 * queue map. */
3230 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3234 struct hfsc_class *hc;
3236 hc = hfsc_class_cast__(queue);
3237 hfsc = hfsc_get__(netdev);
3239 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3241 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Retrieves statistics for 'queue' on 'netdev' into '*stats' by querying
 * kernel class 1:<queue_id + 1> with parent 1:fffe; class options are not
 * requested.  Returns the value returned by hfsc_query_class__(). */
3248 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3249 struct netdev_queue_stats *stats)
3251 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3252 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one class message 'nlmsg' from a queue dump.  'netdev' is unused.
 *
 * The handle and statistics are extracted with tc_parse_class() (the options
 * attribute is not requested).  For a handle of the form 1:<minor> with
 * 0 < minor <= HFSC_N_QUEUES, 'cb' is invoked with queue ID minor - 1, the
 * statistics, and 'aux'. */
3256 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3257 const struct ofpbuf *nlmsg,
3258 netdev_dump_queue_stats_cb *cb, void *aux)
3260 struct netdev_queue_stats stats;
3261 unsigned int handle, major, minor;
3264 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3269 major = tc_get_major(handle);
3270 minor = tc_get_minor(handle);
3271 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3272 (*cb)(minor - 1, &stats, aux);
/* Operations table for the HFSC traffic-control class: Linux qdisc name
 * "hfsc", OVS name "linux-hfsc", HFSC_N_QUEUES queues, and the hfsc_*()
 * functions from this file in every operation slot. */
3277 static const struct tc_ops tc_ops_hfsc = {
3278 "hfsc", /* linux_name */
3279 "linux-hfsc", /* ovs_name */
3280 HFSC_N_QUEUES, /* n_queues */
3281 hfsc_tc_install, /* tc_install */
3282 hfsc_tc_load, /* tc_load */
3283 hfsc_tc_destroy, /* tc_destroy */
3284 hfsc_qdisc_get, /* qdisc_get */
3285 hfsc_qdisc_set, /* qdisc_set */
3286 hfsc_class_get, /* class_get */
3287 hfsc_class_set, /* class_set */
3288 hfsc_class_delete, /* class_delete */
3289 hfsc_class_get_stats, /* class_get_stats */
3290 hfsc_class_dump_stats /* class_dump_stats */
3293 /* "linux-default" traffic control class.
3295 * This class represents the default, unnamed Linux qdisc. It corresponds to
3296 * the "" (empty string) QoS type in the OVS database. */
/* Attaches the "default qdisc" tc object to 'netdev''s device.
 *
 * 'tc' has static storage duration, so the object set up below with
 * xmalloc() and tc_init(..., &tc_ops_default) is kept across calls and the
 * same pointer is what gets stored in 'netdev_dev->tc'.
 *
 * NOTE(review): presumably the allocation is guarded so that it happens only
 * on the first call -- the guard is not on a visible line; confirm. */
3299 default_install__(struct netdev *netdev)
3301 struct netdev_dev_linux *netdev_dev =
3302 netdev_dev_linux_cast(netdev_get_dev(netdev));
3303 static struct tc *tc;
3306 tc = xmalloc(sizeof *tc);
3307 tc_init(tc, &tc_ops_default);
3309 netdev_dev->tc = tc;
/* 'tc_install' implementation for the default qdisc: ignores 'details' and
 * delegates to default_install__(). */
3313 default_tc_install(struct netdev *netdev,
3314 const struct shash *details OVS_UNUSED)
3316 default_install__(netdev);
/* 'tc_load' implementation for the default qdisc: ignores 'nlmsg' and
 * delegates to default_install__(). */
3321 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3323 default_install__(netdev);
/* Operations table for the default Linux qdisc.  It has no Linux qdisc name
 * (linux_name is NULL), and every destroy, qdisc and class operation slot
 * shown here is NULL. */
3327 static const struct tc_ops tc_ops_default = {
3328 NULL, /* linux_name */
3333 NULL, /* tc_destroy */
3334 NULL, /* qdisc_get */
3335 NULL, /* qdisc_set */
3336 NULL, /* class_get */
3337 NULL, /* class_set */
3338 NULL, /* class_delete */
3339 NULL, /* class_get_stats */
3340 NULL /* class_dump_stats */
/* other_tc_load() attaches the "linux-other" tc object to 'netdev''s device
 * and ignores 'nlmsg'.  Its 'tc' has static storage duration, so the object
 * set up with xmalloc() and tc_init(..., &tc_ops_other) is kept across
 * calls and the same pointer is what gets stored in 'netdev_dev->tc'.
 *
 * NOTE(review): presumably the allocation is guarded so that it happens only
 * on the first call -- the guard is not on a visible line; confirm. */
3343 /* "linux-other" traffic control class.
3348 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3350 struct netdev_dev_linux *netdev_dev =
3351 netdev_dev_linux_cast(netdev_get_dev(netdev));
3352 static struct tc *tc;
3355 tc = xmalloc(sizeof *tc);
3356 tc_init(tc, &tc_ops_other);
3358 netdev_dev->tc = tc;
3362 static const struct tc_ops tc_ops_other = {
3363 NULL, /* linux_name */
3364 "linux-other", /* ovs_name */
/* This table backs the "linux-other" class.  It has no Linux qdisc name,
 * and the install, destroy, qdisc and class operation slots shown below
 * are all NULL. */
3366 NULL, /* tc_install */
3368 NULL, /* tc_destroy */
3369 NULL, /* qdisc_get */
3370 NULL, /* qdisc_set */
3371 NULL, /* class_get */
3372 NULL, /* class_set */
3373 NULL, /* class_delete */
3374 NULL, /* class_get_stats */
3375 NULL /* class_dump_stats */
3378 /* Traffic control. */
3380 /* Number of kernel "tc" ticks per second. */
/* Derived from the /proc/net/psched parameters as (double) a * c / b, and
 * used by tc_ticks_to_bytes() and tc_bytes_to_ticks() to convert between
 * byte counts and tick counts. */
3381 static double ticks_per_s;
/* 'buffer_hz' is the divisor that tc_buffer_per_jiffy() applies to a rate in
 * bytes per second. */
3383 /* Number of kernel "jiffies" per second. This is used for the purpose of
3384 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3385 * one jiffy's worth of data.
3387 * There are two possibilities here:
3389 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3390 * approximate range of 100 to 1024. That means that we really need to
3391 * make sure that the qdisc can buffer that much data.
3393 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3394 * has finely granular timers and there's no need to fudge additional room
3395 * for buffers. (There's no extra effort needed to implement that: the
3396 * large 'buffer_hz' is used as a divisor, so practically any number will
3397 * come out as 0 in the division. Small integer results in the case of
3398 * really high dividends won't have any real effect anyhow.)
3400 static unsigned int buffer_hz;
3402 /* Returns tc handle 'major':'minor'. */
/* 'major' is shifted left by 16 bits and combined with 'minor' by the
 * TC_H_MAKE macro. */
3404 tc_make_handle(unsigned int major, unsigned int minor)
3406 return TC_H_MAKE(major << 16, minor);
3409 /* Returns the major number from 'handle'. */
/* The major part selected by TC_H_MAJ is shifted right by 16 bits, undoing
 * the shift applied by tc_make_handle(). */
3411 tc_get_major(unsigned int handle)
3413 return TC_H_MAJ(handle) >> 16;
3416 /* Returns the minor number from 'handle'. */
/* The minor part is whatever TC_H_MIN selects; no shift is applied. */
3418 tc_get_minor(unsigned int handle)
3420 return TC_H_MIN(handle);
/* Starts a traffic-control Netlink request for 'netdev' in 'request'.
 *
 * The interface index is obtained with get_ifindex().  'request' is
 * initialized with room for 512 bytes, given a Netlink header of the
 * specified 'type' with flags NLM_F_REQUEST | 'flags', and a zeroed struct
 * tcmsg whose 'tcm_family' is AF_UNSPEC and whose 'tcm_ifindex' is that
 * interface index.  The function's return type is a pointer to struct
 * tcmsg; the caller is expected to fill in 'tcm_handle' and 'tcm_parent'.
 *
 * NOTE(review): callers in this file pass the buffer to tc_transact(), which
 * uninitializes it; how a get_ifindex() failure is reported is not on a
 * visible line -- confirm. */
3423 static struct tcmsg *
3424 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3425 struct ofpbuf *request)
3427 struct tcmsg *tcmsg;
3431 error = get_ifindex(netdev, &ifindex);
3436 ofpbuf_init(request, 512);
3437 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3438 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3439 tcmsg->tcm_family = AF_UNSPEC;
3440 tcmsg->tcm_ifindex = ifindex;
3441 /* Caller should fill in tcmsg->tcm_handle. */
3442 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on 'rtnl_sock' with nl_sock_transact(), passing 'replyp'
 * through for the reply, and then uninitializes 'request' with
 * ofpbuf_uninit() whatever the outcome, so callers must not reuse the
 * buffer afterward. */
3448 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3450 int error = nl_sock_transact(rtnl_sock, request, replyp);
3451 ofpbuf_uninit(request);
/* The code that follows reads four hexadecimal scheduler parameters (a, b,
 * c, d) from /proc/net/psched and derives 'ticks_per_s' as
 * (double) a * c / b.  Failure to open or parse the file, and parameter
 * sets rejected by the checks, are reported with VLOG_WARN; the parameters
 * and the derived values are also logged with VLOG_DBG.
 *
 * NOTE(review): the assignment to 'buffer_hz' is not on a visible line; the
 * tables in the comment below list it as equal to parameter "d" --
 * confirm. */
3458 /* The values in psched are not individually very meaningful, but they are
3459 * important. The tables below show some values seen in the wild.
3463 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3464 * (Before that, there are hints that it was 1000000000.)
3466 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3470 * -----------------------------------
3471 * [1] 000c8000 000f4240 000f4240 00000064
3472 * [2] 000003e8 00000400 000f4240 3b9aca00
3473 * [3] 000003e8 00000400 000f4240 3b9aca00
3474 * [4] 000003e8 00000400 000f4240 00000064
3475 * [5] 000003e8 00000040 000f4240 3b9aca00
3476 * [6] 000003e8 00000040 000f4240 000000f9
3478 * a b c d ticks_per_s buffer_hz
3479 * ------- --------- ---------- ------------- ----------- -------------
3480 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3481 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3482 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3483 * [4] 1,000 1,024 1,000,000 100 976,562 100
3484 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3485 * [6] 1,000 64 1,000,000 249 15,625,000 249
3487 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3488 * [2] 2.6.26-1-686-bigmem from Debian lenny
3489 * [3] 2.6.26-2-sparc64 from Debian lenny
3490 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3491 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3492 * [6] 2.6.34 from kernel.org on KVM
3494 static const char fn[] = "/proc/net/psched";
3495 unsigned int a, b, c, d;
3501 stream = fopen(fn, "r");
3503 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3507 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3508 VLOG_WARN("%s: read failed", fn);
3512 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3516 VLOG_WARN("%s: invalid scheduler parameters", fn);
3520 ticks_per_s = (double) a * c / b;
3524 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3527 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3530 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3531 * rate of 'rate' bytes per second. */
/* NOTE(review): 'rate * ticks' below is a product of two unsigned ints that
 * is evaluated before the division by the double 'ticks_per_s', so it wraps
 * around once it exceeds UINT_MAX.  tc_bytes_to_ticks() widens to unsigned
 * long long before multiplying; consider converting one operand to a wider
 * type (or to double) here as well. */
3533 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3538 return (rate * ticks) / ticks_per_s;
3541 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3542 * rate of 'rate' bytes per second. */
/* A 'rate' of 0 yields 0 instead of dividing by zero.  'ticks_per_s' is
 * converted to unsigned long long (dropping any fractional part) before it
 * is multiplied by 'size', so the product is computed in that wider type. */
3544 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3549 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3552 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3553 * a transmission rate of 'rate' bytes per second. */
/* The result is the integer quotient 'rate' / 'buffer_hz'.
 *
 * NOTE(review): confirm that 'buffer_hz' is guaranteed to be nonzero by the
 * time the division runs. */
3555 tc_buffer_per_jiffy(unsigned int rate)
3560 return rate / buffer_hz;
3563 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3564 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3565 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3566 * stores NULL into it if it is absent.
3568 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3571 * Returns 0 if successful, otherwise a positive errno value. */
/* The attributes are parsed starting NLMSG_HDRLEN + sizeof(struct tcmsg)
 * bytes into 'msg'.  The policy requires a TCA_KIND string and accepts an
 * optional nested TCA_OPTIONS; a message that does not satisfy it produces
 * a rate-limited warning. */
3573 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3574 struct nlattr **options)
3576 static const struct nl_policy tca_policy[] = {
3577 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3578 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3580 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3582 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3583 tca_policy, ta, ARRAY_SIZE(ta))) {
3584 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3589 *kind = nl_attr_get_string(ta[TCA_KIND]);
3593 *options = ta[TCA_OPTIONS];
3608 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3609 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3610 * into '*options', and its queue statistics into '*stats'. Any of the output
3611 * arguments may be null.
3613 * Returns 0 if successful, otherwise a positive errno value. */
/* NOTE(review): the comment above names '*queue_id', but the parameter is
 * 'handlep' and what is stored through it is the raw 'tcm_handle' from the
 * message's struct tcmsg; callers in this file derive the queue ID from the
 * handle's minor number themselves.
 *
 * Both TCA_OPTIONS and TCA_STATS2 are required, nested attributes.  Inside
 * TCA_STATS2, TCA_STATS_BASIC and TCA_STATS_QUEUE are required as well;
 * they supply 'tx_bytes' and 'tx_packets' (from the basic counters) and
 * 'tx_errors' (from the queue's drop count). */
3615 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3616 struct nlattr **options, struct netdev_queue_stats *stats)
3618 static const struct nl_policy tca_policy[] = {
3619 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3620 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3622 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3624 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3625 tca_policy, ta, ARRAY_SIZE(ta))) {
3626 VLOG_WARN_RL(&rl, "failed to parse class message");
3631 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3632 *handlep = tc->tcm_handle;
3636 *options = ta[TCA_OPTIONS];
3640 const struct gnet_stats_queue *gsq;
3641 struct gnet_stats_basic gsb;
3643 static const struct nl_policy stats_policy[] = {
3644 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3645 .min_len = sizeof gsb },
3646 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3647 .min_len = sizeof *gsq },
3649 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3651 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3652 sa, ARRAY_SIZE(sa))) {
3653 VLOG_WARN_RL(&rl, "failed to parse class stats");
3657 /* Alignment issues screw up the length of struct gnet_stats_basic on
3658 * some arch/bitsize combinations. Newer versions of Linux have a
3659 * struct gnet_stats_basic_packed, but we can't depend on that. The
3660 * easiest thing to do is just to make a copy. */
3661 memset(&gsb, 0, sizeof gsb);
3662 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3663 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3664 stats->tx_bytes = gsb.bytes;
3665 stats->tx_packets = gsb.packets;
3667 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3668 stats->tx_errors = gsq->drops;
/* NOTE(review): the statement below clears '*stats'; presumably it belongs
 * to the failure path so that callers never see partial counters --
 * confirm. */
3678 memset(stats, 0, sizeof *stats);
/* tc_query_class() builds an RTM_GETTCLASS request with NLM_F_ECHO for
 * 'netdev', sets 'tcm_handle' and 'tcm_parent' from its arguments, and
 * performs it with tc_transact(), which receives 'replyp' for the reply.
 * The visible VLOG_WARN_RL call logs the device name together with the
 * major and minor numbers of both 'handle' and 'parent'. */
3683 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3686 tc_query_class(const struct netdev *netdev,
3687 unsigned int handle, unsigned int parent,
3688 struct ofpbuf **replyp)
3690 struct ofpbuf request;
3691 struct tcmsg *tcmsg;
3694 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3698 tcmsg->tcm_handle = handle;
3699 tcmsg->tcm_parent = parent;
3701 error = tc_transact(&request, replyp);
3703 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3704 netdev_get_name(netdev),
3705 tc_get_major(handle), tc_get_minor(handle),
3706 tc_get_major(parent), tc_get_minor(parent),
3712 /* Equivalent to "tc class del dev <name> handle <handle>". */
/* Builds an RTM_DELTCLASS request with no extra flags, 'tcm_handle' set to
 * 'handle' and 'tcm_parent' set to 0, and performs it with tc_transact()
 * without asking for a reply.  The visible VLOG_WARN_RL call logs the
 * device name and the handle's major and minor numbers. */
3714 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3716 struct ofpbuf request;
3717 struct tcmsg *tcmsg;
3720 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3724 tcmsg->tcm_handle = handle;
3725 tcmsg->tcm_parent = 0;
3727 error = tc_transact(&request, NULL);
3729 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3730 netdev_get_name(netdev),
3731 tc_get_major(handle), tc_get_minor(handle),
3737 /* Equivalent to "tc qdisc del dev <name> root". */
/* Builds an RTM_DELQDISC request for handle 1:0 under TC_H_ROOT and performs
 * it with tc_transact().  EINVAL from the kernel gets special handling (see
 * the comment at that check).  When no error remains and the device has a
 * tc object, that object's 'tc_destroy' operation is called if the ops
 * table provides one, and the device's 'tc' pointer is reset to NULL. */
3739 tc_del_qdisc(struct netdev *netdev)
3741 struct netdev_dev_linux *netdev_dev =
3742 netdev_dev_linux_cast(netdev_get_dev(netdev));
3743 struct ofpbuf request;
3744 struct tcmsg *tcmsg;
3747 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
3751 tcmsg->tcm_handle = tc_make_handle(1, 0);
3752 tcmsg->tcm_parent = TC_H_ROOT;
3754 error = tc_transact(&request, NULL);
3755 if (error == EINVAL) {
3756 /* EINVAL probably means that the default qdisc was in use, in which
3757 * case we've accomplished our purpose. */
3760 if (!error && netdev_dev->tc) {
3761 if (netdev_dev->tc->ops->tc_destroy) {
3762 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3764 netdev_dev->tc = NULL;
3769 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3770 * kernel to determine what they are. Returns 0 if successful, otherwise a
3771 * positive errno value. */
/* "Known" here means that the device already has a 'tc' object.
 *
 * The choice of tc_ops made by the visible statements is:
 *
 *   - the entry returned by tc_lookup_linux_name() for the qdisc kind
 *     reported by the kernel;
 *
 *   - tc_ops_other when the reply cannot be parsed, when the kind is
 *     unknown (logged at INFO level), or when the query fails with an
 *     error other than ENOENT (logged as a warning);
 *
 *   - tc_ops_default when the query fails with ENOENT.
 *
 * The chosen table's 'tc_load' is then invoked, and the function returns the
 * query error if there was one, otherwise the load error. */
3773 tc_query_qdisc(const struct netdev *netdev)
3775 struct netdev_dev_linux *netdev_dev =
3776 netdev_dev_linux_cast(netdev_get_dev(netdev));
3777 struct ofpbuf request, *qdisc;
3778 const struct tc_ops *ops;
3779 struct tcmsg *tcmsg;
3783 if (netdev_dev->tc) {
3787 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3788 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3789 * 2.6.35 without that fix backported to it.
3791 * To avoid the OOPS, we must not make a request that would attempt to dump
3792 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3793 * few others. There are a few ways that I can see to do this, but most of
3794 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3795 * technique chosen here is to assume that any non-default qdisc that we
3796 * create will have a class with handle 1:0. The built-in qdiscs only have
3797 * a class with handle 0:0.
3799 * We could check for Linux 2.6.35+ and use a more straightforward method
3801 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
3805 tcmsg->tcm_handle = tc_make_handle(1, 0);
3806 tcmsg->tcm_parent = 0;
3808 /* Figure out what tc class to instantiate. */
3809 error = tc_transact(&request, &qdisc);
3813 error = tc_parse_qdisc(qdisc, &kind, NULL);
3815 ops = &tc_ops_other;
3817 ops = tc_lookup_linux_name(kind);
3819 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3820 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3822 ops = &tc_ops_other;
3825 } else if (error == ENOENT) {
3826 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3827 * other entity that doesn't have a handle 1:0. We will assume
3828 * that it's the system default qdisc. */
3829 ops = &tc_ops_default;
3832 /* Who knows? Maybe the device got deleted. */
3833 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3834 netdev_get_name(netdev), strerror(error));
3835 ops = &tc_ops_other;
3838 /* Instantiate it. */
/* 'netdev' is const in this function's signature, but tc_load takes a
 * non-const pointer, hence the cast.  The assertion checks that tc_load
 * left a tc object attached exactly when it reported success. */
3839 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
3840 assert((load_error == 0) == (netdev_dev->tc != NULL));
3841 ofpbuf_delete(qdisc);
3843 return error ? error : load_error;
3846 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3847 approximate the time to transmit packets of various lengths. For an MTU of
3848 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3849 represents two possible packet lengths; for a MTU of 513 through 1024, four
3850 possible lengths; and so on.
3852 Returns, for the specified 'mtu', the number of bits that packet lengths
3853 need to be shifted right to fit within such a 256-entry table. */
/* ETH_HEADER_LEN + VLAN_HEADER_LEN is added to the MTU before the shift
 * count is computed, and 'cell_log' counts the loop iterations performed
 * while the working value is still >= 256.  ETH_PAYLOAD_MAX is substituted
 * for the MTU on a path whose condition is not shown here. */
3855 tc_calc_cell_log(unsigned int mtu)
3860 mtu = ETH_PAYLOAD_MAX;
3862 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
3864 for (cell_log = 0; mtu >= 256; cell_log++) {
/* tc_fill_rate() zeroes '*rate' and then sets 'cell_log' from
 * tc_calc_cell_log(mtu) and 'mpu' to ETH_TOTAL_MIN.  'overhead' and
 * 'cell_align' are left at the zero written by memset() because, per the
 * inline remarks, some distribution headers lack those members.
 *
 * NOTE(review): the statement that stores 'Bps' into '*rate' is not on a
 * visible line -- confirm. */
3871 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
3874 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
3876 memset(rate, 0, sizeof *rate);
3877 rate->cell_log = tc_calc_cell_log(mtu);
3878 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3879 /* rate->cell_align = 0; */ /* distro headers. */
3880 rate->mpu = ETH_TOTAL_MIN;
3884 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
3885 * attribute of the specified "type".
3887 * See tc_calc_cell_log() above for a description of "rtab"s. */
/* The attribute occupies TC_RTAB_SIZE bytes.  Entry i holds the number of
 * ticks, per tc_bytes_to_ticks(), needed at 'rate->rate' to send a packet
 * of (i + 1) << rate->cell_log bytes, with packet sizes below 'rate->mpu'
 * rounded up to 'rate->mpu'. */
3889 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
3894 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
3895 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
3896 unsigned packet_size = (i + 1) << rate->cell_log;
3897 if (packet_size < rate->mpu) {
3898 packet_size = rate->mpu;
3900 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
3904 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
3905 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
3906 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
 * 0 is fine.) */
3909 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
     /* Lower bound on the burst: the result of tc_buffer_per_jiffy() for
      * 'Bps', plus one 'mtu'. */
3911 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
     /* Take whichever is larger, the requested burst or the lower bound, and
      * return what tc_bytes_to_ticks() yields for it at 'Bps'.  A
      * 'burst_bytes' of 0 therefore selects the lower bound. */
3912 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
3916 /* Utility functions. */
/* Fills 'stats' with the link counters carried in the IFLA_STATS attribute of
 * the reply to an RTM_GETLINK request for the interface whose index is
 * 'ifindex'.
 *
 * NOTE(review): the return type, the handling of a nonzero 'error' from
 * nl_sock_transact(), and every return statement are not visible here;
 * confirm them against the complete function. */
3919 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
3921 /* Policy for RTNLGRP_LINK messages.
3923 * There are *many* more fields in these messages, but currently we only
3924 * care about these fields. */
3925 static const struct nl_policy rtnlgrp_link_policy[] = {
     /* The interface name is mandatory.  The stats attribute is optional, but
      * when present it must hold at least a struct rtnl_link_stats. */
3926 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
3927 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
3928 .min_len = sizeof(struct rtnl_link_stats) },
3931 struct ofpbuf request;
3932 struct ofpbuf *reply;
3933 struct ifinfomsg *ifi;
3934 const struct rtnl_link_stats *rtnl_stats;
3935 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
     /* Build the request: a Netlink header followed by a zeroed ifinfomsg
      * that selects the interface by index. */
3938 ofpbuf_init(&request, 0);
3939 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
3940 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
3941 ifi->ifi_family = PF_UNSPEC;
3942 ifi->ifi_index = ifindex;
     /* Send the request on 'rtnl_sock'.  The request buffer is released
      * right after the transaction, before 'error' is looked at. */
3943 error = nl_sock_transact(rtnl_sock, &request, &reply);
3944 ofpbuf_uninit(&request);
     /* Parse the attributes that follow the Netlink header and the
      * ifinfomsg.  On a parse failure the reply is freed. */
3949 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
3950 rtnlgrp_link_policy,
3951 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
3952 ofpbuf_delete(reply);
     /* IFLA_STATS is optional in the policy, so its absence has to be checked
      * explicitly; warn and free the reply when it is missing. */
3956 if (!attrs[IFLA_STATS]) {
3957 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
3958 ofpbuf_delete(reply);
     /* Copy each counter across, field by field, into the same-named member
      * of 'stats'. */
3962 rtnl_stats = nl_attr_get(attrs[IFLA_STATS]);
3963 stats->rx_packets = rtnl_stats->rx_packets;
3964 stats->tx_packets = rtnl_stats->tx_packets;
3965 stats->rx_bytes = rtnl_stats->rx_bytes;
3966 stats->tx_bytes = rtnl_stats->tx_bytes;
3967 stats->rx_errors = rtnl_stats->rx_errors;
3968 stats->tx_errors = rtnl_stats->tx_errors;
3969 stats->rx_dropped = rtnl_stats->rx_dropped;
3970 stats->tx_dropped = rtnl_stats->tx_dropped;
3971 stats->multicast = rtnl_stats->multicast;
3972 stats->collisions = rtnl_stats->collisions;
3973 stats->rx_length_errors = rtnl_stats->rx_length_errors;
3974 stats->rx_over_errors = rtnl_stats->rx_over_errors;
3975 stats->rx_crc_errors = rtnl_stats->rx_crc_errors;
3976 stats->rx_frame_errors = rtnl_stats->rx_frame_errors;
3977 stats->rx_fifo_errors = rtnl_stats->rx_fifo_errors;
3978 stats->rx_missed_errors = rtnl_stats->rx_missed_errors;
3979 stats->tx_aborted_errors = rtnl_stats->tx_aborted_errors;
3980 stats->tx_carrier_errors = rtnl_stats->tx_carrier_errors;
3981 stats->tx_fifo_errors = rtnl_stats->tx_fifo_errors;
3982 stats->tx_heartbeat_errors = rtnl_stats->tx_heartbeat_errors;
3983 stats->tx_window_errors = rtnl_stats->tx_window_errors;
     /* 'rtnl_stats' points into the reply, so the reply is freed only after
      * the last field has been copied. */
3985 ofpbuf_delete(reply);
/* Fills 'stats' with the counters for 'netdev_name' by scanning the text of
 * /proc/net/dev one line at a time.
 *
 * NOTE(review): the local declarations, the start of the sscanf() call, the
 * fclose() call and every return statement are not visible here; confirm them
 * against the complete function. */
3991 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
3993 static const char fn[] = "/proc/net/dev";
3998 stream = fopen(fn, "r");
     /* Logged with the errno text when the open fails (the guarding test is
      * not visible here). */
4000 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4005 while (fgets(line, sizeof line, stream)) {
4008 #define X64 "%"SCNu64
     /* X64 is the scanf conversion for one uint64_t counter.  Each of the
      * two format rows below reads seven such counters and then skips one
      * unsigned field with an assignment-suppressed conversion. */
4011 X64 X64 X64 X64 X64 X64 X64 "%*u"
4012 X64 X64 X64 X64 X64 X64 X64 "%*u",
4018 &stats->rx_fifo_errors,
4019 &stats->rx_frame_errors,
4025 &stats->tx_fifo_errors,
     /* A line is accepted only when all 15 conversions succeed; presumably
      * that is the device name plus the fourteen counters -- confirm. */
4027 &stats->tx_carrier_errors) != 15) {
4028 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4029 } else if (!strcmp(devname, netdev_name)) {
     /* This is the line for the requested device.  The counters below are not
      * among the sscanf() targets visible above, so they are set to
      * UINT64_MAX; presumably that marks them as unavailable -- confirm. */
4030 stats->rx_length_errors = UINT64_MAX;
4031 stats->rx_over_errors = UINT64_MAX;
4032 stats->rx_crc_errors = UINT64_MAX;
4033 stats->rx_missed_errors = UINT64_MAX;
4034 stats->tx_aborted_errors = UINT64_MAX;
4035 stats->tx_heartbeat_errors = UINT64_MAX;
4036 stats->tx_window_errors = UINT64_MAX;
     /* NOTE(review): presumably reached only when no line matched
      * 'netdev_name'; the surrounding control flow is not visible here. */
4042 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Reads the interface flags of 'netdev' with a SIOCGIFFLAGS request issued
 * through netdev_linux_do_ioctl(), and stores ifr.ifr_flags in '*flags'.
 *
 * NOTE(review): the end of the ioctl call and the return statement are not
 * visible here; presumably the value in 'error' is returned -- confirm. */
4048 get_flags(const struct netdev *netdev, int *flags)
4053 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
4055 *flags = ifr.ifr_flags;
/* Sets the interface flags of 'netdev' to 'flags' with a SIOCSIFFLAGS request
 * issued through netdev_linux_do_ioctl(), and returns that call's result.
 *
 * 'flags' replaces the whole flag word.  No visible line here reads the
 * current flags first, so the caller has to pass the complete set. */
4060 set_flags(struct netdev *netdev, int flags)
4064 ifr.ifr_flags = flags;
4065 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
/* Looks up the interface index of the device named 'netdev_name' with a
 * SIOCGIFINDEX ioctl on 'af_inet_sock' and returns ifr.ifr_ifindex.
 *
 * NOTE(review): the value returned after a failed ioctl is not visible here;
 * confirm it against the complete function. */
4070 do_get_ifindex(const char *netdev_name)
4074 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
     /* Count every lookup, so the coverage counter shows how often this ioctl
      * is issued. */
4075 COVERAGE_INC(netdev_get_ifindex);
4076 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4077 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4078 netdev_name, strerror(errno));
4081 return ifr.ifr_ifindex;
/* Stores the interface index of 'netdev_' in '*ifindexp', using the value
 * cached in the underlying netdev_dev_linux when the VALID_IFINDEX bit of
 * 'cache_valid' is set.
 *
 * NOTE(review): the handling of a failed lookup and the return statements are
 * not visible here; confirm them against the complete function. */
4085 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4087 struct netdev_dev_linux *netdev_dev =
4088 netdev_dev_linux_cast(netdev_get_dev(netdev_));
     /* Cache miss: ask the kernel through do_get_ifindex(). */
4090 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4091 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
     /* Remember the result, so later calls skip the ioctl. */
4095 netdev_dev->cache_valid |= VALID_IFINDEX;
4096 netdev_dev->ifindex = ifindex;
4098 *ifindexp = netdev_dev->ifindex;
/* Reads the hardware address of the device named 'netdev_name' with a
 * SIOCGIFHWADDR ioctl on 'af_inet_sock' and copies ETH_ADDR_LEN bytes of it
 * into 'ea'.
 *
 * NOTE(review): the return statements are not visible here; confirm the
 * error value returned after a failed ioctl against the complete function. */
4103 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
     /* Zero the request before filling in the device name. */
4108 memset(&ifr, 0, sizeof ifr);
4109 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4110 COVERAGE_INC(netdev_get_hwaddr);
4111 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4112 VLOG_ERR("ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4113 netdev_name, strerror(errno));
     /* Warn when the reported address family is neither AF_UNSPEC nor
      * ARPHRD_ETHER. */
4116 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4117 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4118 VLOG_WARN("%s device has unknown hardware address family %d",
4119 netdev_name, hwaddr_family);
4121 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware address of the device named 'netdev_name' to the
 * ETH_ADDR_LEN bytes in 'mac', tagged with address family 'hwaddr_family',
 * using a SIOCSIFHWADDR ioctl on 'af_inet_sock'.
 *
 * NOTE(review): the return statements are not visible here; confirm the
 * success and failure return values against the complete function. */
4126 set_etheraddr(const char *netdev_name, int hwaddr_family,
4127 const uint8_t mac[ETH_ADDR_LEN])
     /* Zero the request, then fill in the device name, family and address. */
4131 memset(&ifr, 0, sizeof ifr);
4132 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4133 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4134 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4135 COVERAGE_INC(netdev_set_hwaddr);
     /* A failure is logged with VLOG_ERR, which here carries no rate limit
      * argument. */
4136 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4137 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4138 netdev_name, strerror(errno));
/* Issues a SIOCETHTOOL ioctl on 'af_inet_sock' for the device named 'name',
 * passing 'ecmd' as the request payload.  'cmd_name' is used only in the
 * warning logged on failure.
 *
 * NOTE(review): no visible line here stores 'cmd' into 'ecmd', and the return
 * statements are not visible either; confirm both against the complete
 * function. */
4145 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4146 int cmd, const char *cmd_name)
     /* Zero the request, name the device, and point the payload at 'ecmd'. */
4150 memset(&ifr, 0, sizeof ifr);
4151 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4152 ifr.ifr_data = (caddr_t) ecmd;
4155 COVERAGE_INC(netdev_ethtool);
4156 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
     /* Failure path: warn for every errno except EOPNOTSUPP. */
4159 if (errno != EOPNOTSUPP) {
4160 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4161 "failed: %s", cmd_name, name, strerror(errno));
4163 /* The device doesn't support this operation. That's pretty
4164 * common, so there's no point in logging anything. */
/* Copies 'name' into 'ifr->ifr_name' and issues ioctl 'cmd' on 'af_inet_sock'
 * with 'ifr' as the argument.  'cmd_name' is used only in the debug message
 * logged when the ioctl returns -1.
 *
 * The caller fills in every other field of 'ifr' that 'cmd' reads; only the
 * name is written here before the call.
 *
 * NOTE(review): the end of the log call and the return statements are not
 * visible here; confirm the return values against the complete function. */
4171 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4172 const char *cmd_name)
4174 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4175 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4176 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4184 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4185 int cmd, const char *cmd_name)
4190 ifr.ifr_addr.sa_family = AF_INET;
4191 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4193 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4194 *ip = sin->sin_addr;