2 * Copyright (c) 2009, 2010, 2011 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
24 #include <arpa/inet.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
30 #include <linux/types.h>
31 #include <linux/ethtool.h>
32 #include <linux/mii.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <linux/version.h>
37 #include <sys/types.h>
38 #include <sys/ioctl.h>
39 #include <sys/socket.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
52 #include "dpif-linux.h"
53 #include "dynamic-string.h"
54 #include "fatal-signal.h"
57 #include "netdev-provider.h"
58 #include "netdev-vport.h"
60 #include "netlink-notifier.h"
61 #include "netlink-socket.h"
63 #include "openflow/openflow.h"
65 #include "poll-loop.h"
66 #include "rtnetlink-link.h"
67 #include "socket-util.h"
73 VLOG_DEFINE_THIS_MODULE(netdev_linux);
75 COVERAGE_DEFINE(netdev_get_vlan_vid);
76 COVERAGE_DEFINE(netdev_set_policing);
77 COVERAGE_DEFINE(netdev_arp_lookup);
78 COVERAGE_DEFINE(netdev_get_ifindex);
79 COVERAGE_DEFINE(netdev_get_hwaddr);
80 COVERAGE_DEFINE(netdev_set_hwaddr);
81 COVERAGE_DEFINE(netdev_ethtool);
83 /* These were introduced in Linux 2.6.14, so they might be missing if we have
85 #ifndef ADVERTISED_Pause
86 #define ADVERTISED_Pause (1 << 13)
88 #ifndef ADVERTISED_Asym_Pause
89 #define ADVERTISED_Asym_Pause (1 << 14)
92 /* These were introduced in Linux 2.6.24, so they might be missing if we
93 * have old headers. */
94 #ifndef ETHTOOL_GFLAGS
95 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
97 #ifndef ETHTOOL_SFLAGS
98 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
101 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
104 #define TC_RTAB_SIZE 1024
107 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
108 static int cache_notifier_refcount;
111 VALID_IFINDEX = 1 << 0,
112 VALID_ETHERADDR = 1 << 1,
116 VALID_CARRIER = 1 << 5,
117 VALID_POLICING = 1 << 6,
118 VALID_HAVE_VPORT_STATS = 1 << 7
126 /* Traffic control. */
128 /* An instance of a traffic control class. Always associated with a particular
131 * Each TC implementation subclasses this with whatever additional data it
134 const struct tc_ops *ops;
135 struct hmap queues; /* Contains "struct tc_queue"s.
136 * Read by generic TC layer.
137 * Written only by TC implementation. */
140 /* One traffic control queue.
142 * Each TC implementation subclasses this with whatever additional data it
145 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
146 unsigned int queue_id; /* OpenFlow queue ID. */
149 /* A particular kind of traffic control. Each implementation generally maps to
150 * one particular Linux qdisc class.
152 * The functions below return 0 if successful or a positive errno value on
153 * failure, except where otherwise noted. All of them must be provided, except
154 * where otherwise noted. */
156 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
157 * This is null for tc_ops_default and tc_ops_other, for which there are no
158 * appropriate values. */
159 const char *linux_name;
161 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
162 const char *ovs_name;
164 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
165 * queues. The queues are numbered 0 through n_queues - 1. */
166 unsigned int n_queues;
168 /* Called to install this TC class on 'netdev'. The implementation should
169 * make the Netlink calls required to set up 'netdev' with the right qdisc
170 * and configure it according to 'details'. The implementation may assume
171 * that the current qdisc is the default; that is, there is no need for it
172 * to delete the current qdisc before installing itself.
174 * The contents of 'details' should be documented as valid for 'ovs_name'
175 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
176 * (which is built as ovs-vswitchd.conf.db(8)).
178 * This function must return 0 if and only if it sets 'netdev->tc' to an
179 * initialized 'struct tc'.
181 * (This function is null for tc_ops_other, which cannot be installed. For
182 * other TC classes it should always be nonnull.) */
183 int (*tc_install)(struct netdev *netdev, const struct shash *details);
185 /* Called when the netdev code determines (through a Netlink query) that
186 * this TC class's qdisc is installed on 'netdev', but we didn't install
187 * it ourselves and so don't know any of the details.
189 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
190 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
191 * implementation should parse the other attributes of 'nlmsg' as
192 * necessary to determine its configuration. If necessary it should also
193 * use Netlink queries to determine the configuration of queues on
196 * This function must return 0 if and only if it sets 'netdev->tc' to an
197 * initialized 'struct tc'. */
198 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
200 /* Destroys the data structures allocated by the implementation as part of
201 * 'tc'. (This includes destroying 'tc->queues' by calling
204 * The implementation should not need to perform any Netlink calls. If
205 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
206 * (But it may not be desirable.)
208 * This function may be null if 'tc' is trivial. */
209 void (*tc_destroy)(struct tc *tc);
211 /* Retrieves details of 'netdev->tc' configuration into 'details'.
213 * The implementation should not need to perform any Netlink calls, because
214 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
215 * cached the configuration.
217 * The contents of 'details' should be documented as valid for 'ovs_name'
218 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
219 * (which is built as ovs-vswitchd.conf.db(8)).
221 * This function may be null if 'tc' is not configurable.
223 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
225 /* Reconfigures 'netdev->tc' according to 'details', performing any
226 * required Netlink calls to complete the reconfiguration.
228 * The contents of 'details' should be documented as valid for 'ovs_name'
229 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
230 * (which is built as ovs-vswitchd.conf.db(8)).
232 * This function may be null if 'tc' is not configurable.
234 int (*qdisc_set)(struct netdev *, const struct shash *details);
236 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
237 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
239 * The contents of 'details' should be documented as valid for 'ovs_name'
240 * in the "other_config" column in the "Queue" table in
241 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
243 * The implementation should not need to perform any Netlink calls, because
244 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
245 * cached the queue configuration.
247 * This function may be null if 'tc' does not have queues ('n_queues' is
249 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
250 struct shash *details);
252 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
253 * 'details', performing any required Netlink calls to complete the
254 * reconfiguration. The caller ensures that 'queue_id' is less than
257 * The contents of 'details' should be documented as valid for 'ovs_name'
258 * in the "other_config" column in the "Queue" table in
259 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
261 * This function may be null if 'tc' does not have queues or its queues are
262 * not configurable. */
263 int (*class_set)(struct netdev *, unsigned int queue_id,
264 const struct shash *details);
266 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
267 * tc_queue's within 'netdev->tc->queues'.
269 * This function may be null if 'tc' does not have queues or its queues
270 * cannot be deleted. */
271 int (*class_delete)(struct netdev *, struct tc_queue *queue);
273 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
274 * 'struct tc_queue's within 'netdev->tc->queues'.
276 * On success, initializes '*stats'.
278 * This function may be null if 'tc' does not have queues or if it cannot
279 * report queue statistics. */
280 int (*class_get_stats)(const struct netdev *netdev,
281 const struct tc_queue *queue,
282 struct netdev_queue_stats *stats);
284 /* Extracts queue stats from 'nlmsg', which is a response to a
285 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
287 * This function may be null if 'tc' does not have queues or if it cannot
288 * report queue statistics. */
289 int (*class_dump_stats)(const struct netdev *netdev,
290 const struct ofpbuf *nlmsg,
291 netdev_dump_queue_stats_cb *cb, void *aux);
295 tc_init(struct tc *tc, const struct tc_ops *ops)
298 hmap_init(&tc->queues);
302 tc_destroy(struct tc *tc)
304 hmap_destroy(&tc->queues);
307 static const struct tc_ops tc_ops_htb;
308 static const struct tc_ops tc_ops_hfsc;
309 static const struct tc_ops tc_ops_default;
310 static const struct tc_ops tc_ops_other;
312 static const struct tc_ops *tcs[] = {
313 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
314 &tc_ops_hfsc, /* Hierarchical fair service curve. */
315 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
316 &tc_ops_other, /* Some other qdisc. */
320 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
321 static unsigned int tc_get_major(unsigned int handle);
322 static unsigned int tc_get_minor(unsigned int handle);
324 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
325 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
326 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
328 static struct tcmsg *tc_make_request(const struct netdev *, int type,
329 unsigned int flags, struct ofpbuf *);
330 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
332 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
333 struct nlattr **options);
334 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
335 struct nlattr **options,
336 struct netdev_queue_stats *);
337 static int tc_query_class(const struct netdev *,
338 unsigned int handle, unsigned int parent,
339 struct ofpbuf **replyp);
340 static int tc_delete_class(const struct netdev *, unsigned int handle);
342 static int tc_del_qdisc(struct netdev *netdev);
343 static int tc_query_qdisc(const struct netdev *netdev);
345 static int tc_calc_cell_log(unsigned int mtu);
346 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
347 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
348 const struct tc_ratespec *rate);
349 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
351 struct netdev_dev_linux {
352 struct netdev_dev netdev_dev;
354 struct shash_node *shash_node;
355 unsigned int cache_valid;
356 unsigned int change_seq;
358 bool miimon; /* Link status of last poll. */
359 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
360 struct timer miimon_timer;
362 /* The following are figured out "on demand" only. They are only valid
363 * when the corresponding VALID_* bit in 'cache_valid' is set. */
365 uint8_t etheraddr[ETH_ADDR_LEN];
366 struct in_addr address, netmask;
370 uint32_t kbits_rate; /* Policing data. */
371 uint32_t kbits_burst;
372 bool have_vport_stats;
376 struct tap_state tap;
380 struct netdev_linux {
381 struct netdev netdev;
385 /* Sockets used for ioctl operations. */
386 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
388 /* A Netlink routing socket that is not subscribed to any multicast groups. */
389 static struct nl_sock *rtnl_sock;
391 /* This is set pretty low because we probably won't learn anything from the
392 * additional log messages. */
393 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
395 static int netdev_linux_init(void);
397 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
398 int cmd, const char *cmd_name);
399 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
400 const char *cmd_name);
401 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
402 int cmd, const char *cmd_name);
403 static int get_flags(const struct netdev *, int *flagsp);
404 static int set_flags(struct netdev *, int flags);
405 static int do_get_ifindex(const char *netdev_name);
406 static int get_ifindex(const struct netdev *, int *ifindexp);
407 static int do_set_addr(struct netdev *netdev,
408 int ioctl_nr, const char *ioctl_name,
409 struct in_addr addr);
410 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
411 static int set_etheraddr(const char *netdev_name, int hwaddr_family,
412 const uint8_t[ETH_ADDR_LEN]);
413 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
414 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
415 static int af_packet_sock(void);
416 static void netdev_linux_miimon_run(void);
417 static void netdev_linux_miimon_wait(void);
420 is_netdev_linux_class(const struct netdev_class *netdev_class)
422 return netdev_class->init == netdev_linux_init;
425 static struct netdev_dev_linux *
426 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
428 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
429 assert(is_netdev_linux_class(netdev_class));
431 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
434 static struct netdev_linux *
435 netdev_linux_cast(const struct netdev *netdev)
437 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
438 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
439 assert(is_netdev_linux_class(netdev_class));
441 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
445 netdev_linux_init(void)
447 static int status = -1;
449 /* Create AF_INET socket. */
450 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
451 status = af_inet_sock >= 0 ? 0 : errno;
453 VLOG_ERR("failed to create inet socket: %s", strerror(status));
456 /* Create rtnetlink socket. */
458 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
460 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
469 netdev_linux_run(void)
471 rtnetlink_link_run();
472 netdev_linux_miimon_run();
476 netdev_linux_wait(void)
478 rtnetlink_link_wait();
479 netdev_linux_miimon_wait();
483 netdev_dev_linux_changed(struct netdev_dev_linux *dev)
486 if (!dev->change_seq) {
489 dev->cache_valid = 0;
493 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
494 void *aux OVS_UNUSED)
496 struct netdev_dev_linux *dev;
498 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
500 const struct netdev_class *netdev_class =
501 netdev_dev_get_class(base_dev);
503 if (is_netdev_linux_class(netdev_class)) {
504 dev = netdev_dev_linux_cast(base_dev);
505 netdev_dev_linux_changed(dev);
509 struct shash device_shash;
510 struct shash_node *node;
512 shash_init(&device_shash);
513 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
514 SHASH_FOR_EACH (node, &device_shash) {
516 netdev_dev_linux_changed(dev);
518 shash_destroy(&device_shash);
522 /* Creates system and internal devices. */
524 netdev_linux_create(const struct netdev_class *class, const char *name,
525 struct netdev_dev **netdev_devp)
527 struct netdev_dev_linux *netdev_dev;
529 if (!cache_notifier_refcount) {
530 assert(!netdev_linux_cache_notifier);
532 netdev_linux_cache_notifier =
533 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
535 if (!netdev_linux_cache_notifier) {
539 cache_notifier_refcount++;
541 netdev_dev = xzalloc(sizeof *netdev_dev);
542 netdev_dev->change_seq = 1;
543 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
545 *netdev_devp = &netdev_dev->netdev_dev;
549 /* For most types of netdevs we open the device for each call of
550 * netdev_open(). However, this is not the case with tap devices,
551 * since it is only possible to open the device once. In this
552 * situation we share a single file descriptor, and consequently
553 * buffers, across all readers. Therefore once data is read it will
554 * be unavailable to other reads for tap devices. */
556 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
557 const char *name, struct netdev_dev **netdev_devp)
559 struct netdev_dev_linux *netdev_dev;
560 struct tap_state *state;
561 static const char tap_dev[] = "/dev/net/tun";
565 netdev_dev = xzalloc(sizeof *netdev_dev);
566 state = &netdev_dev->state.tap;
568 /* Open tap device. */
569 state->fd = open(tap_dev, O_RDWR);
572 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
576 /* Create tap device. */
577 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
578 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
579 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
580 VLOG_WARN("%s: creating tap device failed: %s", name,
586 /* Make non-blocking. */
587 error = set_nonblocking(state->fd);
592 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
593 *netdev_devp = &netdev_dev->netdev_dev;
602 destroy_tap(struct netdev_dev_linux *netdev_dev)
604 struct tap_state *state = &netdev_dev->state.tap;
606 if (state->fd >= 0) {
611 /* Destroys the netdev device 'netdev_dev_'. */
613 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
615 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
616 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
618 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
619 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
622 if (class == &netdev_linux_class || class == &netdev_internal_class) {
623 cache_notifier_refcount--;
625 if (!cache_notifier_refcount) {
626 assert(netdev_linux_cache_notifier);
627 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
628 netdev_linux_cache_notifier = NULL;
630 } else if (class == &netdev_tap_class) {
631 destroy_tap(netdev_dev);
640 netdev_linux_open(struct netdev_dev *netdev_dev_, struct netdev **netdevp)
642 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
643 struct netdev_linux *netdev;
644 enum netdev_flags flags;
647 /* Allocate network device. */
648 netdev = xzalloc(sizeof *netdev);
650 netdev_init(&netdev->netdev, netdev_dev_);
652 /* Verify that the device really exists, by attempting to read its flags.
653 * (The flags might be cached, in which case this won't actually do an
656 * Don't do this for "internal" netdevs, though, because those have to be
657 * created as netdev objects before they exist in the kernel, because
658 * creating them in the kernel happens by passing a netdev object to
659 * dpif_port_add(). */
660 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
661 error = netdev_get_flags(&netdev->netdev, &flags);
662 if (error == ENODEV) {
667 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
668 !netdev_dev->state.tap.opened) {
670 /* We assume that the first user of the tap device is the primary user
671 * and give them the tap FD. Subsequent users probably just expect
672 * this to be a system device so open it normally to avoid send/receive
673 * directions appearing to be reversed. */
674 netdev->fd = netdev_dev->state.tap.fd;
675 netdev_dev->state.tap.opened = true;
678 *netdevp = &netdev->netdev;
682 netdev_uninit(&netdev->netdev, true);
686 /* Closes and destroys 'netdev'. */
688 netdev_linux_close(struct netdev *netdev_)
690 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
692 if (netdev->fd >= 0 && strcmp(netdev_get_type(netdev_), "tap")) {
699 netdev_linux_listen(struct netdev *netdev_)
701 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
702 struct sockaddr_ll sll;
707 if (netdev->fd >= 0) {
711 /* Create file descriptor. */
712 fd = socket(PF_PACKET, SOCK_RAW, 0);
715 VLOG_ERR("failed to create raw socket (%s)", strerror(error));
719 /* Set non-blocking mode. */
720 error = set_nonblocking(fd);
725 /* Get ethernet device index. */
726 error = get_ifindex(&netdev->netdev, &ifindex);
731 /* Bind to specific ethernet device. */
732 memset(&sll, 0, sizeof sll);
733 sll.sll_family = AF_PACKET;
734 sll.sll_ifindex = ifindex;
735 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
736 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
738 VLOG_ERR("%s: failed to bind raw socket (%s)",
739 netdev_get_name(netdev_), strerror(error));
754 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
756 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
758 if (netdev->fd < 0) {
759 /* Device is not listening. */
764 ssize_t retval = read(netdev->fd, data, size);
767 } else if (errno != EINTR) {
768 if (errno != EAGAIN) {
769 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
770 netdev_get_name(netdev_), strerror(errno));
777 /* Registers with the poll loop to wake up from the next call to poll_block()
778 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
780 netdev_linux_recv_wait(struct netdev *netdev_)
782 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
783 if (netdev->fd >= 0) {
784 poll_fd_wait(netdev->fd, POLLIN);
788 /* Discards all packets waiting to be received from 'netdev'. */
790 netdev_linux_drain(struct netdev *netdev_)
792 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
793 if (netdev->fd < 0) {
795 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
797 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
798 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
802 drain_fd(netdev->fd, ifr.ifr_qlen);
805 return drain_rcvbuf(netdev->fd);
809 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
810 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
811 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
812 * the packet is too big or too small to transmit on the device.
814 * The caller retains ownership of 'buffer' in all cases.
816 * The kernel maintains a packet transmission queue, so the caller is not
817 * expected to do additional queuing of packets. */
819 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
821 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
825 if (netdev->fd < 0) {
826 /* Use our AF_PACKET socket to send to this device. */
827 struct sockaddr_ll sll;
834 sock = af_packet_sock();
839 error = get_ifindex(netdev_, &ifindex);
844 /* We don't bother setting most fields in sockaddr_ll because the
845 * kernel ignores them for SOCK_RAW. */
846 memset(&sll, 0, sizeof sll);
847 sll.sll_family = AF_PACKET;
848 sll.sll_ifindex = ifindex;
850 iov.iov_base = (void *) data;
854 msg.msg_namelen = sizeof sll;
857 msg.msg_control = NULL;
858 msg.msg_controllen = 0;
861 retval = sendmsg(sock, &msg, 0);
863 /* Use the netdev's own fd to send to this device. This is
864 * essential for tap devices, because packets sent to a tap device
865 * with an AF_PACKET socket will loop back to be *received* again
866 * on the tap device. */
867 retval = write(netdev->fd, data, size);
871 /* The Linux AF_PACKET implementation never blocks waiting for room
872 * for packets, instead returning ENOBUFS. Translate this into
873 * EAGAIN for the caller. */
874 if (errno == ENOBUFS) {
876 } else if (errno == EINTR) {
878 } else if (errno != EAGAIN) {
879 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
880 netdev_get_name(netdev_), strerror(errno));
883 } else if (retval != size) {
884 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
885 "%zu) on %s", retval, size, netdev_get_name(netdev_));
893 /* Registers with the poll loop to wake up from the next call to poll_block()
894 * when the packet transmission queue has sufficient room to transmit a packet
895 * with netdev_send().
897 * The kernel maintains a packet transmission queue, so the client is not
898 * expected to do additional queuing of packets. Thus, this function is
899 * unlikely to ever be used. It is included for completeness. */
901 netdev_linux_send_wait(struct netdev *netdev_)
903 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
904 if (netdev->fd < 0) {
906 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
907 poll_fd_wait(netdev->fd, POLLOUT);
909 /* TAP device always accepts packets.*/
910 poll_immediate_wake();
914 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
915 * otherwise a positive errno value. */
917 netdev_linux_set_etheraddr(struct netdev *netdev_,
918 const uint8_t mac[ETH_ADDR_LEN])
920 struct netdev_dev_linux *netdev_dev =
921 netdev_dev_linux_cast(netdev_get_dev(netdev_));
924 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
925 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
926 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
928 netdev_dev->cache_valid |= VALID_ETHERADDR;
929 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
937 /* Copies 'netdev''s MAC address into 'mac', a buffer of ETH_ADDR_LEN bytes
938 * supplied by the caller. */
940 netdev_linux_get_etheraddr(const struct netdev *netdev_,
941 uint8_t mac[ETH_ADDR_LEN])
943 struct netdev_dev_linux *netdev_dev =
944 netdev_dev_linux_cast(netdev_get_dev(netdev_));
945 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
946 int error = get_etheraddr(netdev_get_name(netdev_),
947 netdev_dev->etheraddr);
951 netdev_dev->cache_valid |= VALID_ETHERADDR;
953 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
957 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
958 * in bytes, not including the hardware header; thus, this is typically 1500
959 * bytes for Ethernet devices. */
961 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
963 struct netdev_dev_linux *netdev_dev =
964 netdev_dev_linux_cast(netdev_get_dev(netdev_));
965 if (!(netdev_dev->cache_valid & VALID_MTU)) {
969 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
970 SIOCGIFMTU, "SIOCGIFMTU");
974 netdev_dev->mtu = ifr.ifr_mtu;
975 netdev_dev->cache_valid |= VALID_MTU;
977 *mtup = netdev_dev->mtu;
981 /* Sets the maximum transmission unit (MTU) of the given device using the
982 * Linux networking ioctl interface.
985 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
987 struct netdev_dev_linux *netdev_dev =
988 netdev_dev_linux_cast(netdev_get_dev(netdev_));
993 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
994 SIOCSIFMTU, "SIOCSIFMTU");
999 netdev_dev->mtu = ifr.ifr_mtu;
1000 netdev_dev->cache_valid |= VALID_MTU;
1004 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1005 * On failure, returns a negative errno value. */
1007 netdev_linux_get_ifindex(const struct netdev *netdev)
1011 error = get_ifindex(netdev, &ifindex);
1012 return error ? -error : ifindex;
1016 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1018 struct netdev_dev_linux *netdev_dev =
1019 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1024 if (netdev_dev->miimon_interval > 0) {
1025 *carrier = netdev_dev->miimon;
1029 if (!(netdev_dev->cache_valid & VALID_CARRIER)) {
1033 fn = xasprintf("/sys/class/net/%s/carrier",
1034 netdev_get_name(netdev_));
1035 fd = open(fn, O_RDONLY);
1038 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
1042 retval = read(fd, line, sizeof line);
1045 if (error == EINVAL) {
1046 /* This is the normal return value when we try to check carrier
1047 * if the network device is not up. */
1049 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
1052 } else if (retval == 0) {
1054 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
1058 if (line[0] != '0' && line[0] != '1') {
1060 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)",
1064 netdev_dev->carrier = line[0] != '0';
1065 netdev_dev->cache_valid |= VALID_CARRIER;
1067 *carrier = netdev_dev->carrier;
1079 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1080 struct mii_ioctl_data *data)
1085 memset(&ifr, 0, sizeof ifr);
1086 memcpy(&ifr.ifr_data, data, sizeof *data);
1087 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1088 memcpy(data, &ifr.ifr_data, sizeof *data);
1094 netdev_linux_get_miimon(const char *name, bool *miimon)
1096 struct mii_ioctl_data data;
1101 memset(&data, 0, sizeof data);
1102 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1104 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1105 data.reg_num = MII_BMSR;
1106 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1110 *miimon = !!(data.val_out & BMSR_LSTATUS);
1112 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1115 struct ethtool_cmd ecmd;
1117 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1120 memset(&ecmd, 0, sizeof ecmd);
1121 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1124 struct ethtool_value eval;
1126 memcpy(&eval, &ecmd, sizeof eval);
1127 *miimon = !!eval.data;
1129 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1137 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1138 long long int interval)
1140 struct netdev_dev_linux *netdev_dev;
1142 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
1144 interval = interval > 0 ? MAX(interval, 100) : 0;
1145 if (netdev_dev->miimon_interval != interval) {
1146 netdev_dev->miimon_interval = interval;
1147 timer_set_expired(&netdev_dev->miimon_timer);
1154 netdev_linux_miimon_run(void)
1156 struct shash device_shash;
1157 struct shash_node *node;
1159 shash_init(&device_shash);
1160 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1161 SHASH_FOR_EACH (node, &device_shash) {
1162 struct netdev_dev_linux *dev = node->data;
1165 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1169 netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
1170 if (miimon != dev->miimon) {
1171 dev->miimon = miimon;
1172 netdev_dev_linux_changed(dev);
1175 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1178 shash_destroy(&device_shash);
1182 netdev_linux_miimon_wait(void)
1184 struct shash device_shash;
1185 struct shash_node *node;
1187 shash_init(&device_shash);
1188 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1189 SHASH_FOR_EACH (node, &device_shash) {
1190 struct netdev_dev_linux *dev = node->data;
1192 if (dev->miimon_interval > 0) {
1193 timer_wait(&dev->miimon_timer);
1196 shash_destroy(&device_shash);
1199 /* Check whether we can use RTM_GETLINK to get network device statistics.
1200 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1203 check_for_working_netlink_stats(void)
1205 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1206 * preferable, so if that works, we'll use it. */
1207 int ifindex = do_get_ifindex("lo");
1209 VLOG_WARN("failed to get ifindex for lo, "
1210 "obtaining netdev stats from proc");
1213 struct netdev_stats stats;
1214 int error = get_stats_via_netlink(ifindex, &stats);
1216 VLOG_DBG("obtaining netdev stats via rtnetlink");
1219 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1220 "via proc (you are probably running a pre-2.6.19 "
1221 "kernel)", strerror(error));
/* Exchanges the 64-bit values stored at 'a' and 'b'.
 *
 * 'a' and 'b' may point to the same object, in which case the value is left
 * unchanged. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t tmp = *a;

    *a = *b;
    *b = tmp;
}
/* Tries to fill 'stats' for 'netdev_' from the vport layer.
 *
 * The query is attempted when the device is already known to have vport
 * stats, or when that fact has not been determined yet (VALID_HAVE_VPORT_STATS
 * not set in 'cache_valid').  The outcome is then recorded:
 * 'have_vport_stats' becomes true only if netdev_vport_get_stats() returned
 * 0, and VALID_HAVE_VPORT_STATS is set.
 *
 * NOTE(review): the condition guarding the warning is not visible in this
 * extract; presumably it is logged only on failure. */
1236 get_stats_via_vport(const struct netdev *netdev_,
1237 struct netdev_stats *stats)
1239 struct netdev_dev_linux *netdev_dev =
1240 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1242 if (netdev_dev->have_vport_stats ||
1243 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1246 error = netdev_vport_get_stats(netdev_, stats);
1248 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed %d",
1249 netdev_get_name(netdev_), error);
1251 netdev_dev->have_vport_stats = !error;
1252 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
/* Fills 'stats' for 'netdev_' from the kernel, via rtnetlink or proc.
 *
 * The choice is made once: the function-local static 'use_netlink_stats'
 * starts at -1 and is set on first use from check_for_working_netlink_stats().
 * On the netlink path the device's ifindex is looked up and passed to
 * get_stats_via_netlink(); the other visible path reads the stats by device
 * name with get_stats_via_proc().  A rate-limited warning reports failures.
 *
 * NOTE(review): the static is initialised without any locking. */
1257 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1258 struct netdev_stats *stats)
1260 static int use_netlink_stats = -1;
1263 if (use_netlink_stats < 0) {
1264 use_netlink_stats = check_for_working_netlink_stats();
1267 if (use_netlink_stats) {
1270 error = get_ifindex(netdev_, &ifindex);
1272 error = get_stats_via_netlink(ifindex, stats);
1275 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1279 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1280 netdev_get_name(netdev_), error);
/* Combines vport-layer and kernel statistics for 'netdev_' into 'stats'.
 *
 * 'stats' is first offered to get_stats_via_vport(); the kernel's view is
 * read separately into 'dev_stats'.  Both visible branches test
 * 'have_vport_stats'.  When vport stats are unavailable the kernel stats are
 * used instead (per the existing comment below; the assignment itself is not
 * visible in this extract).  Otherwise only the error, drop, multicast and
 * collision counters listed below are added from the kernel on top of the
 * vport values. */
1286 /* Retrieves current device stats for 'netdev-linux'. */
1288 netdev_linux_get_stats(const struct netdev *netdev_,
1289 struct netdev_stats *stats)
1291 struct netdev_dev_linux *netdev_dev =
1292 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1293 struct netdev_stats dev_stats;
1296 get_stats_via_vport(netdev_, stats);
1298 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1301 if (!netdev_dev->have_vport_stats) {
1308 if (!netdev_dev->have_vport_stats) {
1309 /* stats not available from OVS then use ioctl stats. */
1312 stats->rx_errors += dev_stats.rx_errors;
1313 stats->tx_errors += dev_stats.tx_errors;
1314 stats->rx_dropped += dev_stats.rx_dropped;
1315 stats->tx_dropped += dev_stats.tx_dropped;
1316 stats->multicast += dev_stats.multicast;
1317 stats->collisions += dev_stats.collisions;
1318 stats->rx_length_errors += dev_stats.rx_length_errors;
1319 stats->rx_over_errors += dev_stats.rx_over_errors;
1320 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1321 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1322 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1323 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1324 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1325 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1326 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1327 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1328 stats->tx_window_errors += dev_stats.tx_window_errors;
1333 /* Retrieves current device stats for 'netdev-tap' netdev or
1334 * netdev-internal. */
1336 netdev_pseudo_get_stats(const struct netdev *netdev_,
1337 struct netdev_stats *stats)
1339 struct netdev_dev_linux *netdev_dev =
1340 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1341 struct netdev_stats dev_stats;
1344 get_stats_via_vport(netdev_, stats);
1346 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1348 if (!netdev_dev->have_vport_stats) {
1355 /* If this port is an internal port then the transmit and receive stats
1356 * will appear to be swapped relative to the other ports since we are the
1357 * one sending the data, not a remote computer. For consistency, we swap
1358 * them back here. This does not apply if we are getting stats from the
1359 * vport layer because it always tracks stats from the perspective of the
1361 if (!netdev_dev->have_vport_stats) {
1363 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1364 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1365 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1366 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1367 stats->rx_length_errors = 0;
1368 stats->rx_over_errors = 0;
1369 stats->rx_crc_errors = 0;
1370 stats->rx_frame_errors = 0;
1371 stats->rx_fifo_errors = 0;
1372 stats->rx_missed_errors = 0;
1373 stats->tx_aborted_errors = 0;
1374 stats->tx_carrier_errors = 0;
1375 stats->tx_fifo_errors = 0;
1376 stats->tx_heartbeat_errors = 0;
1377 stats->tx_window_errors = 0;
1379 stats->rx_dropped += dev_stats.tx_dropped;
1380 stats->tx_dropped += dev_stats.rx_dropped;
1382 stats->rx_errors += dev_stats.tx_errors;
1383 stats->tx_errors += dev_stats.rx_errors;
1385 stats->multicast += dev_stats.multicast;
1386 stats->collisions += dev_stats.collisions;
/* Queries ethtool (ETHTOOL_GSET) for 'netdev' and translates the result into
 * OpenFlow port-feature bits:
 *
 *   - 'ecmd.supported' SUPPORTED_* bits are OR-ed into '*supported';
 *   - 'ecmd.advertising' ADVERTISED_* bits are OR-ed into '*advertised';
 *   - 'ecmd.speed' / 'ecmd.duplex' select the speed bit assigned to
 *     '*current', and 'ecmd.port' adds the copper/fibre medium bit;
 *   - '*peer' is always set to 0.
 *
 * NOTE(review): the visible code stores through 'peer' and ORs into the other
 * outputs without any null check, so the "that are non-null" wording below
 * should be confirmed against the caller.  The guards around the initial
 * error return, the unknown-speed fallback and the OFPPF_AUTONEG line are not
 * visible in this extract. */
1391 /* Stores the features supported by 'netdev' into each of '*current',
1392 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1393 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
1394 * successful, otherwise a positive errno value. */
1396 netdev_linux_get_features(const struct netdev *netdev,
1397 uint32_t *current, uint32_t *advertised,
1398 uint32_t *supported, uint32_t *peer)
1400 struct ethtool_cmd ecmd;
1403 memset(&ecmd, 0, sizeof ecmd);
1404 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1405 ETHTOOL_GSET, "ETHTOOL_GSET");
1410 /* Supported features. */
1412 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1413 *supported |= OFPPF_10MB_HD;
1415 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1416 *supported |= OFPPF_10MB_FD;
1418 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1419 *supported |= OFPPF_100MB_HD;
1421 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1422 *supported |= OFPPF_100MB_FD;
1424 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1425 *supported |= OFPPF_1GB_HD;
1427 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1428 *supported |= OFPPF_1GB_FD;
1430 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1431 *supported |= OFPPF_10GB_FD;
1433 if (ecmd.supported & SUPPORTED_TP) {
1434 *supported |= OFPPF_COPPER;
1436 if (ecmd.supported & SUPPORTED_FIBRE) {
1437 *supported |= OFPPF_FIBER;
1439 if (ecmd.supported & SUPPORTED_Autoneg) {
1440 *supported |= OFPPF_AUTONEG;
1442 if (ecmd.supported & SUPPORTED_Pause) {
1443 *supported |= OFPPF_PAUSE;
1445 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1446 *supported |= OFPPF_PAUSE_ASYM;
1449 /* Advertised features. */
1451 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1452 *advertised |= OFPPF_10MB_HD;
1454 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1455 *advertised |= OFPPF_10MB_FD;
1457 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1458 *advertised |= OFPPF_100MB_HD;
1460 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1461 *advertised |= OFPPF_100MB_FD;
1463 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1464 *advertised |= OFPPF_1GB_HD;
1466 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1467 *advertised |= OFPPF_1GB_FD;
1469 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1470 *advertised |= OFPPF_10GB_FD;
1472 if (ecmd.advertising & ADVERTISED_TP) {
1473 *advertised |= OFPPF_COPPER;
1475 if (ecmd.advertising & ADVERTISED_FIBRE) {
1476 *advertised |= OFPPF_FIBER;
1478 if (ecmd.advertising & ADVERTISED_Autoneg) {
1479 *advertised |= OFPPF_AUTONEG;
1481 if (ecmd.advertising & ADVERTISED_Pause) {
1482 *advertised |= OFPPF_PAUSE;
1484 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1485 *advertised |= OFPPF_PAUSE_ASYM;
1488 /* Current settings. */
1489 if (ecmd.speed == SPEED_10) {
1490 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1491 } else if (ecmd.speed == SPEED_100) {
1492 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1493 } else if (ecmd.speed == SPEED_1000) {
1494 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1495 } else if (ecmd.speed == SPEED_10000) {
1496 *current = OFPPF_10GB_FD;
1501 if (ecmd.port == PORT_TP) {
1502 *current |= OFPPF_COPPER;
1503 } else if (ecmd.port == PORT_FIBRE) {
1504 *current |= OFPPF_FIBER;
1508 *current |= OFPPF_AUTONEG;
1511 /* Peer advertisements. */
1512 *peer = 0; /* XXX */
/* Replaces the link modes advertised by 'netdev' with those in 'advertise'.
 *
 * The current ethtool settings are read with ETHTOOL_GSET so that every field
 * other than 'advertising' is preserved.  'ecmd.advertising' is then rebuilt
 * from scratch by mapping each OFPPF_* bit in 'advertise' to its ADVERTISED_*
 * counterpart, and the result is written back with ETHTOOL_SSET.  The return
 * value is that of the final netdev_linux_do_ethtool() call. */
1517 /* Set the features advertised by 'netdev' to 'advertise'. */
1519 netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1521 struct ethtool_cmd ecmd;
1524 memset(&ecmd, 0, sizeof ecmd);
1525 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1526 ETHTOOL_GSET, "ETHTOOL_GSET");
1531 ecmd.advertising = 0;
1532 if (advertise & OFPPF_10MB_HD) {
1533 ecmd.advertising |= ADVERTISED_10baseT_Half;
1535 if (advertise & OFPPF_10MB_FD) {
1536 ecmd.advertising |= ADVERTISED_10baseT_Full;
1538 if (advertise & OFPPF_100MB_HD) {
1539 ecmd.advertising |= ADVERTISED_100baseT_Half;
1541 if (advertise & OFPPF_100MB_FD) {
1542 ecmd.advertising |= ADVERTISED_100baseT_Full;
1544 if (advertise & OFPPF_1GB_HD) {
1545 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1547 if (advertise & OFPPF_1GB_FD) {
1548 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1550 if (advertise & OFPPF_10GB_FD) {
1551 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1553 if (advertise & OFPPF_COPPER) {
1554 ecmd.advertising |= ADVERTISED_TP;
1556 if (advertise & OFPPF_FIBER) {
1557 ecmd.advertising |= ADVERTISED_FIBRE;
1559 if (advertise & OFPPF_AUTONEG) {
1560 ecmd.advertising |= ADVERTISED_Autoneg;
1562 if (advertise & OFPPF_PAUSE) {
1563 ecmd.advertising |= ADVERTISED_Pause;
1565 if (advertise & OFPPF_PAUSE_ASYM) {
1566 ecmd.advertising |= ADVERTISED_Asym_Pause;
1568 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1569 ETHTOOL_SSET, "ETHTOOL_SSET");
1572 /* If 'netdev_name' is the name of a VLAN network device (e.g. one created with
1573 * vconfig(8)), sets '*vlan_vid' to the VLAN VID associated with that device
1574 * and returns 0. Otherwise returns a errno value (specifically ENOENT if
1575 * 'netdev_name' is the name of a network device that is not a VLAN device) and
1576 * sets '*vlan_vid' to -1. */
1578 netdev_linux_get_vlan_vid(const struct netdev *netdev, int *vlan_vid)
1580 const char *netdev_name = netdev_get_name(netdev);
1581 struct ds line = DS_EMPTY_INITIALIZER;
1582 FILE *stream = NULL;
1586 COVERAGE_INC(netdev_get_vlan_vid);
1587 fn = xasprintf("/proc/net/vlan/%s", netdev_name);
1588 stream = fopen(fn, "r");
1594 if (ds_get_line(&line, stream)) {
1595 if (ferror(stream)) {
1597 VLOG_ERR_RL(&rl, "error reading \"%s\": %s", fn, strerror(errno));
1600 VLOG_ERR_RL(&rl, "unexpected end of file reading \"%s\"", fn);
1605 if (!sscanf(ds_cstr(&line), "%*s VID: %d", vlan_vid)) {
1607 VLOG_ERR_RL(&rl, "parse error reading \"%s\" line 1: \"%s\"",
1608 fn, ds_cstr(&line));
/* printf-style shell command templates that netdev_linux_set_policing() formats
 * with snprintf() and runs with system().
 *
 * POLICE_ADD_CMD takes the device name (%s) and adds an ingress qdisc with
 * handle ffff:.  POLICE_CONFIG_CMD takes the device name (%s), the rate (%d,
 * "kbit") and the burst (%d, "k"), and attaches a u32 filter with a police
 * action to that qdisc.
 *
 * NOTE(review): the caller passes uint32_t values for the two %d conversions;
 * confirm the intended range or switch the conversions to an unsigned form. */
1626 #define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress"
1627 #define POLICE_CONFIG_CMD "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 u32 match ip src 0.0.0.0/0 police rate %dkbit burst %dk mtu 65535 drop flowid :1"
1629 /* Remove ingress policing from 'netdev'. Returns 0 if successful, otherwise a
1630 * positive errno value.
1632 * This function is equivalent to running
1633 * /sbin/tc qdisc del dev %s handle ffff: ingress
1634 * but it is much, much faster.
1637 netdev_linux_remove_policing(struct netdev *netdev)
1639 struct netdev_dev_linux *netdev_dev =
1640 netdev_dev_linux_cast(netdev_get_dev(netdev));
1641 const char *netdev_name = netdev_get_name(netdev);
1643 struct ofpbuf request;
1644 struct tcmsg *tcmsg;
1647 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
1651 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
1652 tcmsg->tcm_parent = TC_H_INGRESS;
1653 nl_msg_put_string(&request, TCA_KIND, "ingress");
1654 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
1656 error = tc_transact(&request, NULL);
1657 if (error && error != ENOENT && error != EINVAL) {
1658 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1659 netdev_name, strerror(error));
1663 netdev_dev->kbits_rate = 0;
1664 netdev_dev->kbits_burst = 0;
1665 netdev_dev->cache_valid |= VALID_POLICING;
/* Configures ingress policing on 'netdev' at 'kbits_rate' with 'kbits_burst'.
 *
 * 'kbits_burst' is normalised first: forced to 0 when no rate is given,
 * defaulted to 1000 when a rate is given without a burst.  If the cached
 * settings are valid and already equal to the request, nothing is
 * reconfigured.  Otherwise any existing policing is removed, the two
 * POLICE_* command templates are formatted into 'command' and run through
 * system(), and the new settings are cached.
 *
 * NOTE(review): the device name is interpolated into a shell command line,
 * snprintf() truncation is not checked, and uint32_t values are formatted
 * with %d -- all worth confirming.  The values returned on the failure paths
 * are not visible in this extract. */
1669 /* Attempts to set input rate limiting (policing) policy. */
1671 netdev_linux_set_policing(struct netdev *netdev,
1672 uint32_t kbits_rate, uint32_t kbits_burst)
1674 struct netdev_dev_linux *netdev_dev =
1675 netdev_dev_linux_cast(netdev_get_dev(netdev));
1676 const char *netdev_name = netdev_get_name(netdev);
1679 COVERAGE_INC(netdev_set_policing);
1681 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1682 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1683 : kbits_burst); /* Stick with user-specified value. */
1685 if (netdev_dev->cache_valid & VALID_POLICING
1686 && netdev_dev->kbits_rate == kbits_rate
1687 && netdev_dev->kbits_burst == kbits_burst) {
1688 /* Assume that settings haven't changed since we last set them. */
1692 netdev_linux_remove_policing(netdev);
1694 snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
1695 if (system(command) != 0) {
1696 VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
1700 snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
1701 kbits_rate, kbits_burst);
1702 if (system(command) != 0) {
1703 VLOG_WARN_RL(&rl, "%s: problem configuring policing",
1708 netdev_dev->kbits_rate = kbits_rate;
1709 netdev_dev->kbits_burst = kbits_burst;
1710 netdev_dev->cache_valid |= VALID_POLICING;
/* Adds to 'types' the OVS name of every traffic-control implementation in the
 * NULL-terminated 'tcs' table that can be installed (has a 'tc_install'
 * callback) and has a non-empty 'ovs_name'.  'netdev' is unused. */
1717 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1720 const struct tc_ops **opsp;
1722 for (opsp = tcs; *opsp != NULL; opsp++) {
1723 const struct tc_ops *ops = *opsp;
1724 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1725 sset_add(types, ops->ovs_name);
/* Scans the NULL-terminated 'tcs' table for the entry whose 'ovs_name' equals
 * 'name' (exact strcmp() match).
 *
 * NOTE(review): the return statements are not visible in this extract;
 * callers such as netdev_linux_set_qos() test the result against NULL. */
1731 static const struct tc_ops *
1732 tc_lookup_ovs_name(const char *name)
1734 const struct tc_ops **opsp;
1736 for (opsp = tcs; *opsp != NULL; opsp++) {
1737 const struct tc_ops *ops = *opsp;
1738 if (!strcmp(name, ops->ovs_name)) {
/* Scans the NULL-terminated 'tcs' table for the entry whose 'linux_name'
 * equals 'name'.  Entries with a null 'linux_name' are skipped, unlike the
 * 'ovs_name' lookup which compares unconditionally.
 *
 * NOTE(review): the return statements are not visible in this extract. */
1745 static const struct tc_ops *
1746 tc_lookup_linux_name(const char *name)
1748 const struct tc_ops **opsp;
1750 for (opsp = tcs; *opsp != NULL; opsp++) {
1751 const struct tc_ops *ops = *opsp;
1752 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Searches the queue map of 'netdev''s current traffic-control state for the
 * queue with 'queue_id', looking only in the bucket selected by 'hash' (the
 * third parameter, whose declaration line is not visible in this extract).
 *
 * 'netdev_dev->tc' is dereferenced without a null check here, so the caller
 * must already have established it (the visible callers run
 * tc_query_qdisc() first). */
1759 static struct tc_queue *
1760 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1763 struct netdev_dev_linux *netdev_dev =
1764 netdev_dev_linux_cast(netdev_get_dev(netdev));
1765 struct tc_queue *queue;
1767 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1768 if (queue->queue_id == queue_id) {
/* Convenience wrapper around tc_find_queue__() that derives the bucket hash
 * from 'queue_id' with hash_int(queue_id, 0). */
1775 static struct tc_queue *
1776 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1778 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Reports the capabilities of QoS type 'type' in '*caps'.  The traffic-control
 * implementation is looked up by its OVS name and its 'n_queues' is copied
 * into 'caps->n_queues'.  'netdev' is unused.
 *
 * NOTE(review): the handling of an unknown 'type' (null 'ops') is not visible
 * in this extract. */
1782 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1784 struct netdev_qos_capabilities *caps)
1786 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1790 caps->n_queues = ops->n_queues;
/* Reports the QoS configuration currently installed on 'netdev'.
 *
 * After tc_query_qdisc() has refreshed the device's traffic-control state,
 * '*typep' is pointed at the OVS name of the active implementation.  If that
 * implementation provides 'qdisc_get', it is asked to fill 'details' and its
 * result is returned.  The alternative value of the conditional is not
 * visible in this extract. */
1795 netdev_linux_get_qos(const struct netdev *netdev,
1796 const char **typep, struct shash *details)
1798 struct netdev_dev_linux *netdev_dev =
1799 netdev_dev_linux_cast(netdev_get_dev(netdev));
1802 error = tc_query_qdisc(netdev);
1807 *typep = netdev_dev->tc->ops->ovs_name;
1808 return (netdev_dev->tc->ops->qdisc_get
1809 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
/* Installs or reconfigures QoS of the given 'type' on 'netdev' using
 * 'details'.
 *
 * 'type' must name a traffic-control implementation that has a 'tc_install'
 * callback.  If the requested implementation is the one already active, only
 * its 'qdisc_set' hook is invoked (or 0 is returned when it has none).
 * Otherwise the existing qdisc is deleted and the new one installed.  The two
 * assertions document the expected invariants: deleting leaves
 * 'netdev_dev->tc' null, and installation sets it exactly when it succeeds.
 *
 * NOTE(review): the error returns for the rejected-type and failed-step
 * paths are not visible in this extract. */
1814 netdev_linux_set_qos(struct netdev *netdev,
1815 const char *type, const struct shash *details)
1817 struct netdev_dev_linux *netdev_dev =
1818 netdev_dev_linux_cast(netdev_get_dev(netdev));
1819 const struct tc_ops *new_ops;
1822 new_ops = tc_lookup_ovs_name(type);
1823 if (!new_ops || !new_ops->tc_install) {
1827 error = tc_query_qdisc(netdev);
1832 if (new_ops == netdev_dev->tc->ops) {
1833 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1835 /* Delete existing qdisc. */
1836 error = tc_del_qdisc(netdev);
1840 assert(netdev_dev->tc == NULL);
1842 /* Install new qdisc. */
1843 error = new_ops->tc_install(netdev, details);
1844 assert((error == 0) == (netdev_dev->tc != NULL));
/* Retrieves the configuration of queue 'queue_id' on 'netdev' into 'details'.
 *
 * The device's traffic-control state is refreshed with tc_query_qdisc(), the
 * queue is looked up with tc_find_queue(), and the active implementation's
 * 'class_get' callback is used to fill 'details'.
 *
 * NOTE(review): the condition of the final conditional expression and its
 * fallback value are not visible in this extract. */
1851 netdev_linux_get_queue(const struct netdev *netdev,
1852 unsigned int queue_id, struct shash *details)
1854 struct netdev_dev_linux *netdev_dev =
1855 netdev_dev_linux_cast(netdev_get_dev(netdev));
1858 error = tc_query_qdisc(netdev);
1862 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1864 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
/* Configures queue 'queue_id' on 'netdev' from 'details'.
 *
 * After refreshing the traffic-control state, the request is rejected when
 * 'queue_id' is not below the active implementation's 'n_queues' or when that
 * implementation has no 'class_set' callback.  Otherwise 'class_set' is
 * called and its result returned.
 *
 * NOTE(review): the error values returned on the rejected paths are not
 * visible in this extract. */
1870 netdev_linux_set_queue(struct netdev *netdev,
1871 unsigned int queue_id, const struct shash *details)
1873 struct netdev_dev_linux *netdev_dev =
1874 netdev_dev_linux_cast(netdev_get_dev(netdev));
1877 error = tc_query_qdisc(netdev);
1880 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1881 || !netdev_dev->tc->ops->class_set) {
1885 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
/* Deletes queue 'queue_id' from 'netdev'.
 *
 * After refreshing the traffic-control state, the request is rejected when
 * the active implementation has no 'class_delete' callback.  Otherwise the
 * queue is looked up with tc_find_queue() and handed to 'class_delete'.
 *
 * NOTE(review): the condition of the final conditional expression and the
 * values returned on the error paths are not visible in this extract. */
1889 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1891 struct netdev_dev_linux *netdev_dev =
1892 netdev_dev_linux_cast(netdev_get_dev(netdev));
1895 error = tc_query_qdisc(netdev);
1898 } else if (!netdev_dev->tc->ops->class_delete) {
1901 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1903 ? netdev_dev->tc->ops->class_delete(netdev, queue)
/* Retrieves statistics for queue 'queue_id' on 'netdev' into '*stats'.
 *
 * After refreshing the traffic-control state, the request is rejected when
 * the active implementation has no 'class_get_stats' callback.  Otherwise the
 * queue is looked up with tc_find_queue() and handed to 'class_get_stats'.
 *
 * NOTE(review): the condition of the final conditional expression and the
 * values returned on the error paths are not visible in this extract. */
1909 netdev_linux_get_queue_stats(const struct netdev *netdev,
1910 unsigned int queue_id,
1911 struct netdev_queue_stats *stats)
1913 struct netdev_dev_linux *netdev_dev =
1914 netdev_dev_linux_cast(netdev_get_dev(netdev));
1917 error = tc_query_qdisc(netdev);
1920 } else if (!netdev_dev->tc->ops->class_get_stats) {
1923 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1925 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
/* Begins a Netlink dump of the traffic classes configured on 'netdev'.
 *
 * Builds an RTM_GETTCLASS request with 'tcm_parent' set to 0, starts the dump
 * on 'rtnl_sock' into 'dump', and releases the request buffer.
 *
 * NOTE(review): the return statements are not visible in this extract; the
 * visible caller treats a false result as failure. */
1931 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1933 struct ofpbuf request;
1934 struct tcmsg *tcmsg;
1936 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1940 tcmsg->tcm_parent = 0;
1941 nl_dump_start(dump, rtnl_sock, &request);
1942 ofpbuf_uninit(&request);
/* Invokes 'cb' once for each queue configured on 'netdev'.
 *
 * After refreshing the traffic-control state, the request is rejected when
 * the active implementation has no 'class_get' callback.  Otherwise every
 * queue in the cached queue map is visited: 'details' is cleared, filled by
 * 'class_get', and passed to 'cb' together with the queue id and 'aux'.
 * A single 'details' shash is reused across iterations and destroyed at the
 * end.
 *
 * NOTE(review): how a 'class_get' failure affects the callback and the final
 * return value is not visible in this extract. */
1947 netdev_linux_dump_queues(const struct netdev *netdev,
1948 netdev_dump_queues_cb *cb, void *aux)
1950 struct netdev_dev_linux *netdev_dev =
1951 netdev_dev_linux_cast(netdev_get_dev(netdev));
1952 struct tc_queue *queue;
1953 struct shash details;
1957 error = tc_query_qdisc(netdev);
1960 } else if (!netdev_dev->tc->ops->class_get) {
1965 shash_init(&details);
1966 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
1967 shash_clear(&details);
1969 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
1971 (*cb)(queue->queue_id, &details, aux);
1976 shash_destroy(&details);
/* Invokes 'cb' with the statistics of each queue on 'netdev', using a Netlink
 * class dump rather than the cached queue map.
 *
 * After refreshing the traffic-control state, the request is rejected when
 * the active implementation has no 'class_dump_stats' callback.  Each message
 * produced by the dump is handed to 'class_dump_stats' along with 'cb' and
 * 'aux'.  The result is the error from nl_dump_done() if there is one,
 * otherwise 'last_error'.
 *
 * NOTE(review): the code that maintains 'last_error' is not visible in this
 * extract; presumably it records per-message failures. */
1982 netdev_linux_dump_queue_stats(const struct netdev *netdev,
1983 netdev_dump_queue_stats_cb *cb, void *aux)
1985 struct netdev_dev_linux *netdev_dev =
1986 netdev_dev_linux_cast(netdev_get_dev(netdev));
1987 struct nl_dump dump;
1992 error = tc_query_qdisc(netdev);
1995 } else if (!netdev_dev->tc->ops->class_dump_stats) {
2000 if (!start_queue_dump(netdev, &dump)) {
2003 while (nl_dump_next(&dump, &msg)) {
2004 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
2010 error = nl_dump_done(&dump);
2011 return error ? error : last_error;
/* Stores the IPv4 address and netmask of 'netdev_' in '*address' and
 * '*netmask'.
 *
 * The values are cached on the device.  When VALID_IN4 is not set they are
 * fetched with the SIOCGIFADDR and SIOCGIFNETMASK requests and the cache is
 * then marked valid.  Returns EADDRNOTAVAIL when the resulting address is
 * INADDR_ANY, otherwise 0.
 *
 * NOTE(review): the handling of a failed fetch is not visible in this
 * extract. */
2015 netdev_linux_get_in4(const struct netdev *netdev_,
2016 struct in_addr *address, struct in_addr *netmask)
2018 struct netdev_dev_linux *netdev_dev =
2019 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2021 if (!(netdev_dev->cache_valid & VALID_IN4)) {
2024 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
2025 SIOCGIFADDR, "SIOCGIFADDR");
2030 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
2031 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2036 netdev_dev->cache_valid |= VALID_IN4;
2038 *address = netdev_dev->address;
2039 *netmask = netdev_dev->netmask;
2040 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
/* Assigns IPv4 'address' and 'netmask' to 'netdev_'.
 *
 * The address is set first with SIOCSIFADDR.  The cache (VALID_IN4, address
 * and netmask) is then updated, and the netmask is written with
 * SIOCSIFNETMASK only when 'address' is not INADDR_ANY.
 *
 * NOTE(review): the cached netmask is updated before the SIOCSIFNETMASK call,
 * so it stays cached even if that call fails.  The guard around the cache
 * update is not visible in this extract (presumably success of the first
 * call). */
2044 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2045 struct in_addr netmask)
2047 struct netdev_dev_linux *netdev_dev =
2048 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2051 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2053 netdev_dev->cache_valid |= VALID_IN4;
2054 netdev_dev->address = address;
2055 netdev_dev->netmask = netmask;
2056 if (address.s_addr != INADDR_ANY) {
2057 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2058 "SIOCSIFNETMASK", netmask);
/* Parses one line in the format of /proc/net/if_inet6.
 *
 * The line starts with 32 hexadecimal digits (the address, two digits per
 * byte), followed by four hexadecimal fields that are skipped, followed by
 * the interface name.  On success the 16 address bytes are stored in '*in6'
 * and the name (at most 16 characters plus terminator) in 'ifname', and true
 * is returned.  Returns false if the line does not yield all 17 fields, in
 * which case '*in6' and 'ifname' may have been partially written. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *s6 = in6->s6_addr;
    int n_fields;

    /* One two-digit hexadecimal byte.  The macro is local to this function
     * and is undefined again below so it cannot leak into later code. */
#define X8 "%2"SCNx8
    n_fields = sscanf(line,
                      " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                      "%*x %*x %*x %*x %16s\n",
                      &s6[0], &s6[1], &s6[2], &s6[3],
                      &s6[4], &s6[5], &s6[6], &s6[7],
                      &s6[8], &s6[9], &s6[10], &s6[11],
                      &s6[12], &s6[13], &s6[14], &s6[15],
                      ifname);
#undef X8

    /* 16 address bytes plus the interface name. */
    return n_fields == 17;
}
2080 /* Stores the IPv6 address cached for 'netdev_' in '*in6'.
2081 *
 * When VALID_IN6 is not set in the device's cache, the cached address is
 * first reset to in6addr_any and /proc/net/if_inet6 is scanned line by line.
 * The first line that parses and whose interface name equals the device's
 * name supplies the address.  The cache is then marked valid, so a device
 * without an IPv6 address keeps in6addr_any.
 *
 * NOTE(review): '*in6' is written unconditionally (there is no null check),
 * and the return statement is not visible in this extract.  An earlier
 * version of this comment described a true/false result and a nullable
 * 'in6'; confirm against the full source. */
2083 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2085 struct netdev_dev_linux *netdev_dev =
2086 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2087 if (!(netdev_dev->cache_valid & VALID_IN6)) {
2091 netdev_dev->in6 = in6addr_any;
2093 file = fopen("/proc/net/if_inet6", "r");
2095 const char *name = netdev_get_name(netdev_);
2096 while (fgets(line, sizeof line, file)) {
2097 struct in6_addr in6_tmp;
2098 char ifname[16 + 1];
2099 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2100 && !strcmp(name, ifname))
2102 netdev_dev->in6 = in6_tmp;
2108 netdev_dev->cache_valid |= VALID_IN6;
2110 *in6 = netdev_dev->in6;
/* Initializes '*sa' as an AF_INET socket address holding 'addr' with port 0.
 *
 * The address is assembled in a zeroed 'struct sockaddr_in' and then copied
 * over a zeroed '*sa', so every byte of '*sa' not covered by the IPv4 fields
 * is 0. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in sin;

    memset(&sin, 0, sizeof sin);
    sin.sin_family = AF_INET;
    sin.sin_addr = addr;
    sin.sin_port = 0;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &sin, sizeof sin);
}
/* Issues address-setting ioctl 'ioctl_nr' on 'netdev' with IPv4 address
 * 'addr'.
 *
 * The request structure 'ifr' carries the device name (copied with
 * ovs_strzcpy(), bounded by the size of 'ifr_name') and 'addr' wrapped as an
 * AF_INET sockaddr.  The result of netdev_linux_do_ioctl() is returned.
 * 'ioctl_name' is the request's printable name (its use in the final call is
 * on a line not visible in this extract). */
2128 do_set_addr(struct netdev *netdev,
2129 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2132 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2133 make_in4_sockaddr(&ifr.ifr_addr, addr);
2135 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
/* Installs a default route through 'router' with the SIOCADDRT ioctl on
 * 'af_inet_sock'.
 *
 * The route entry uses destination 0.0.0.0 with netmask 0.0.0.0, gateway
 * 'router', and flags RTF_UP | RTF_GATEWAY.  A failed ioctl is converted to
 * its errno value and logged.  'netdev' is unused. */
2139 /* Adds 'router' as a default IP gateway. */
2141 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2143 struct in_addr any = { INADDR_ANY };
2147 memset(&rt, 0, sizeof rt);
2148 make_in4_sockaddr(&rt.rt_dst, any);
2149 make_in4_sockaddr(&rt.rt_gateway, router);
2150 make_in4_sockaddr(&rt.rt_genmask, any);
2151 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2152 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2154 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
2160 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2163 static const char fn[] = "/proc/net/route";
2168 *netdev_name = NULL;
2169 stream = fopen(fn, "r");
2170 if (stream == NULL) {
2171 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2176 while (fgets(line, sizeof line, stream)) {
2179 ovs_be32 dest, gateway, mask;
2180 int refcnt, metric, mtu;
2181 unsigned int flags, use, window, irtt;
2184 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2186 iface, &dest, &gateway, &flags, &refcnt,
2187 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2189 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2193 if (!(flags & RTF_UP)) {
2194 /* Skip routes that aren't up. */
2198 /* The output of 'dest', 'mask', and 'gateway' were given in
2199 * network byte order, so we don't need need any endian
2200 * conversions here. */
2201 if ((dest & mask) == (host->s_addr & mask)) {
2203 /* The host is directly reachable. */
2204 next_hop->s_addr = 0;
2206 /* To reach the host, we must go through a gateway. */
2207 next_hop->s_addr = gateway;
2209 *netdev_name = xstrdup(iface);
/* Adds driver information about 'netdev' to 'sh'.
 *
 * The information comes from an ethtool driver-info query: the zeroed
 * 'drvinfo' is passed to netdev_linux_do_ethtool() through a cast to
 * 'struct ethtool_cmd *'.  The keys "driver_name", "driver_version" and
 * "firmware_version" are added with xstrdup()'d copies of the corresponding
 * strings.
 *
 * NOTE(review): the guard around the shash_add() calls is not visible in this
 * extract (presumably success of the query). */
2221 netdev_linux_get_status(const struct netdev *netdev, struct shash *sh)
2223 struct ethtool_drvinfo drvinfo;
2226 memset(&drvinfo, 0, sizeof drvinfo);
2227 error = netdev_linux_do_ethtool(netdev_get_name(netdev),
2228 (struct ethtool_cmd *)&drvinfo,
2230 "ETHTOOL_GDRVINFO");
2232 shash_add(sh, "driver_name", xstrdup(drvinfo.driver));
2233 shash_add(sh, "driver_version", xstrdup(drvinfo.version));
2234 shash_add(sh, "firmware_version", xstrdup(drvinfo.fw_version));
2240 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2241 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2242 * returns 0. Otherwise, it returns a positive errno value; in particular,
2243 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'.
 *
 * The lookup is a SIOCGARP ioctl on 'af_inet_sock'.  The request carries 'ip'
 * as an AF_INET protocol address, ARPHRD_ETHER as the hardware address
 * family, and the device name.  ENXIO is not logged; any other failure
 * produces a rate-limited warning. */
2245 netdev_linux_arp_lookup(const struct netdev *netdev,
2246 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2249 struct sockaddr_in sin;
2252 memset(&r, 0, sizeof r);
2253 memset(&sin, 0, sizeof sin);
2254 sin.sin_family = AF_INET;
2255 sin.sin_addr.s_addr = ip;
2257 memcpy(&r.arp_pa, &sin, sizeof sin);
2258 r.arp_ha.sa_family = ARPHRD_ETHER;
2260 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2261 COVERAGE_INC(netdev_arp_lookup);
2262 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2264 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2265 } else if (retval != ENXIO) {
2266 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2267 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
/* Translates the NETDEV_UP and NETDEV_PROMISC bits of 'nd' into kernel
 * interface flags.
 *
 * NOTE(review): the statements that set the result bits are not visible in
 * this extract; presumably they are IFF_UP and IFF_PROMISC, mirroring
 * iff_to_nd_flags(). */
2273 nd_to_iff_flags(enum netdev_flags nd)
2276 if (nd & NETDEV_UP) {
2279 if (nd & NETDEV_PROMISC) {
/* Translates kernel interface flags 'iff' into 'enum netdev_flags' bits,
 * starting from 0.  IFF_PROMISC maps to NETDEV_PROMISC.
 *
 * NOTE(review): other mappings (presumably IFF_UP to NETDEV_UP) are on lines
 * not visible in this extract. */
2286 iff_to_nd_flags(int iff)
2288 enum netdev_flags nd = 0;
2292 if (iff & IFF_PROMISC) {
2293 nd |= NETDEV_PROMISC;
/* Clears the flags in 'off' and sets the flags in 'on' on 'netdev'.
 *
 * The current kernel flags are read with get_flags() and reported through
 * '*old_flagsp' in netdev form.  The new value is computed by clearing the
 * 'off' bits before setting the 'on' bits, so a flag present in both ends up
 * set.  set_flags() is called only when the value actually changes.
 *
 * NOTE(review): the guard following get_flags() is not visible in this
 * extract (presumably the remaining work is skipped on error). */
2299 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2300 enum netdev_flags on, enum netdev_flags *old_flagsp)
2302 int old_flags, new_flags;
2305 error = get_flags(netdev, &old_flags);
2307 *old_flagsp = iff_to_nd_flags(old_flags);
2308 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2309 if (new_flags != old_flags) {
2310 error = set_flags(netdev, new_flags);
/* Returns the 'change_seq' counter stored on the device that backs
 * 'netdev'. */
2317 netdev_linux_change_seq(const struct netdev *netdev)
2319 return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
/* Expands to the initializer used for the three netdev classes defined after
 * this macro.  Every class shares the netdev_linux_* callbacks listed here;
 * the macro parameters NAME, CREATE, GET_STATS and SET_STATS supply the
 * per-class parts.
 *
 * NOTE(review): the lines on which the parameters and several other slots are
 * placed are not visible in this extract.  The entries are positional, so
 * their order must match the member order of 'struct netdev_class'. */
2322 #define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS) \
2326 netdev_linux_init, \
2328 netdev_linux_wait, \
2331 netdev_linux_destroy, \
2332 NULL, /* get_config */ \
2333 NULL, /* set_config */ \
2335 netdev_linux_open, \
2336 netdev_linux_close, \
2338 netdev_linux_listen, \
2339 netdev_linux_recv, \
2340 netdev_linux_recv_wait, \
2341 netdev_linux_drain, \
2343 netdev_linux_send, \
2344 netdev_linux_send_wait, \
2346 netdev_linux_set_etheraddr, \
2347 netdev_linux_get_etheraddr, \
2348 netdev_linux_get_mtu, \
2349 netdev_linux_set_mtu, \
2350 netdev_linux_get_ifindex, \
2351 netdev_linux_get_carrier, \
2352 netdev_linux_set_miimon_interval, \
2356 netdev_linux_get_features, \
2357 netdev_linux_set_advertisements, \
2358 netdev_linux_get_vlan_vid, \
2360 netdev_linux_set_policing, \
2361 netdev_linux_get_qos_types, \
2362 netdev_linux_get_qos_capabilities, \
2363 netdev_linux_get_qos, \
2364 netdev_linux_set_qos, \
2365 netdev_linux_get_queue, \
2366 netdev_linux_set_queue, \
2367 netdev_linux_delete_queue, \
2368 netdev_linux_get_queue_stats, \
2369 netdev_linux_dump_queues, \
2370 netdev_linux_dump_queue_stats, \
2372 netdev_linux_get_in4, \
2373 netdev_linux_set_in4, \
2374 netdev_linux_get_in6, \
2375 netdev_linux_add_router, \
2376 netdev_linux_get_next_hop, \
2377 netdev_linux_get_status, \
2378 netdev_linux_arp_lookup, \
2380 netdev_linux_update_flags, \
2382 netdev_linux_change_seq \
/* Class for ordinary Linux network devices: created with
 * netdev_linux_create(), statistics from netdev_linux_get_stats(), and no
 * set_stats support.  (The class-name argument is on a line not visible in
 * this extract.) */
2385 const struct netdev_class netdev_linux_class =
2388 netdev_linux_create,
2389 netdev_linux_get_stats,
2390 NULL); /* set_stats */
/* Class for tap devices: created with netdev_linux_create_tap(), statistics
 * from netdev_pseudo_get_stats() (which swaps rx/tx when no vport stats are
 * available), and no set_stats support.  (The class-name argument is on a
 * line not visible in this extract.) */
2392 const struct netdev_class netdev_tap_class =
2395 netdev_linux_create_tap,
2396 netdev_pseudo_get_stats,
2397 NULL); /* set_stats */
/* Class for internal devices: created with netdev_linux_create(), statistics
 * from netdev_pseudo_get_stats(), and set_stats delegated to
 * netdev_vport_set_stats().  (The class-name argument is on a line not
 * visible in this extract.) */
2399 const struct netdev_class netdev_internal_class =
2402 netdev_linux_create,
2403 netdev_pseudo_get_stats,
2404 netdev_vport_set_stats);
2406 /* HTB traffic control class. */
/* Upper bound on HTB class minor numbers: htb_parse_tcmsg__() accepts minors
 * 1..HTB_N_QUEUES under major 1 and maps them to queue ids minor - 1. */
2408 #define HTB_N_QUEUES 0xf000
/* NOTE(review): the 'struct' header lines are not visible in this extract.
 * Judging from their uses elsewhere in this file, the first 'max_rate'
 * belongs to the qdisc-level 'struct htb' and the remaining members to the
 * per-queue 'struct htb_class' -- confirm against the full source. */
2412 unsigned int max_rate; /* In bytes/s. */
2416 struct tc_queue tc_queue;
2417 unsigned int min_rate; /* In bytes/s. */
2418 unsigned int max_rate; /* In bytes/s. */
2419 unsigned int burst; /* In bytes. */
2420 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the 'struct htb' that embeds 'netdev''s current traffic-control
 * state as its 'tc' member, via CONTAINER_OF.  This is only meaningful while
 * 'netdev_dev->tc' actually points into a 'struct htb'; no check is made
 * here. */
2424 htb_get__(const struct netdev *netdev)
2426 struct netdev_dev_linux *netdev_dev =
2427 netdev_dev_linux_cast(netdev_get_dev(netdev));
2428 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
/* Allocates a 'struct htb', initializes its embedded tc state with
 * 'tc_ops_htb', records 'max_rate', and makes it the traffic-control state of
 * 'netdev'.  This only sets up the in-memory bookkeeping; nothing visible
 * here talks to the kernel.
 *
 * NOTE(review): 'max_rate' arrives as uint64_t but the 'max_rate' members
 * declared above are 'unsigned int', so large values would be truncated --
 * confirm the intended range. */
2432 htb_install__(struct netdev *netdev, uint64_t max_rate)
2434 struct netdev_dev_linux *netdev_dev =
2435 netdev_dev_linux_cast(netdev_get_dev(netdev));
2438 htb = xmalloc(sizeof *htb);
2439 tc_init(&htb->tc, &tc_ops_htb);
2440 htb->max_rate = max_rate;
2442 netdev_dev->tc = &htb->tc;
/* Replaces whatever qdisc 'netdev' has with a root HTB qdisc.
 *
 * Any existing qdisc is deleted first (the result of tc_del_qdisc() is not
 * checked).  An RTM_NEWQDISC request with NLM_F_EXCL | NLM_F_CREATE is then
 * built with handle 1:0 and parent TC_H_ROOT, kind "htb", and a nested
 * TCA_OPTIONS attribute carrying a 'struct tc_htb_glob' in TCA_HTB_INIT with
 * 'rate2quantum' set to 10.  Returns the result of tc_transact().
 *
 * NOTE(review): other 'opt' members may be set on lines not visible in this
 * extract. */
2445 /* Create an HTB qdisc.
2447 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2449 htb_setup_qdisc__(struct netdev *netdev)
2452 struct tc_htb_glob opt;
2453 struct ofpbuf request;
2454 struct tcmsg *tcmsg;
2456 tc_del_qdisc(netdev);
2458 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2459 NLM_F_EXCL | NLM_F_CREATE, &request);
2463 tcmsg->tcm_handle = tc_make_handle(1, 0);
2464 tcmsg->tcm_parent = TC_H_ROOT;
2466 nl_msg_put_string(&request, TCA_KIND, "htb");
2468 memset(&opt, 0, sizeof opt);
2469 opt.rate2quantum = 10;
2473 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2474 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2475 nl_msg_end_nested(&request, opt_offset);
2477 return tc_transact(&request, NULL);
/* Creates or replaces HTB class 'handle' under 'parent' on 'netdev' with the
 * parameters in 'class'.
 *
 * The device MTU is required because the rate tables and buffer sizes are
 * derived from it.  'min_rate' becomes the HTB rate and 'max_rate' the
 * ceiling; 'burst' sizes both the rate and ceiling buffers; 'priority' is
 * passed through.  The request (RTM_NEWTCLASS with NLM_F_CREATE) carries the
 * 'struct tc_htb_opt' plus rate tables for rate and ceil.  A failed
 * transaction is reported with a rate-limited warning that includes all
 * class parameters.
 *
 * NOTE(review): the guards and return statements on the error paths are not
 * visible in this extract. */
2480 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2481 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2483 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2484 unsigned int parent, struct htb_class *class)
2487 struct tc_htb_opt opt;
2488 struct ofpbuf request;
2489 struct tcmsg *tcmsg;
2493 error = netdev_get_mtu(netdev, &mtu);
2495 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2496 netdev_get_name(netdev));
2500 memset(&opt, 0, sizeof opt);
2501 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2502 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2503 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2504 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2505 opt.prio = class->priority;
2507 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2511 tcmsg->tcm_handle = handle;
2512 tcmsg->tcm_parent = parent;
2514 nl_msg_put_string(&request, TCA_KIND, "htb");
2515 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2516 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2517 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2518 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2519 nl_msg_end_nested(&request, opt_offset);
2521 error = tc_transact(&request, NULL);
2523 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2524 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2525 netdev_get_name(netdev),
2526 tc_get_major(handle), tc_get_minor(handle),
2527 tc_get_major(parent), tc_get_minor(parent),
2528 class->min_rate, class->max_rate,
2529 class->burst, class->priority, strerror(error));
2534 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2535 * description of them into 'details'. The description complies with the
2536 * specification given in the vswitch database documentation for linux-htb
2539 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2541 static const struct nl_policy tca_htb_policy[] = {
2542 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2543 .min_len = sizeof(struct tc_htb_opt) },
2546 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2547 const struct tc_htb_opt *htb;
2549 if (!nl_parse_nested(nl_options, tca_htb_policy,
2550 attrs, ARRAY_SIZE(tca_htb_policy))) {
2551 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2555 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2556 class->min_rate = htb->rate.rate;
2557 class->max_rate = htb->ceil.rate;
2558 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2559 class->priority = htb->prio;
/* Parses the traffic-class Netlink message 'tcmsg' for an HTB class.
 *
 * tc_parse_class() extracts the class handle, its nested options and
 * (through 'stats') its statistics.  When 'queue_id' is nonnull, the handle
 * must have major 1 and a minor in 1..HTB_N_QUEUES; the queue id is then
 * minor - 1.  When 'options' is nonnull, the nested options are decoded into
 * it with htb_parse_tca_options__().  Either output pointer may be null to
 * skip that part.
 *
 * NOTE(review): the handling of a handle outside the accepted range and the
 * final return are on lines not visible in this extract. */
2564 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2565 struct htb_class *options,
2566 struct netdev_queue_stats *stats)
2568 struct nlattr *nl_options;
2569 unsigned int handle;
2572 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2573 if (!error && queue_id) {
2574 unsigned int major = tc_get_major(handle);
2575 unsigned int minor = tc_get_minor(handle);
2576 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2577 *queue_id = minor - 1;
2582 if (!error && options) {
2583 error = htb_parse_tca_options__(nl_options, options);
2589 htb_parse_qdisc_details__(struct netdev *netdev,
2590 const struct shash *details, struct htb_class *hc)
2592 const char *max_rate_s;
2594 max_rate_s = shash_find_data(details, "max-rate");
2595 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2596 if (!hc->max_rate) {
2599 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2600 hc->max_rate = netdev_features_to_bps(current) / 8;
2602 hc->min_rate = hc->max_rate;
2608 htb_parse_class_details__(struct netdev *netdev,
2609 const struct shash *details, struct htb_class *hc)
2611 const struct htb *htb = htb_get__(netdev);
2612 const char *min_rate_s = shash_find_data(details, "min-rate");
2613 const char *max_rate_s = shash_find_data(details, "max-rate");
2614 const char *burst_s = shash_find_data(details, "burst");
2615 const char *priority_s = shash_find_data(details, "priority");
2618 error = netdev_get_mtu(netdev, &mtu);
2620 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2621 netdev_get_name(netdev));
2625 /* HTB requires at least an mtu sized min-rate to send any traffic even
2626 * on uncongested links. */
2627 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2628 hc->min_rate = MAX(hc->min_rate, mtu);
2629 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2632 hc->max_rate = (max_rate_s
2633 ? strtoull(max_rate_s, NULL, 10) / 8
2635 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2636 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2640 * According to hints in the documentation that I've read, it is important
2641 * that 'burst' be at least as big as the largest frame that might be
2642 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2643 * but having it a bit too small is a problem. Since netdev_get_mtu()
2644 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2645 * the MTU. We actually add 64, instead of 14, as a guard against
2646 * additional headers get tacked on somewhere that we're not aware of. */
2647 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2648 hc->burst = MAX(hc->burst, mtu + 64);
2651 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for the HTB class on 'netdev' with the given 'handle'
 * and 'parent', then parses the reply into 'options' and 'stats' (see
 * htb_parse_tcmsg__()).
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
htb_query_class__(const struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error;

    error = tc_query_class(netdev, handle, parent, &reply);
    if (!error) {
        error = htb_parse_tcmsg__(reply, NULL, options, stats);
        ofpbuf_delete(reply);
    }
    return error;
}
2673 htb_tc_install(struct netdev *netdev, const struct shash *details)
2677 error = htb_setup_qdisc__(netdev);
2679 struct htb_class hc;
2681 htb_parse_qdisc_details__(netdev, details, &hc);
2682 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2683 tc_make_handle(1, 0), &hc);
2685 htb_install__(netdev, hc.max_rate);
2691 static struct htb_class *
2692 htb_class_cast__(const struct tc_queue *queue)
2694 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2698 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2699 const struct htb_class *hc)
2701 struct htb *htb = htb_get__(netdev);
2702 size_t hash = hash_int(queue_id, 0);
2703 struct tc_queue *queue;
2704 struct htb_class *hcp;
2706 queue = tc_find_queue__(netdev, queue_id, hash);
2708 hcp = htb_class_cast__(queue);
2710 hcp = xmalloc(sizeof *hcp);
2711 queue = &hcp->tc_queue;
2712 queue->queue_id = queue_id;
2713 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2716 hcp->min_rate = hc->min_rate;
2717 hcp->max_rate = hc->max_rate;
2718 hcp->burst = hc->burst;
2719 hcp->priority = hc->priority;
2723 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2726 struct nl_dump dump;
2727 struct htb_class hc;
2729 /* Get qdisc options. */
2731 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2732 htb_install__(netdev, hc.max_rate);
2735 if (!start_queue_dump(netdev, &dump)) {
2738 while (nl_dump_next(&dump, &msg)) {
2739 unsigned int queue_id;
2741 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2742 htb_update_queue__(netdev, queue_id, &hc);
2745 nl_dump_done(&dump);
2751 htb_tc_destroy(struct tc *tc)
2753 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2754 struct htb_class *hc, *next;
2756 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2757 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2765 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2767 const struct htb *htb = htb_get__(netdev);
2768 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
2773 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2775 struct htb_class hc;
2778 htb_parse_qdisc_details__(netdev, details, &hc);
2779 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2780 tc_make_handle(1, 0), &hc);
2782 htb_get__(netdev)->max_rate = hc.max_rate;
2788 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2789 const struct tc_queue *queue, struct shash *details)
2791 const struct htb_class *hc = htb_class_cast__(queue);
2793 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2794 if (hc->min_rate != hc->max_rate) {
2795 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2797 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2799 shash_add(details, "priority", xasprintf("%u", hc->priority));
2805 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2806 const struct shash *details)
2808 struct htb_class hc;
2811 error = htb_parse_class_details__(netdev, details, &hc);
2816 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2817 tc_make_handle(1, 0xfffe), &hc);
2822 htb_update_queue__(netdev, queue_id, &hc);
2827 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2829 struct htb_class *hc = htb_class_cast__(queue);
2830 struct htb *htb = htb_get__(netdev);
2833 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2835 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2842 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2843 struct netdev_queue_stats *stats)
2845 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2846 tc_make_handle(1, 0xfffe), NULL, stats);
2850 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2851 const struct ofpbuf *nlmsg,
2852 netdev_dump_queue_stats_cb *cb, void *aux)
2854 struct netdev_queue_stats stats;
2855 unsigned int handle, major, minor;
2858 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2863 major = tc_get_major(handle);
2864 minor = tc_get_minor(handle);
2865 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2866 (*cb)(minor - 1, &stats, aux);
2871 static const struct tc_ops tc_ops_htb = {
2872 "htb", /* linux_name */
2873 "linux-htb", /* ovs_name */
2874 HTB_N_QUEUES, /* n_queues */
2883 htb_class_get_stats,
2884 htb_class_dump_stats
2887 /* "linux-hfsc" traffic control class. */
2889 #define HFSC_N_QUEUES 0xf000
2897 struct tc_queue tc_queue;
2902 static struct hfsc *
2903 hfsc_get__(const struct netdev *netdev)
2905 struct netdev_dev_linux *netdev_dev;
2906 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2907 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
2910 static struct hfsc_class *
2911 hfsc_class_cast__(const struct tc_queue *queue)
2913 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
2917 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2919 struct netdev_dev_linux * netdev_dev;
2922 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2923 hfsc = xmalloc(sizeof *hfsc);
2924 tc_init(&hfsc->tc, &tc_ops_hfsc);
2925 hfsc->max_rate = max_rate;
2926 netdev_dev->tc = &hfsc->tc;
2930 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2931 const struct hfsc_class *hc)
2935 struct hfsc_class *hcp;
2936 struct tc_queue *queue;
2938 hfsc = hfsc_get__(netdev);
2939 hash = hash_int(queue_id, 0);
2941 queue = tc_find_queue__(netdev, queue_id, hash);
2943 hcp = hfsc_class_cast__(queue);
2945 hcp = xmalloc(sizeof *hcp);
2946 queue = &hcp->tc_queue;
2947 queue->queue_id = queue_id;
2948 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
2951 hcp->min_rate = hc->min_rate;
2952 hcp->max_rate = hc->max_rate;
2956 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2958 const struct tc_service_curve *rsc, *fsc, *usc;
2959 static const struct nl_policy tca_hfsc_policy[] = {
2961 .type = NL_A_UNSPEC,
2963 .min_len = sizeof(struct tc_service_curve),
2966 .type = NL_A_UNSPEC,
2968 .min_len = sizeof(struct tc_service_curve),
2971 .type = NL_A_UNSPEC,
2973 .min_len = sizeof(struct tc_service_curve),
2976 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
2978 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
2979 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
2980 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
2984 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
2985 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
2986 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
2988 if (rsc->m1 != 0 || rsc->d != 0 ||
2989 fsc->m1 != 0 || fsc->d != 0 ||
2990 usc->m1 != 0 || usc->d != 0) {
2991 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2992 "Non-linear service curves are not supported.");
2996 if (rsc->m2 != fsc->m2) {
2997 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2998 "Real-time service curves are not supported ");
3002 if (rsc->m2 > usc->m2) {
3003 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3004 "Min-rate service curve is greater than "
3005 "the max-rate service curve.");
3009 class->min_rate = fsc->m2;
3010 class->max_rate = usc->m2;
3015 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3016 struct hfsc_class *options,
3017 struct netdev_queue_stats *stats)
3020 unsigned int handle;
3021 struct nlattr *nl_options;
3023 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3029 unsigned int major, minor;
3031 major = tc_get_major(handle);
3032 minor = tc_get_minor(handle);
3033 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3034 *queue_id = minor - 1;
3041 error = hfsc_parse_tca_options__(nl_options, options);
/* Queries the kernel for the HFSC class on 'netdev' with the given 'handle'
 * and 'parent', then parses the reply into 'options' and 'stats' (see
 * hfsc_parse_tcmsg__()).
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
{
    int error;
    struct ofpbuf *reply;

    error = tc_query_class(netdev, handle, parent, &reply);
    if (error) {
        return error;
    }

    error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
    ofpbuf_delete(reply);
    return error;
}
3066 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
3067 struct hfsc_class *class)
3070 const char *max_rate_s;
3072 max_rate_s = shash_find_data(details, "max-rate");
3073 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3078 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
3079 max_rate = netdev_features_to_bps(current) / 8;
3082 class->min_rate = max_rate;
3083 class->max_rate = max_rate;
3087 hfsc_parse_class_details__(struct netdev *netdev,
3088 const struct shash *details,
3089 struct hfsc_class * class)
3091 const struct hfsc *hfsc;
3092 uint32_t min_rate, max_rate;
3093 const char *min_rate_s, *max_rate_s;
3095 hfsc = hfsc_get__(netdev);
3096 min_rate_s = shash_find_data(details, "min-rate");
3097 max_rate_s = shash_find_data(details, "max-rate");
3099 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3100 min_rate = MAX(min_rate, 1);
3101 min_rate = MIN(min_rate, hfsc->max_rate);
3103 max_rate = (max_rate_s
3104 ? strtoull(max_rate_s, NULL, 10) / 8
3106 max_rate = MAX(max_rate, min_rate);
3107 max_rate = MIN(max_rate, hfsc->max_rate);
3109 class->min_rate = min_rate;
3110 class->max_rate = max_rate;
3115 /* Create an HFSC qdisc.
3117 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3119 hfsc_setup_qdisc__(struct netdev * netdev)
3121 struct tcmsg *tcmsg;
3122 struct ofpbuf request;
3123 struct tc_hfsc_qopt opt;
3125 tc_del_qdisc(netdev);
3127 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3128 NLM_F_EXCL | NLM_F_CREATE, &request);
3134 tcmsg->tcm_handle = tc_make_handle(1, 0);
3135 tcmsg->tcm_parent = TC_H_ROOT;
3137 memset(&opt, 0, sizeof opt);
3140 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3141 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3143 return tc_transact(&request, NULL);
3146 /* Create an HFSC class.
3148 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3149 * sc rate <min_rate> ul rate <max_rate>" */
3151 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3152 unsigned int parent, struct hfsc_class *class)
3156 struct tcmsg *tcmsg;
3157 struct ofpbuf request;
3158 struct tc_service_curve min, max;
3160 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3166 tcmsg->tcm_handle = handle;
3167 tcmsg->tcm_parent = parent;
3171 min.m2 = class->min_rate;
3175 max.m2 = class->max_rate;
3177 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3178 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3179 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3180 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3181 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3182 nl_msg_end_nested(&request, opt_offset);
3184 error = tc_transact(&request, NULL);
3186 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3187 "min-rate %ubps, max-rate %ubps (%s)",
3188 netdev_get_name(netdev),
3189 tc_get_major(handle), tc_get_minor(handle),
3190 tc_get_major(parent), tc_get_minor(parent),
3191 class->min_rate, class->max_rate, strerror(error));
3198 hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3201 struct hfsc_class class;
3203 error = hfsc_setup_qdisc__(netdev);
3209 hfsc_parse_qdisc_details__(netdev, details, &class);
3210 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3211 tc_make_handle(1, 0), &class);
3217 hfsc_install__(netdev, class.max_rate);
3222 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3225 struct nl_dump dump;
3226 struct hfsc_class hc;
3229 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3230 hfsc_install__(netdev, hc.max_rate);
3232 if (!start_queue_dump(netdev, &dump)) {
3236 while (nl_dump_next(&dump, &msg)) {
3237 unsigned int queue_id;
3239 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3240 hfsc_update_queue__(netdev, queue_id, &hc);
3244 nl_dump_done(&dump);
3249 hfsc_tc_destroy(struct tc *tc)
3252 struct hfsc_class *hc, *next;
3254 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3256 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3257 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3266 hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3268 const struct hfsc *hfsc;
3269 hfsc = hfsc_get__(netdev);
3270 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
3275 hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3278 struct hfsc_class class;
3280 hfsc_parse_qdisc_details__(netdev, details, &class);
3281 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3282 tc_make_handle(1, 0), &class);
3285 hfsc_get__(netdev)->max_rate = class.max_rate;
3292 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3293 const struct tc_queue *queue, struct shash *details)
3295 const struct hfsc_class *hc;
3297 hc = hfsc_class_cast__(queue);
3298 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3299 if (hc->min_rate != hc->max_rate) {
3300 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
3306 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3307 const struct shash *details)
3310 struct hfsc_class class;
3312 error = hfsc_parse_class_details__(netdev, details, &class);
3317 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3318 tc_make_handle(1, 0xfffe), &class);
3323 hfsc_update_queue__(netdev, queue_id, &class);
3328 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3332 struct hfsc_class *hc;
3334 hc = hfsc_class_cast__(queue);
3335 hfsc = hfsc_get__(netdev);
3337 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3339 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3346 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3347 struct netdev_queue_stats *stats)
3349 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3350 tc_make_handle(1, 0xfffe), NULL, stats);
3354 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3355 const struct ofpbuf *nlmsg,
3356 netdev_dump_queue_stats_cb *cb, void *aux)
3358 struct netdev_queue_stats stats;
3359 unsigned int handle, major, minor;
3362 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3367 major = tc_get_major(handle);
3368 minor = tc_get_minor(handle);
3369 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3370 (*cb)(minor - 1, &stats, aux);
3375 static const struct tc_ops tc_ops_hfsc = {
3376 "hfsc", /* linux_name */
3377 "linux-hfsc", /* ovs_name */
3378 HFSC_N_QUEUES, /* n_queues */
3379 hfsc_tc_install, /* tc_install */
3380 hfsc_tc_load, /* tc_load */
3381 hfsc_tc_destroy, /* tc_destroy */
3382 hfsc_qdisc_get, /* qdisc_get */
3383 hfsc_qdisc_set, /* qdisc_set */
3384 hfsc_class_get, /* class_get */
3385 hfsc_class_set, /* class_set */
3386 hfsc_class_delete, /* class_delete */
3387 hfsc_class_get_stats, /* class_get_stats */
3388 hfsc_class_dump_stats /* class_dump_stats */
3391 /* "linux-default" traffic control class.
3393 * This class represents the default, unnamed Linux qdisc. It corresponds to
3394 * the "" (empty string) QoS type in the OVS database. */
3397 default_install__(struct netdev *netdev)
3399 struct netdev_dev_linux *netdev_dev =
3400 netdev_dev_linux_cast(netdev_get_dev(netdev));
3401 static struct tc *tc;
3404 tc = xmalloc(sizeof *tc);
3405 tc_init(tc, &tc_ops_default);
3407 netdev_dev->tc = tc;
3411 default_tc_install(struct netdev *netdev,
3412 const struct shash *details OVS_UNUSED)
3414 default_install__(netdev);
3419 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3421 default_install__(netdev);
3425 static const struct tc_ops tc_ops_default = {
3426 NULL, /* linux_name */
3431 NULL, /* tc_destroy */
3432 NULL, /* qdisc_get */
3433 NULL, /* qdisc_set */
3434 NULL, /* class_get */
3435 NULL, /* class_set */
3436 NULL, /* class_delete */
3437 NULL, /* class_get_stats */
3438 NULL /* class_dump_stats */
3441 /* "linux-other" traffic control class.
3446 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3448 struct netdev_dev_linux *netdev_dev =
3449 netdev_dev_linux_cast(netdev_get_dev(netdev));
3450 static struct tc *tc;
3453 tc = xmalloc(sizeof *tc);
3454 tc_init(tc, &tc_ops_other);
3456 netdev_dev->tc = tc;
3460 static const struct tc_ops tc_ops_other = {
3461 NULL, /* linux_name */
3462 "linux-other", /* ovs_name */
3464 NULL, /* tc_install */
3466 NULL, /* tc_destroy */
3467 NULL, /* qdisc_get */
3468 NULL, /* qdisc_set */
3469 NULL, /* class_get */
3470 NULL, /* class_set */
3471 NULL, /* class_delete */
3472 NULL, /* class_get_stats */
3473 NULL /* class_dump_stats */
/* Traffic control. */

/* Number of kernel "tc" ticks per second. */
static double ticks_per_s;

/* Number of kernel "jiffies" per second.  This is used for the purpose of
 * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
 * one jiffy's worth of data.
 *
 * There are two possibilities here:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
 *      approximate range of 100 to 1024.  That means that we really need to
 *      make sure that the qdisc can buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
 *      has finely granular timers and there's no need to fudge additional room
 *      for buffers.  (There's no extra effort needed to implement that: the
 *      large 'buffer_hz' is used as a divisor, so practically any number will
 *      come out as 0 in the division.  Small integer results in the case of
 *      really high dividends won't have any real effect anyhow.)
 */
static unsigned int buffer_hz;
/* Returns tc handle 'major':'minor'.  'major' occupies the upper 16 bits of
 * the result and 'minor' the lower 16 bits. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    return TC_H_MAKE(major << 16, minor);
}
/* Returns the major number from 'handle', that is, its upper 16 bits. */
static unsigned int
tc_get_major(unsigned int handle)
{
    return TC_H_MAJ(handle) >> 16;
}
/* Returns the minor number from 'handle', that is, its lower 16 bits. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    return TC_H_MIN(handle);
}
3521 static struct tcmsg *
3522 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3523 struct ofpbuf *request)
3525 struct tcmsg *tcmsg;
3529 error = get_ifindex(netdev, &ifindex);
3534 ofpbuf_init(request, 512);
3535 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3536 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3537 tcmsg->tcm_family = AF_UNSPEC;
3538 tcmsg->tcm_ifindex = ifindex;
3539 /* Caller should fill in tcmsg->tcm_handle. */
3540 /* Caller should fill in tcmsg->tcm_parent. */
3546 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3548 int error = nl_sock_transact(rtnl_sock, request, replyp);
3549 ofpbuf_uninit(request);
3556 /* The values in psched are not individually very meaningful, but they are
3557 * important. The tables below show some values seen in the wild.
3561 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3562 * (Before that, there are hints that it was 1000000000.)
3564 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3568 * -----------------------------------
3569 * [1] 000c8000 000f4240 000f4240 00000064
3570 * [2] 000003e8 00000400 000f4240 3b9aca00
3571 * [3] 000003e8 00000400 000f4240 3b9aca00
3572 * [4] 000003e8 00000400 000f4240 00000064
3573 * [5] 000003e8 00000040 000f4240 3b9aca00
3574 * [6] 000003e8 00000040 000f4240 000000f9
3576 * a b c d ticks_per_s buffer_hz
3577 * ------- --------- ---------- ------------- ----------- -------------
3578 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3579 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3580 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3581 * [4] 1,000 1,024 1,000,000 100 976,562 100
3582 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3583 * [6] 1,000 64 1,000,000 249 15,625,000 249
3585 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3586 * [2] 2.6.26-1-686-bigmem from Debian lenny
3587 * [3] 2.6.26-2-sparc64 from Debian lenny
3588 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3589 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3590 * [6] 2.6.34 from kernel.org on KVM
3592 static const char fn[] = "/proc/net/psched";
3593 unsigned int a, b, c, d;
3599 stream = fopen(fn, "r");
3601 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3605 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3606 VLOG_WARN("%s: read failed", fn);
3610 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3614 VLOG_WARN("%s: invalid scheduler parameters", fn);
3618 ticks_per_s = (double) a * c / b;
3622 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3625 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3628 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3629 * rate of 'rate' bytes per second. */
3631 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3636 return (rate * ticks) / ticks_per_s;
3639 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3640 * rate of 'rate' bytes per second. */
3642 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3647 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3650 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3651 * a transmission rate of 'rate' bytes per second. */
3653 tc_buffer_per_jiffy(unsigned int rate)
3658 return rate / buffer_hz;
3661 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3662 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3663 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3664 * stores NULL into it if it is absent.
3666 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3669 * Returns 0 if successful, otherwise a positive errno value. */
3671 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3672 struct nlattr **options)
3674 static const struct nl_policy tca_policy[] = {
3675 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3676 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3678 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3680 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3681 tca_policy, ta, ARRAY_SIZE(ta))) {
3682 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3687 *kind = nl_attr_get_string(ta[TCA_KIND]);
3691 *options = ta[TCA_OPTIONS];
3706 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3707 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3708 * into '*options', and its queue statistics into '*stats'. Any of the output
3709 * arguments may be null.
3711 * Returns 0 if successful, otherwise a positive errno value. */
3713 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3714 struct nlattr **options, struct netdev_queue_stats *stats)
3716 static const struct nl_policy tca_policy[] = {
3717 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3718 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3720 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3722 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3723 tca_policy, ta, ARRAY_SIZE(ta))) {
3724 VLOG_WARN_RL(&rl, "failed to parse class message");
3729 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3730 *handlep = tc->tcm_handle;
3734 *options = ta[TCA_OPTIONS];
3738 const struct gnet_stats_queue *gsq;
3739 struct gnet_stats_basic gsb;
3741 static const struct nl_policy stats_policy[] = {
3742 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3743 .min_len = sizeof gsb },
3744 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3745 .min_len = sizeof *gsq },
3747 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3749 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3750 sa, ARRAY_SIZE(sa))) {
3751 VLOG_WARN_RL(&rl, "failed to parse class stats");
3755 /* Alignment issues screw up the length of struct gnet_stats_basic on
3756 * some arch/bitsize combinations. Newer versions of Linux have a
3757 * struct gnet_stats_basic_packed, but we can't depend on that. The
3758 * easiest thing to do is just to make a copy. */
3759 memset(&gsb, 0, sizeof gsb);
3760 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3761 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3762 stats->tx_bytes = gsb.bytes;
3763 stats->tx_packets = gsb.packets;
3765 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3766 stats->tx_errors = gsq->drops;
3776 memset(stats, 0, sizeof *stats);
3781 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3784 tc_query_class(const struct netdev *netdev,
3785 unsigned int handle, unsigned int parent,
3786 struct ofpbuf **replyp)
3788 struct ofpbuf request;
3789 struct tcmsg *tcmsg;
3792 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3796 tcmsg->tcm_handle = handle;
3797 tcmsg->tcm_parent = parent;
3799 error = tc_transact(&request, replyp);
3801 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3802 netdev_get_name(netdev),
3803 tc_get_major(handle), tc_get_minor(handle),
3804 tc_get_major(parent), tc_get_minor(parent),
3810 /* Equivalent to "tc class del dev <name> handle <handle>". */
3812 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3814 struct ofpbuf request;
3815 struct tcmsg *tcmsg;
3818 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3822 tcmsg->tcm_handle = handle;
3823 tcmsg->tcm_parent = 0;
3825 error = tc_transact(&request, NULL);
3827 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3828 netdev_get_name(netdev),
3829 tc_get_major(handle), tc_get_minor(handle),
3835 /* Equivalent to "tc qdisc del dev <name> root". */
3837 tc_del_qdisc(struct netdev *netdev)
3839 struct netdev_dev_linux *netdev_dev =
3840 netdev_dev_linux_cast(netdev_get_dev(netdev));
3841 struct ofpbuf request;
3842 struct tcmsg *tcmsg;
3845 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
3849 tcmsg->tcm_handle = tc_make_handle(1, 0);
3850 tcmsg->tcm_parent = TC_H_ROOT;
3852 error = tc_transact(&request, NULL);
3853 if (error == EINVAL) {
3854 /* EINVAL probably means that the default qdisc was in use, in which
3855 * case we've accomplished our purpose. */
3858 if (!error && netdev_dev->tc) {
3859 if (netdev_dev->tc->ops->tc_destroy) {
3860 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3862 netdev_dev->tc = NULL;
3867 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3868 * kernel to determine what they are. Returns 0 if successful, otherwise a
3869 * positive errno value. */
3871 tc_query_qdisc(const struct netdev *netdev)
3873 struct netdev_dev_linux *netdev_dev =
3874 netdev_dev_linux_cast(netdev_get_dev(netdev));
3875 struct ofpbuf request, *qdisc;
3876 const struct tc_ops *ops;
3877 struct tcmsg *tcmsg;
3881 if (netdev_dev->tc) {
3885 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3886 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3887 * 2.6.35 without that fix backported to it.
3889 * To avoid the OOPS, we must not make a request that would attempt to dump
3890 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3891 * few others. There are a few ways that I can see to do this, but most of
3892 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3893 * technique chosen here is to assume that any non-default qdisc that we
3894 * create will have a class with handle 1:0. The built-in qdiscs only have
3895 * a class with handle 0:0.
3897 * We could check for Linux 2.6.35+ and use a more straightforward method
3899 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
3903 tcmsg->tcm_handle = tc_make_handle(1, 0);
3904 tcmsg->tcm_parent = 0;
3906 /* Figure out what tc class to instantiate. */
3907 error = tc_transact(&request, &qdisc);
3911 error = tc_parse_qdisc(qdisc, &kind, NULL);
3913 ops = &tc_ops_other;
3915 ops = tc_lookup_linux_name(kind);
3917 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3918 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3920 ops = &tc_ops_other;
3923 } else if (error == ENOENT) {
3924 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3925 * other entity that doesn't have a handle 1:0. We will assume
3926 * that it's the system default qdisc. */
3927 ops = &tc_ops_default;
3930 /* Who knows? Maybe the device got deleted. */
3931 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3932 netdev_get_name(netdev), strerror(error));
3933 ops = &tc_ops_other;
3936 /* Instantiate it. */
3937 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
3938 assert((load_error == 0) == (netdev_dev->tc != NULL));
3939 ofpbuf_delete(qdisc);
3941 return error ? error : load_error;
3944 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3945 approximate the time to transmit packets of various lengths. For an MTU of
3946 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3947 represents two possible packet lengths; for an MTU of 513 through 1024, four
3948 possible lengths; and so on.
3950 Returns, for the specified 'mtu', the number of bits that packet lengths
3951 need to be shifted right to fit within such a 256-entry table. */
3953 tc_calc_cell_log(unsigned int mtu)
/* Fallback MTU.  NOTE(review): the condition that selects this fallback is
 * not visible in this excerpt -- confirm when it applies. */
3958 mtu = ETH_PAYLOAD_MAX;
/* Add the Ethernet header and VLAN header lengths on top of the MTU before
 * sizing the table. */
3960 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
/* 'cell_log' counts the iterations that run while the size is still 256 or
 * more.  NOTE(review): the loop body is not visible here; presumably it
 * shrinks 'mtu' on each pass -- confirm. */
3962 for (cell_log = 0; mtu >= 256; cell_log++) {
3969 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
 * of 'mtu'. */
3972 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
/* Start from an all-zero ratespec so that any field not assigned below
 * reads as 0. */
3974 memset(rate, 0, sizeof *rate);
/* The rtab shift is derived from 'mtu' by tc_calc_cell_log(). */
3975 rate->cell_log = tc_calc_cell_log(mtu);
3976 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3977 /* rate->cell_align = 0; */ /* distro headers. */
/* The 'mpu' field is set to ETH_TOTAL_MIN; tc_put_rtab() treats any packet
 * size below 'mpu' as 'mpu' bytes. */
3978 rate->mpu = ETH_TOTAL_MIN;
/* NOTE(review): the statement that stores 'Bps' into 'rate' is not visible
 * in this excerpt -- confirm it follows here. */
3982 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
3983 * attribute of the specified "type".
3985 * See tc_calc_cell_log() above for a description of "rtab"s. */
3987 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
/* Add an attribute of 'type' with TC_RTAB_SIZE bytes of payload to 'msg'.
 * The "_uninit" helper presumably leaves the payload unset; every entry is
 * written by the loop below. */
3992 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
/* TC_RTAB_SIZE is a byte count, so divide by the entry size to get the
 * number of table slots. */
3993 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
/* Slot 'i' is costed as a packet of (i + 1) << cell_log bytes. */
3994 unsigned packet_size = (i + 1) << rate->cell_log;
/* Packets smaller than 'mpu' are costed as if they were 'mpu' bytes. */
3995 if (packet_size < rate->mpu) {
3996 packet_size = rate->mpu;
/* Store the cost of that packet size at 'rate->rate', as computed by
 * tc_bytes_to_ticks(). */
3998 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
4002 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
4003 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
4004 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
 * 0 is fine.) */
4007 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
/* Lower bound on the burst: the value of tc_buffer_per_jiffy() for 'Bps'
 * plus one 'mtu'. */
4009 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
/* Use the larger of the requested burst and the lower bound, so a
 * 'burst_bytes' of 0 yields 'min_burst'.  The result is whatever
 * tc_bytes_to_ticks() returns for that many bytes at 'Bps'. */
4010 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
4013 /* Copies 'src' into 'dst', performing format conversion in the process. */
/* 'dst' receives the counters; 'src' is read only.  Each counter named
 * below is copied by plain assignment from the field of the same name. */
4015 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4016 const struct rtnl_link_stats *src)
/* Packet and byte totals. */
4018 dst->rx_packets = src->rx_packets;
4019 dst->tx_packets = src->tx_packets;
4020 dst->rx_bytes = src->rx_bytes;
4021 dst->tx_bytes = src->tx_bytes;
/* Aggregate error and drop counters. */
4022 dst->rx_errors = src->rx_errors;
4023 dst->tx_errors = src->tx_errors;
4024 dst->rx_dropped = src->rx_dropped;
4025 dst->tx_dropped = src->tx_dropped;
4026 dst->multicast = src->multicast;
4027 dst->collisions = src->collisions;
/* Detailed receive error counters. */
4028 dst->rx_length_errors = src->rx_length_errors;
4029 dst->rx_over_errors = src->rx_over_errors;
4030 dst->rx_crc_errors = src->rx_crc_errors;
4031 dst->rx_frame_errors = src->rx_frame_errors;
4032 dst->rx_fifo_errors = src->rx_fifo_errors;
4033 dst->rx_missed_errors = src->rx_missed_errors;
/* Detailed transmit error counters. */
4034 dst->tx_aborted_errors = src->tx_aborted_errors;
4035 dst->tx_carrier_errors = src->tx_carrier_errors;
4036 dst->tx_fifo_errors = src->tx_fifo_errors;
4037 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4038 dst->tx_window_errors = src->tx_window_errors;
4042 /* Utility functions. */
/* Sends an RTM_GETLINK request for the interface with index 'ifindex' on
 * 'rtnl_sock' and, if the reply carries an IFLA_STATS attribute, converts it
 * into 'stats'.
 *
 * NOTE(review): the return statements and the error checks between steps are
 * not visible in this excerpt -- confirm the exact return values. */
4045 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4047 /* Policy for RTNLGRP_LINK messages.
4049 * There are *many* more fields in these messages, but currently we only
4050 * care about these fields. */
4051 static const struct nl_policy rtnlgrp_link_policy[] = {
/* The interface name is mandatory.  Stats are optional as far as parsing
 * goes, but when present must be at least as large as
 * struct rtnl_link_stats. */
4052 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4053 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4054 .min_len = sizeof(struct rtnl_link_stats) },
4057 struct ofpbuf request;
4058 struct ofpbuf *reply;
4059 struct ifinfomsg *ifi;
4060 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
/* Build the request: a Netlink header followed by a zeroed ifinfomsg that
 * identifies the link by index. */
4063 ofpbuf_init(&request, 0);
4064 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4065 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4066 ifi->ifi_family = PF_UNSPEC;
4067 ifi->ifi_index = ifindex;
/* The request buffer is released right after the transaction, whatever
 * 'error' turns out to be. */
4068 error = nl_sock_transact(rtnl_sock, &request, &reply);
4069 ofpbuf_uninit(&request);
/* Parse the attributes that follow the Netlink header and the ifinfomsg.
 * If the reply does not satisfy the policy, it is freed here. */
4074 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4075 rtnlgrp_link_policy,
4076 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4077 ofpbuf_delete(reply);
/* IFLA_STATS is optional in the policy, so its absence has to be checked
 * explicitly; the reply is freed on this path too. */
4081 if (!attrs[IFLA_STATS]) {
4082 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4083 ofpbuf_delete(reply);
/* Convert the attribute payload into 'stats', then free the reply. */
4087 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4089 ofpbuf_delete(reply);
/* Reads /proc/net/dev line by line and fills 'stats' from the line whose
 * device name equals 'netdev_name'.  Counters that this source does not
 * provide are set to UINT64_MAX.
 *
 * NOTE(review): the return statements, the closing of 'stream' and part of
 * the sscanf() argument list are not visible in this excerpt -- confirm. */
4095 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4097 static const char fn[] = "/proc/net/dev";
4102 stream = fopen(fn, "r");
/* Logged when the file cannot be opened (the guarding condition is not
 * visible here). */
4104 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4109 while (fgets(line, sizeof line, stream)) {
4112 #define X64 "%"SCNu64
/* Each row of the format reads seven 64-bit counters and skips one more
 * unsigned field ("%*u").  Together with the device name that makes the
 * 15 conversions checked below. */
4115 X64 X64 X64 X64 X64 X64 X64 "%*u"
4116 X64 X64 X64 X64 X64 X64 X64 "%*u",
4122 &stats->rx_fifo_errors,
4123 &stats->rx_frame_errors,
4129 &stats->tx_fifo_errors,
4131 &stats->tx_carrier_errors) != 15) {
4132 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4133 } else if (!strcmp(devname, netdev_name)) {
/* These counters are not among the values parsed above, so they are
 * set to UINT64_MAX for the matching device. */
4134 stats->rx_length_errors = UINT64_MAX;
4135 stats->rx_over_errors = UINT64_MAX;
4136 stats->rx_crc_errors = UINT64_MAX;
4137 stats->rx_missed_errors = UINT64_MAX;
4138 stats->tx_aborted_errors = UINT64_MAX;
4139 stats->tx_heartbeat_errors = UINT64_MAX;
4140 stats->tx_window_errors = UINT64_MAX;
/* Logged for the case where no line matched 'netdev_name' (presumably
 * reached only after the loop ends without a match -- confirm). */
4146 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Queries the interface flags of 'netdev' with the SIOCGIFFLAGS ioctl and
 * stores them in '*flags'.
 *
 * NOTE(review): the tail of the ioctl call and the return statement are not
 * visible in this excerpt; '*flags' appears to be assigned regardless of
 * 'error' -- confirm. */
4152 get_flags(const struct netdev *netdev, int *flags)
4157 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
4159 *flags = ifr.ifr_flags;
/* Sets the interface flags of 'netdev' to 'flags' with the SIOCSIFFLAGS
 * ioctl and returns the result of netdev_linux_do_ioctl(). */
4164 set_flags(struct netdev *netdev, int flags)
/* Only ifr_flags is filled in here; netdev_linux_do_ioctl() copies the
 * device name into 'ifr' itself. */
4168 ifr.ifr_flags = flags;
4169 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
/* Looks up the interface index of the device named 'netdev_name' with the
 * SIOCGIFINDEX ioctl on 'af_inet_sock' and returns it.
 *
 * NOTE(review): the value returned when the ioctl fails is not visible in
 * this excerpt -- confirm. */
4174 do_get_ifindex(const char *netdev_name)
4178 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4179 COVERAGE_INC(netdev_get_ifindex);
4180 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
/* Failure is logged through the rate-limited warning macro. */
4181 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4182 netdev_name, strerror(errno));
4185 return ifr.ifr_ifindex;
/* Stores the interface index of 'netdev_' in '*ifindexp', using the value
 * cached in the underlying netdev_dev_linux when VALID_IFINDEX is set.
 *
 * NOTE(review): the handling of a failed lookup and the return statements
 * are not visible in this excerpt -- confirm. */
4189 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4191 struct netdev_dev_linux *netdev_dev =
4192 netdev_dev_linux_cast(netdev_get_dev(netdev_));
/* Query the index only when the cached copy is not marked valid. */
4194 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4195 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
/* Record the looked-up value and mark it valid so that later calls skip
 * the lookup. */
4199 netdev_dev->cache_valid |= VALID_IFINDEX;
4200 netdev_dev->ifindex = ifindex;
4202 *ifindexp = netdev_dev->ifindex;
/* Reads the hardware address of the device named 'netdev_name' with the
 * SIOCGIFHWADDR ioctl on 'af_inet_sock' and copies its first ETH_ADDR_LEN
 * bytes into 'ea'.
 *
 * NOTE(review): the return statements are not visible in this excerpt --
 * confirm the values returned on success and failure. */
4207 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4212 memset(&ifr, 0, sizeof ifr);
4213 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4214 COVERAGE_INC(netdev_get_hwaddr);
4215 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4216 /* ENODEV probably means that a vif disappeared asynchronously and
4217 * hasn't been removed from the database yet, so reduce the log level
4218 * to INFO for that case. */
4219 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4220 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4221 netdev_name, strerror(errno));
/* An address family other than AF_UNSPEC or ARPHRD_ETHER only produces a
 * warning in the lines visible here; the address bytes are still copied
 * out below. */
4224 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4225 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4226 VLOG_WARN("%s device has unknown hardware address family %d",
4227 netdev_name, hwaddr_family);
4229 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware address of the device named 'netdev_name' to 'mac',
 * tagged with address family 'hwaddr_family', using the SIOCSIFHWADDR ioctl
 * on 'af_inet_sock'.
 *
 * NOTE(review): the return statements are not visible in this excerpt --
 * confirm the values returned on success and failure. */
4234 set_etheraddr(const char *netdev_name, int hwaddr_family,
4235 const uint8_t mac[ETH_ADDR_LEN])
/* Build the request from a zeroed ifreq: device name, address family and
 * the ETH_ADDR_LEN address bytes. */
4239 memset(&ifr, 0, sizeof ifr);
4240 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4241 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4242 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4243 COVERAGE_INC(netdev_set_hwaddr);
4244 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
/* Unlike the other ioctl wrappers visible here, this failure is logged at
 * error level and is not rate-limited. */
4245 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4246 netdev_name, strerror(errno));
/* Issues a SIOCETHTOOL ioctl on 'af_inet_sock' for the device named 'name',
 * passing 'ecmd' as the request data.  'cmd_name' is used only in the log
 * message.
 *
 * NOTE(review): how 'cmd' is stored into 'ecmd' and what is returned are not
 * visible in this excerpt -- confirm. */
4253 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4254 int cmd, const char *cmd_name)
4258 memset(&ifr, 0, sizeof ifr);
4259 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
/* The ethtool request travels through the ifreq's data pointer. */
4260 ifr.ifr_data = (caddr_t) ecmd;
4263 COVERAGE_INC(netdev_ethtool);
4264 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
/* Only failures other than EOPNOTSUPP are logged. */
4267 if (errno != EOPNOTSUPP) {
4268 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4269 "failed: %s", cmd_name, name, strerror(errno));
4271 /* The device doesn't support this operation. That's pretty
4272 * common, so there's no point in logging anything. */
4278 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4279 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
/* 'flag_name' is used only in the warning logged when the change does not
 * take effect.
 *
 * NOTE(review): the error checks after each ethtool call and the return
 * statements are not visible in this excerpt -- confirm. */
4281 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4282 const char *flag_name, bool enable)
4284 const char *netdev_name = netdev_get_name(netdev);
4285 struct ethtool_value evalue;
/* Step 1: read the current flags. */
4289 memset(&evalue, 0, sizeof evalue);
4290 error = netdev_linux_do_ethtool(netdev_name,
4291 (struct ethtool_cmd *)&evalue,
4292 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* Step 2: clear 'flag', set it again if 'enable', and write the result
 * back.  'new_flags' keeps the value requested for the check below. */
4297 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4298 error = netdev_linux_do_ethtool(netdev_name,
4299 (struct ethtool_cmd *)&evalue,
4300 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
/* Step 3: read the flags again to see what the device actually holds. */
4305 memset(&evalue, 0, sizeof evalue);
4306 error = netdev_linux_do_ethtool(netdev_name,
4307 (struct ethtool_cmd *)&evalue,
4308 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* A mismatch means the value read back differs from the one requested. */
4313 if (new_flags != evalue.data) {
4314 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4315 "device %s failed", enable ? "enable" : "disable",
4316 flag_name, netdev_name);
/* Copies 'name' into 'ifr->ifr_name' and issues ioctl 'cmd' on
 * 'af_inet_sock' with 'ifr' as its argument.  'cmd_name' is used only in the
 * log message.
 *
 * NOTE(review): the return statements are not visible in this excerpt --
 * confirm the values returned on success and failure. */
4324 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4325 const char *cmd_name)
/* The caller fills in any other 'ifr' fields; only the name is set here. */
4327 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4328 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
/* Failures are logged at debug level, rate-limited. */
4329 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
/* Runs ioctl 'cmd' (named 'cmd_name' for logging) on 'netdev' with an
 * AF_INET address request and stores the IPv4 address from the result in
 * '*ip'.
 *
 * NOTE(review): the condition guarding the copy into '*ip' and the return
 * statement are not visible in this excerpt -- confirm. */
4337 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4338 int cmd, const char *cmd_name)
4343 ifr.ifr_addr.sa_family = AF_INET;
4344 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
/* The generic address in 'ifr' is read as a sockaddr_in to reach the IPv4
 * address. */
4346 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4347 *ip = sin->sin_addr;
4352 /* Returns an AF_PACKET raw socket or a negative errno value. */
4354 af_packet_sock(void)
4356 static int sock = INT_MIN;
4358 if (sock == INT_MIN) {
4359 sock = socket(AF_PACKET, SOCK_RAW, 0);
4361 set_nonblocking(sock);
4364 VLOG_ERR("failed to create packet socket: %s", strerror(errno));