2 * Copyright (c) 2009, 2010, 2011 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
24 #include <arpa/inet.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
30 #include <linux/types.h>
31 #include <linux/ethtool.h>
32 #include <linux/mii.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <linux/version.h>
37 #include <sys/types.h>
38 #include <sys/ioctl.h>
39 #include <sys/socket.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
52 #include "dpif-linux.h"
53 #include "dynamic-string.h"
54 #include "fatal-signal.h"
57 #include "netdev-provider.h"
58 #include "netdev-vport.h"
60 #include "netlink-notifier.h"
61 #include "netlink-socket.h"
63 #include "openflow/openflow.h"
65 #include "poll-loop.h"
66 #include "rtnetlink-link.h"
67 #include "socket-util.h"
73 VLOG_DEFINE_THIS_MODULE(netdev_linux);
75 COVERAGE_DEFINE(netdev_set_policing);
76 COVERAGE_DEFINE(netdev_arp_lookup);
77 COVERAGE_DEFINE(netdev_get_ifindex);
78 COVERAGE_DEFINE(netdev_get_hwaddr);
79 COVERAGE_DEFINE(netdev_set_hwaddr);
80 COVERAGE_DEFINE(netdev_ethtool);
82 /* These were introduced in Linux 2.6.14, so they might be missing if we have
84 #ifndef ADVERTISED_Pause
85 #define ADVERTISED_Pause (1 << 13)
87 #ifndef ADVERTISED_Asym_Pause
88 #define ADVERTISED_Asym_Pause (1 << 14)
91 /* These were introduced in Linux 2.6.24, so they might be missing if we
92 * have old headers. */
93 #ifndef ETHTOOL_GFLAGS
94 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
96 #ifndef ETHTOOL_SFLAGS
97 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
100 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
103 #define TC_RTAB_SIZE 1024
106 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
107 static int cache_notifier_refcount;
110 VALID_IFINDEX = 1 << 0,
111 VALID_ETHERADDR = 1 << 1,
115 VALID_POLICING = 1 << 5,
116 VALID_HAVE_VPORT_STATS = 1 << 6
124 /* Traffic control. */
126 /* An instance of a traffic control class. Always associated with a particular
129 * Each TC implementation subclasses this with whatever additional data it
132 const struct tc_ops *ops;
133 struct hmap queues; /* Contains "struct tc_queue"s.
134 * Read by generic TC layer.
135 * Written only by TC implementation. */
138 /* One traffic control queue.
140 * Each TC implementation subclasses this with whatever additional data it
143 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
144 unsigned int queue_id; /* OpenFlow queue ID. */
147 /* A particular kind of traffic control. Each implementation generally maps to
148 * one particular Linux qdisc class.
150 * The functions below return 0 if successful or a positive errno value on
151 * failure, except where otherwise noted. All of them must be provided, except
152 * where otherwise noted. */
154 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
155 * This is null for tc_ops_default and tc_ops_other, for which there are no
156 * appropriate values. */
157 const char *linux_name;
159 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
160 const char *ovs_name;
162 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
163 * queues. The queues are numbered 0 through n_queues - 1. */
164 unsigned int n_queues;
166 /* Called to install this TC class on 'netdev'. The implementation should
167 * make the Netlink calls required to set up 'netdev' with the right qdisc
168 * and configure it according to 'details'. The implementation may assume
169 * that the current qdisc is the default; that is, there is no need for it
170 * to delete the current qdisc before installing itself.
172 * The contents of 'details' should be documented as valid for 'ovs_name'
173 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
174 * (which is built as ovs-vswitchd.conf.db(8)).
176 * This function must return 0 if and only if it sets 'netdev->tc' to an
177 * initialized 'struct tc'.
179 * (This function is null for tc_ops_other, which cannot be installed. For
180 * other TC classes it should always be nonnull.) */
181 int (*tc_install)(struct netdev *netdev, const struct shash *details);
183 /* Called when the netdev code determines (through a Netlink query) that
184 * this TC class's qdisc is installed on 'netdev', but we didn't install
185 * it ourselves and so don't know any of the details.
187 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
188 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
189 * implementation should parse the other attributes of 'nlmsg' as
190 * necessary to determine its configuration. If necessary it should also
191 * use Netlink queries to determine the configuration of queues on
194 * This function must return 0 if and only if it sets 'netdev->tc' to an
195 * initialized 'struct tc'. */
196 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
198 /* Destroys the data structures allocated by the implementation as part of
199 * 'tc'. (This includes destroying 'tc->queues' by calling
202 * The implementation should not need to perform any Netlink calls. If
203 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
204 * (But it may not be desirable.)
206 * This function may be null if 'tc' is trivial. */
207 void (*tc_destroy)(struct tc *tc);
209 /* Retrieves details of 'netdev->tc' configuration into 'details'.
211 * The implementation should not need to perform any Netlink calls, because
212 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
213 * cached the configuration.
215 * The contents of 'details' should be documented as valid for 'ovs_name'
216 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
217 * (which is built as ovs-vswitchd.conf.db(8)).
219 * This function may be null if 'tc' is not configurable.
221 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
223 /* Reconfigures 'netdev->tc' according to 'details', performing any
224 * required Netlink calls to complete the reconfiguration.
226 * The contents of 'details' should be documented as valid for 'ovs_name'
227 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
228 * (which is built as ovs-vswitchd.conf.db(8)).
230 * This function may be null if 'tc' is not configurable.
232 int (*qdisc_set)(struct netdev *, const struct shash *details);
234 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
235 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
237 * The contents of 'details' should be documented as valid for 'ovs_name'
238 * in the "other_config" column in the "Queue" table in
239 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
241 * The implementation should not need to perform any Netlink calls, because
242 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
243 * cached the queue configuration.
245 * This function may be null if 'tc' does not have queues ('n_queues' is
247 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
248 struct shash *details);
250 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
251 * 'details', performing any required Netlink calls to complete the
252 * reconfiguration. The caller ensures that 'queue_id' is less than
255 * The contents of 'details' should be documented as valid for 'ovs_name'
256 * in the "other_config" column in the "Queue" table in
257 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
259 * This function may be null if 'tc' does not have queues or its queues are
260 * not configurable. */
261 int (*class_set)(struct netdev *, unsigned int queue_id,
262 const struct shash *details);
264 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
265 * tc_queue's within 'netdev->tc->queues'.
267 * This function may be null if 'tc' does not have queues or its queues
268 * cannot be deleted. */
269 int (*class_delete)(struct netdev *, struct tc_queue *queue);
271 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
272 * 'struct tc_queue's within 'netdev->tc->queues'.
274 * On success, initializes '*stats'.
276 * This function may be null if 'tc' does not have queues or if it cannot
277 * report queue statistics. */
278 int (*class_get_stats)(const struct netdev *netdev,
279 const struct tc_queue *queue,
280 struct netdev_queue_stats *stats);
282 /* Extracts queue stats from 'nlmsg', which is a response to a
283 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
285 * This function may be null if 'tc' does not have queues or if it cannot
286 * report queue statistics. */
287 int (*class_dump_stats)(const struct netdev *netdev,
288 const struct ofpbuf *nlmsg,
289 netdev_dump_queue_stats_cb *cb, void *aux);
293 tc_init(struct tc *tc, const struct tc_ops *ops)
296 hmap_init(&tc->queues);
300 tc_destroy(struct tc *tc)
302 hmap_destroy(&tc->queues);
305 static const struct tc_ops tc_ops_htb;
306 static const struct tc_ops tc_ops_hfsc;
307 static const struct tc_ops tc_ops_default;
308 static const struct tc_ops tc_ops_other;
310 static const struct tc_ops *tcs[] = {
311 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
312 &tc_ops_hfsc, /* Hierarchical fair service curve. */
313 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
314 &tc_ops_other, /* Some other qdisc. */
318 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
319 static unsigned int tc_get_major(unsigned int handle);
320 static unsigned int tc_get_minor(unsigned int handle);
322 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
323 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
324 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
326 static struct tcmsg *tc_make_request(const struct netdev *, int type,
327 unsigned int flags, struct ofpbuf *);
328 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
330 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
331 struct nlattr **options);
332 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
333 struct nlattr **options,
334 struct netdev_queue_stats *);
335 static int tc_query_class(const struct netdev *,
336 unsigned int handle, unsigned int parent,
337 struct ofpbuf **replyp);
338 static int tc_delete_class(const struct netdev *, unsigned int handle);
340 static int tc_del_qdisc(struct netdev *netdev);
341 static int tc_query_qdisc(const struct netdev *netdev);
343 static int tc_calc_cell_log(unsigned int mtu);
344 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
345 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
346 const struct tc_ratespec *rate);
347 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
349 struct netdev_dev_linux {
350 struct netdev_dev netdev_dev;
352 struct shash_node *shash_node;
353 unsigned int cache_valid;
354 unsigned int change_seq;
356 bool miimon; /* Link status of last poll. */
357 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
358 struct timer miimon_timer;
360 /* The following are figured out "on demand" only. They are only valid
361 * when the corresponding VALID_* bit in 'cache_valid' is set. */
363 uint8_t etheraddr[ETH_ADDR_LEN];
364 struct in_addr address, netmask;
368 long long int carrier_resets;
369 uint32_t kbits_rate; /* Policing data. */
370 uint32_t kbits_burst;
371 bool have_vport_stats;
375 struct tap_state tap;
379 struct netdev_linux {
380 struct netdev netdev;
384 /* Sockets used for ioctl operations. */
385 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
387 /* A Netlink routing socket that is not subscribed to any multicast groups. */
388 static struct nl_sock *rtnl_sock;
390 /* This is set pretty low because we probably won't learn anything from the
391 * additional log messages. */
392 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
394 static int netdev_linux_init(void);
396 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
397 int cmd, const char *cmd_name);
398 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
399 const char *cmd_name);
400 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
401 int cmd, const char *cmd_name);
402 static int get_flags(const struct netdev *, int *flagsp);
403 static int set_flags(struct netdev *, int flags);
404 static int do_get_ifindex(const char *netdev_name);
405 static int get_ifindex(const struct netdev *, int *ifindexp);
406 static int do_set_addr(struct netdev *netdev,
407 int ioctl_nr, const char *ioctl_name,
408 struct in_addr addr);
409 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
410 static int set_etheraddr(const char *netdev_name, int hwaddr_family,
411 const uint8_t[ETH_ADDR_LEN]);
412 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
413 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
414 static int get_carrier_via_sysfs(const char *name, bool *carrier);
415 static int af_packet_sock(void);
416 static void netdev_linux_miimon_run(void);
417 static void netdev_linux_miimon_wait(void);
/* Reports whether 'netdev_class' has netdev_linux_init as its 'init' hook.
 * The cast helpers in this file assert on this before downcasting. */
420 is_netdev_linux_class(const struct netdev_class *netdev_class)
422 return netdev_class->init == netdev_linux_init;
/* Downcasts the generic 'netdev_dev' to the struct netdev_dev_linux that
 * embeds it as its 'netdev_dev' member.  Asserts that the device's class
 * passes is_netdev_linux_class() before converting. */
425 static struct netdev_dev_linux *
426 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
428 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
429 assert(is_netdev_linux_class(netdev_class));
431 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
/* Downcasts the generic 'netdev' to the struct netdev_linux that embeds it as
 * its 'netdev' member.  Asserts that the class of the netdev's underlying
 * device passes is_netdev_linux_class() before converting. */
434 static struct netdev_linux *
435 netdev_linux_cast(const struct netdev *netdev)
437 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
438 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
439 assert(is_netdev_linux_class(netdev_class));
441 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
445 netdev_linux_init(void)
447 static int status = -1;
449 /* Create AF_INET socket. */
450 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
451 status = af_inet_sock >= 0 ? 0 : errno;
453 VLOG_ERR("failed to create inet socket: %s", strerror(status));
456 /* Create rtnetlink socket. */
458 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
460 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
/* Periodic "run" hook: calls rtnetlink_link_run() and then
 * netdev_linux_miimon_run(). */
469 netdev_linux_run(void)
471 rtnetlink_link_run();
472 netdev_linux_miimon_run();
/* "Wait" counterpart of netdev_linux_run(): calls rtnetlink_link_wait() and
 * then netdev_linux_miimon_wait(). */
476 netdev_linux_wait(void)
478 rtnetlink_link_wait();
479 netdev_linux_miimon_wait();
/* Marks 'dev' as changed.  Clearing 'cache_valid' drops every VALID_* bit, so
 * the properties that are cached "on demand" are treated as stale.
 * NOTE(review): the handling of 'change_seq' is only partly visible in this
 * excerpt -- confirm against the full function. */
483 netdev_dev_linux_changed(struct netdev_dev_linux *dev)
486 if (!dev->change_seq) {
489 dev->cache_valid = 0;
493 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
494 void *aux OVS_UNUSED)
496 struct netdev_dev_linux *dev;
498 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
500 const struct netdev_class *netdev_class =
501 netdev_dev_get_class(base_dev);
503 if (is_netdev_linux_class(netdev_class)) {
504 dev = netdev_dev_linux_cast(base_dev);
506 if (dev->carrier != change->running) {
507 dev->carrier = change->running;
508 dev->carrier_resets++;
511 netdev_dev_linux_changed(dev);
515 struct shash device_shash;
516 struct shash_node *node;
518 shash_init(&device_shash);
519 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
520 SHASH_FOR_EACH (node, &device_shash) {
525 get_carrier_via_sysfs(node->name, &carrier);
526 if (dev->carrier != carrier) {
527 dev->carrier = carrier;
528 dev->carrier_resets++;
531 netdev_dev_linux_changed(dev);
533 shash_destroy(&device_shash);
537 /* Creates system and internal devices. */
539 netdev_linux_create(const struct netdev_class *class, const char *name,
540 struct netdev_dev **netdev_devp)
542 struct netdev_dev_linux *netdev_dev;
544 if (!cache_notifier_refcount) {
545 assert(!netdev_linux_cache_notifier);
547 netdev_linux_cache_notifier =
548 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
550 if (!netdev_linux_cache_notifier) {
554 cache_notifier_refcount++;
556 netdev_dev = xzalloc(sizeof *netdev_dev);
557 netdev_dev->change_seq = 1;
558 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
559 get_carrier_via_sysfs(name, &netdev_dev->carrier);
561 *netdev_devp = &netdev_dev->netdev_dev;
565 /* For most types of netdevs we open the device for each call of
566 * netdev_open(). However, this is not the case with tap devices,
567 * since it is only possible to open the device once. In this
568 * situation we share a single file descriptor, and consequently
569 * buffers, across all readers. Therefore once data is read it will
570 * be unavailable to other reads for tap devices. */
572 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
573 const char *name, struct netdev_dev **netdev_devp)
575 struct netdev_dev_linux *netdev_dev;
576 struct tap_state *state;
577 static const char tap_dev[] = "/dev/net/tun";
581 netdev_dev = xzalloc(sizeof *netdev_dev);
582 state = &netdev_dev->state.tap;
584 /* Open tap device. */
585 state->fd = open(tap_dev, O_RDWR);
588 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
592 /* Create tap device. */
593 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
594 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
595 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
596 VLOG_WARN("%s: creating tap device failed: %s", name,
602 /* Make non-blocking. */
603 error = set_nonblocking(state->fd);
608 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
609 *netdev_devp = &netdev_dev->netdev_dev;
618 destroy_tap(struct netdev_dev_linux *netdev_dev)
620 struct tap_state *state = &netdev_dev->state.tap;
622 if (state->fd >= 0) {
627 /* Destroys the netdev device 'netdev_dev_'. */
629 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
631 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
632 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
634 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
635 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
638 if (class == &netdev_linux_class || class == &netdev_internal_class) {
639 cache_notifier_refcount--;
641 if (!cache_notifier_refcount) {
642 assert(netdev_linux_cache_notifier);
643 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
644 netdev_linux_cache_notifier = NULL;
646 } else if (class == &netdev_tap_class) {
647 destroy_tap(netdev_dev);
656 netdev_linux_open(struct netdev_dev *netdev_dev_, struct netdev **netdevp)
658 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
659 struct netdev_linux *netdev;
660 enum netdev_flags flags;
663 /* Allocate network device. */
664 netdev = xzalloc(sizeof *netdev);
666 netdev_init(&netdev->netdev, netdev_dev_);
668 /* Verify that the device really exists, by attempting to read its flags.
669 * (The flags might be cached, in which case this won't actually do an
672 * Don't do this for "internal" netdevs, though, because those have to be
673 * created as netdev objects before they exist in the kernel, because
674 * creating them in the kernel happens by passing a netdev object to
675 * dpif_port_add(). */
676 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
677 error = netdev_get_flags(&netdev->netdev, &flags);
678 if (error == ENODEV) {
683 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
684 !netdev_dev->state.tap.opened) {
686 /* We assume that the first user of the tap device is the primary user
687 * and give them the tap FD. Subsequent users probably just expect
688 * this to be a system device so open it normally to avoid send/receive
689 * directions appearing to be reversed. */
690 netdev->fd = netdev_dev->state.tap.fd;
691 netdev_dev->state.tap.opened = true;
694 *netdevp = &netdev->netdev;
698 netdev_uninit(&netdev->netdev, true);
702 /* Closes and destroys 'netdev'. */
704 netdev_linux_close(struct netdev *netdev_)
706 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* The rest of this file treats a negative fd as "no descriptor" (it tests
 * 'fd < 0' and 'fd >= 0'), so 0 has to count as a valid descriptor here too.
 * Netdevs of type "tap" are excluded by the strcmp() test. */
708 if (netdev->fd >= 0 && strcmp(netdev_get_type(netdev_), "tap")) {
715 netdev_linux_listen(struct netdev *netdev_)
717 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
718 struct sockaddr_ll sll;
723 if (netdev->fd >= 0) {
727 /* Create file descriptor. */
728 fd = socket(PF_PACKET, SOCK_RAW, 0);
731 VLOG_ERR("failed to create raw socket (%s)", strerror(error));
735 /* Set non-blocking mode. */
736 error = set_nonblocking(fd);
741 /* Get ethernet device index. */
742 error = get_ifindex(&netdev->netdev, &ifindex);
747 /* Bind to specific ethernet device. */
748 memset(&sll, 0, sizeof sll);
749 sll.sll_family = AF_PACKET;
750 sll.sll_ifindex = ifindex;
751 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
752 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
754 VLOG_ERR("%s: failed to bind raw socket (%s)",
755 netdev_get_name(netdev_), strerror(error));
/* Receives a packet from 'netdev_' by read()ing the netdev's file descriptor
 * into the 'size'-byte buffer at 'data'.  A netdev whose fd is negative is not
 * listening.  A failed read is logged with a rate-limited warning unless errno
 * is EINTR or EAGAIN. */
770 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
772 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
774 if (netdev->fd < 0) {
775 /* Device is not listening. */
780 ssize_t retval = read(netdev->fd, data, size);
783 } else if (errno != EINTR) {
784 if (errno != EAGAIN) {
/* Arguments follow the format string: device name first, then the
 * error text. */
785 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
786 netdev_get_name(netdev_), strerror(errno));
793 /* Registers with the poll loop to wake up from the next call to poll_block()
794 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
796 netdev_linux_recv_wait(struct netdev *netdev_)
798 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
799 if (netdev->fd >= 0) {
800 poll_fd_wait(netdev->fd, POLLIN);
804 /* Discards all packets waiting to be received from 'netdev'. */
806 netdev_linux_drain(struct netdev *netdev_)
808 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
809 if (netdev->fd < 0) {
811 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
813 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
814 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
818 drain_fd(netdev->fd, ifr.ifr_qlen);
821 return drain_rcvbuf(netdev->fd);
825 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
826 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
827 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
828 * the packet is too big or too small to transmit on the device.
830 * The caller retains ownership of 'buffer' in all cases.
832 * The kernel maintains a packet transmission queue, so the caller is not
833 * expected to do additional queuing of packets. */
835 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
837 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
841 if (netdev->fd < 0) {
842 /* Use our AF_PACKET socket to send to this device. */
843 struct sockaddr_ll sll;
850 sock = af_packet_sock();
855 error = get_ifindex(netdev_, &ifindex);
860 /* We don't bother setting most fields in sockaddr_ll because the
861 * kernel ignores them for SOCK_RAW. */
862 memset(&sll, 0, sizeof sll);
863 sll.sll_family = AF_PACKET;
864 sll.sll_ifindex = ifindex;
866 iov.iov_base = (void *) data;
870 msg.msg_namelen = sizeof sll;
873 msg.msg_control = NULL;
874 msg.msg_controllen = 0;
877 retval = sendmsg(sock, &msg, 0);
879 /* Use the netdev's own fd to send to this device. This is
880 * essential for tap devices, because packets sent to a tap device
881 * with an AF_PACKET socket will loop back to be *received* again
882 * on the tap device. */
883 retval = write(netdev->fd, data, size);
887 /* The Linux AF_PACKET implementation never blocks waiting for room
888 * for packets, instead returning ENOBUFS. Translate this into
889 * EAGAIN for the caller. */
890 if (errno == ENOBUFS) {
892 } else if (errno == EINTR) {
894 } else if (errno != EAGAIN) {
895 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
896 netdev_get_name(netdev_), strerror(errno));
899 } else if (retval != size) {
900 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
901 "%zu) on %s", retval, size, netdev_get_name(netdev_));
909 /* Registers with the poll loop to wake up from the next call to poll_block()
910 * when the packet transmission queue has sufficient room to transmit a packet
911 * with netdev_send().
913 * The kernel maintains a packet transmission queue, so the client is not
914 * expected to do additional queuing of packets. Thus, this function is
915 * unlikely to ever be used. It is included for completeness. */
917 netdev_linux_send_wait(struct netdev *netdev_)
919 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
920 if (netdev->fd < 0) {
922 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
923 poll_fd_wait(netdev->fd, POLLOUT);
925 /* TAP device always accepts packets.*/
926 poll_immediate_wake();
930 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
931 * otherwise a positive errno value. */
933 netdev_linux_set_etheraddr(struct netdev *netdev_,
934 const uint8_t mac[ETH_ADDR_LEN])
936 struct netdev_dev_linux *netdev_dev =
937 netdev_dev_linux_cast(netdev_get_dev(netdev_));
940 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
941 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
942 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
944 netdev_dev->cache_valid |= VALID_ETHERADDR;
945 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
953 /* Copies 'netdev''s MAC address into 'mac', a buffer of ETH_ADDR_LEN bytes
954 * supplied by the caller. */
956 netdev_linux_get_etheraddr(const struct netdev *netdev_,
957 uint8_t mac[ETH_ADDR_LEN])
959 struct netdev_dev_linux *netdev_dev =
960 netdev_dev_linux_cast(netdev_get_dev(netdev_));
961 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
962 int error = get_etheraddr(netdev_get_name(netdev_),
963 netdev_dev->etheraddr);
967 netdev_dev->cache_valid |= VALID_ETHERADDR;
969 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
973 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
974 * in bytes, not including the hardware header; thus, this is typically 1500
975 * bytes for Ethernet devices. */
977 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
979 struct netdev_dev_linux *netdev_dev =
980 netdev_dev_linux_cast(netdev_get_dev(netdev_));
981 if (!(netdev_dev->cache_valid & VALID_MTU)) {
985 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
986 SIOCGIFMTU, "SIOCGIFMTU");
990 netdev_dev->mtu = ifr.ifr_mtu;
991 netdev_dev->cache_valid |= VALID_MTU;
993 *mtup = netdev_dev->mtu;
997 /* Sets the maximum size of transmitted packets (MTU) for the given device
998 * using the Linux networking ioctl interface.
1001 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1003 struct netdev_dev_linux *netdev_dev =
1004 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1009 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1010 SIOCSIFMTU, "SIOCSIFMTU");
1015 netdev_dev->mtu = ifr.ifr_mtu;
1016 netdev_dev->cache_valid |= VALID_MTU;
1020 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1021 * On failure, returns a negative errno value. */
1023 netdev_linux_get_ifindex(const struct netdev *netdev)
1027 error = get_ifindex(netdev, &ifindex);
1028 return error ? -error : ifindex;
/* Stores the link state of 'netdev_' in '*carrier'.  When MII monitoring is
 * enabled ('miimon_interval' > 0) the value is the link status recorded by the
 * last MII poll; otherwise it is the device's 'carrier' field. */
1032 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1034 struct netdev_dev_linux *netdev_dev =
1035 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1037 if (netdev_dev->miimon_interval > 0) {
1038 *carrier = netdev_dev->miimon;
1040 *carrier = netdev_dev->carrier;
/* Returns the 'carrier_resets' counter of 'netdev''s device.  The rtnetlink
 * cache callback in this file increments that counter each time it sees the
 * device's carrier state change. */
1046 static long long int
1047 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1049 return netdev_dev_linux_cast(netdev_get_dev(netdev))->carrier_resets;
1053 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1054 struct mii_ioctl_data *data)
1059 memset(&ifr, 0, sizeof ifr);
1060 memcpy(&ifr.ifr_data, data, sizeof *data);
1061 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1062 memcpy(data, &ifr.ifr_data, sizeof *data);
1068 netdev_linux_get_miimon(const char *name, bool *miimon)
1070 struct mii_ioctl_data data;
1075 memset(&data, 0, sizeof data);
1076 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1078 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1079 data.reg_num = MII_BMSR;
1080 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1084 *miimon = !!(data.val_out & BMSR_LSTATUS);
1086 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1089 struct ethtool_cmd ecmd;
1091 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1094 memset(&ecmd, 0, sizeof ecmd);
1095 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1098 struct ethtool_value eval;
1100 memcpy(&eval, &ecmd, sizeof eval);
1101 *miimon = !!eval.data;
1103 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
/* Sets the MII monitoring poll interval of 'netdev_''s device to 'interval'.
 * A positive 'interval' is raised to at least 100; any other value becomes 0,
 * which disables MII monitoring.  (Units are presumably milliseconds --
 * TODO confirm against the timer API.) */
1111 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1112 long long int interval)
1114 struct netdev_dev_linux *netdev_dev;
1116 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
1118 interval = interval > 0 ? MAX(interval, 100) : 0;
1119 if (netdev_dev->miimon_interval != interval) {
1120 netdev_dev->miimon_interval = interval;
/* Expire the timer so that the new interval takes effect at the next
 * netdev_linux_miimon_run() rather than after the old one elapses. */
1121 timer_set_expired(&netdev_dev->miimon_timer);
/* Polls MII link status for each device of class netdev_linux_class that has
 * MII monitoring enabled and whose poll timer has expired.  When the polled
 * status differs from the stored one, the new status is saved and the device
 * is marked as changed.  The timer is then re-armed with the device's
 * interval. */
1128 netdev_linux_miimon_run(void)
1130 struct shash device_shash;
1131 struct shash_node *node;
1133 shash_init(&device_shash);
1134 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1135 SHASH_FOR_EACH (node, &device_shash) {
1136 struct netdev_dev_linux *dev = node->data;
/* Monitoring disabled, or not yet time for the next poll. */
1139 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1143 netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
1144 if (miimon != dev->miimon) {
1145 dev->miimon = miimon;
1146 netdev_dev_linux_changed(dev);
1149 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1152 shash_destroy(&device_shash);
/* Calls timer_wait() on the MII poll timer of each device of class
 * netdev_linux_class that has MII monitoring enabled
 * ('miimon_interval' > 0). */
1156 netdev_linux_miimon_wait(void)
1158 struct shash device_shash;
1159 struct shash_node *node;
1161 shash_init(&device_shash);
1162 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1163 SHASH_FOR_EACH (node, &device_shash) {
1164 struct netdev_dev_linux *dev = node->data;
1166 if (dev->miimon_interval > 0) {
1167 timer_wait(&dev->miimon_timer);
1170 shash_destroy(&device_shash);
1173 /* Check whether we can use RTM_GETLINK to get network device statistics.
1174 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1177 check_for_working_netlink_stats(void)
1179 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1180 * preferable, so if that works, we'll use it. */
1181 int ifindex = do_get_ifindex("lo");
1183 VLOG_WARN("failed to get ifindex for lo, "
1184 "obtaining netdev stats from proc");
1187 struct netdev_stats stats;
1188 int error = get_stats_via_netlink(ifindex, &stats);
1190 VLOG_DBG("obtaining netdev stats via rtnetlink");
1193 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1194 "via proc (you are probably running a pre-2.6.19 "
1195 "kernel)", strerror(error));
/* NOTE(review): only the signature is visible in this excerpt; presumably
 * exchanges the two 64-bit values that a and b point to.  It is called by
 * netdev_pseudo_get_stats() on pairs of rx and tx counters. */
1202 swap_uint64(uint64_t *a, uint64_t *b)
/* Tries to fill stats from the vport layer with netdev_vport_get_stats().
 *
 * The attempt is made when have_vport_stats is already set, or when it has
 * not been determined yet (VALID_HAVE_VPORT_STATS clear in cache_valid).
 * Afterwards have_vport_stats is true exactly when the call returned 0 and
 * VALID_HAVE_VPORT_STATS is set, so callers can test have_vport_stats.  A
 * rate-limited warning reports the failure code. */
1210 get_stats_via_vport(const struct netdev *netdev_,
1211 struct netdev_stats *stats)
1213 struct netdev_dev_linux *netdev_dev =
1214 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1216 if (netdev_dev->have_vport_stats ||
1217 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1220 error = netdev_vport_get_stats(netdev_, stats);
1222 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed %d",
1223 netdev_get_name(netdev_), error);
1225 netdev_dev->have_vport_stats = !error;
1226 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
/* Fetches kernel statistics for netdev_ into stats.
 *
 * Whether rtnetlink can be used is probed once with
 * check_for_working_netlink_stats() and remembered in a function-local
 * static (-1 means not probed yet).  With rtnetlink the ifindex of the
 * device is looked up and passed to get_stats_via_netlink(); otherwise
 * get_stats_via_proc() is called with the device name.  A rate-limited
 * warning reports the failure code. */
1231 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1232 struct netdev_stats *stats)
1234 static int use_netlink_stats = -1;
1237 if (use_netlink_stats < 0) {
1238 use_netlink_stats = check_for_working_netlink_stats();
1241 if (use_netlink_stats) {
1244 error = get_ifindex(netdev_, &ifindex);
1246 error = get_stats_via_netlink(ifindex, stats);
1249 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1253 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1254 netdev_get_name(netdev_), error);
/* Implementation outline (several lines of this function are not visible in
 * this excerpt): get_stats_via_vport() is asked to fill stats, then
 * netdev_linux_sys_get_stats() fills a local dev_stats.  Two visible branches
 * test have_vport_stats.  The error and drop counters, multicast, collisions
 * and the detailed rx and tx error counters of dev_stats are added onto the
 * corresponding fields of stats; which branch those additions belong to
 * cannot be told from here. */
1260 /* Retrieves current device stats for 'netdev-linux'. */
1262 netdev_linux_get_stats(const struct netdev *netdev_,
1263 struct netdev_stats *stats)
1265 struct netdev_dev_linux *netdev_dev =
1266 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1267 struct netdev_stats dev_stats;
1270 get_stats_via_vport(netdev_, stats);
1272 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1275 if (!netdev_dev->have_vport_stats) {
1282 if (!netdev_dev->have_vport_stats) {
1283 /* stats not available from OVS then use ioctl stats. */
1286 stats->rx_errors += dev_stats.rx_errors;
1287 stats->tx_errors += dev_stats.tx_errors;
1288 stats->rx_dropped += dev_stats.rx_dropped;
1289 stats->tx_dropped += dev_stats.tx_dropped;
1290 stats->multicast += dev_stats.multicast;
1291 stats->collisions += dev_stats.collisions;
1292 stats->rx_length_errors += dev_stats.rx_length_errors;
1293 stats->rx_over_errors += dev_stats.rx_over_errors;
1294 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1295 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1296 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1297 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1298 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1299 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1300 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1301 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1302 stats->tx_window_errors += dev_stats.tx_window_errors;
/* Implementation outline (several lines of this function are not visible in
 * this excerpt): stats is first filled by get_stats_via_vport() and a local
 * dev_stats by netdev_linux_sys_get_stats().  The visible statements swap the
 * rx and tx packet, byte, error and drop counters of stats, zero the detailed
 * rx and tx error counters, and add the dev_stats drop and error counters
 * crosswise (device tx onto stats rx and the reverse) together with multicast
 * and collisions.  Which have_vport_stats branch each group belongs to cannot
 * be told from here. */
1307 /* Retrieves current device stats for 'netdev-tap' netdev or
1308 * netdev-internal. */
1310 netdev_pseudo_get_stats(const struct netdev *netdev_,
1311 struct netdev_stats *stats)
1313 struct netdev_dev_linux *netdev_dev =
1314 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1315 struct netdev_stats dev_stats;
1318 get_stats_via_vport(netdev_, stats);
1320 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1322 if (!netdev_dev->have_vport_stats) {
1329 /* If this port is an internal port then the transmit and receive stats
1330 * will appear to be swapped relative to the other ports since we are the
1331 * one sending the data, not a remote computer. For consistency, we swap
1332 * them back here. This does not apply if we are getting stats from the
1333 * vport layer because it always tracks stats from the perspective of the
1335 if (!netdev_dev->have_vport_stats) {
1337 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1338 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1339 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1340 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1341 stats->rx_length_errors = 0;
1342 stats->rx_over_errors = 0;
1343 stats->rx_crc_errors = 0;
1344 stats->rx_frame_errors = 0;
1345 stats->rx_fifo_errors = 0;
1346 stats->rx_missed_errors = 0;
1347 stats->tx_aborted_errors = 0;
1348 stats->tx_carrier_errors = 0;
1349 stats->tx_fifo_errors = 0;
1350 stats->tx_heartbeat_errors = 0;
1351 stats->tx_window_errors = 0;
1353 stats->rx_dropped += dev_stats.tx_dropped;
1354 stats->tx_dropped += dev_stats.rx_dropped;
1356 stats->rx_errors += dev_stats.tx_errors;
1357 stats->tx_errors += dev_stats.rx_errors;
1359 stats->multicast += dev_stats.multicast;
1360 stats->collisions += dev_stats.collisions;
/* Implementation note: everything is derived from a single ETHTOOL_GSET
 * query.  Each SUPPORTED_* bit of ecmd.supported and each ADVERTISED_* bit of
 * ecmd.advertising is translated to the matching OFPPF_* bit.  The current
 * value is built from ecmd.speed and ecmd.duplex (a 10G speed is always
 * reported as full duplex), then the port type (copper or fiber) and
 * OFPPF_AUTONEG are OR-ed in; the condition guarding OFPPF_AUTONEG is not
 * visible in this excerpt.  The peer value is always set to 0. */
1365 /* Stores the features supported by 'netdev' into each of '*current',
1366 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1367 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
1368 * successful, otherwise a positive errno value. */
1370 netdev_linux_get_features(const struct netdev *netdev,
1371 uint32_t *current, uint32_t *advertised,
1372 uint32_t *supported, uint32_t *peer)
1374 struct ethtool_cmd ecmd;
1377 memset(&ecmd, 0, sizeof ecmd);
1378 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1379 ETHTOOL_GSET, "ETHTOOL_GSET");
1384 /* Supported features. */
1386 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1387 *supported |= OFPPF_10MB_HD;
1389 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1390 *supported |= OFPPF_10MB_FD;
1392 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1393 *supported |= OFPPF_100MB_HD;
1395 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1396 *supported |= OFPPF_100MB_FD;
1398 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1399 *supported |= OFPPF_1GB_HD;
1401 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1402 *supported |= OFPPF_1GB_FD;
1404 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1405 *supported |= OFPPF_10GB_FD;
1407 if (ecmd.supported & SUPPORTED_TP) {
1408 *supported |= OFPPF_COPPER;
1410 if (ecmd.supported & SUPPORTED_FIBRE) {
1411 *supported |= OFPPF_FIBER;
1413 if (ecmd.supported & SUPPORTED_Autoneg) {
1414 *supported |= OFPPF_AUTONEG;
1416 if (ecmd.supported & SUPPORTED_Pause) {
1417 *supported |= OFPPF_PAUSE;
1419 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1420 *supported |= OFPPF_PAUSE_ASYM;
1423 /* Advertised features. */
1425 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1426 *advertised |= OFPPF_10MB_HD;
1428 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1429 *advertised |= OFPPF_10MB_FD;
1431 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1432 *advertised |= OFPPF_100MB_HD;
1434 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1435 *advertised |= OFPPF_100MB_FD;
1437 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1438 *advertised |= OFPPF_1GB_HD;
1440 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1441 *advertised |= OFPPF_1GB_FD;
1443 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1444 *advertised |= OFPPF_10GB_FD;
1446 if (ecmd.advertising & ADVERTISED_TP) {
1447 *advertised |= OFPPF_COPPER;
1449 if (ecmd.advertising & ADVERTISED_FIBRE) {
1450 *advertised |= OFPPF_FIBER;
1452 if (ecmd.advertising & ADVERTISED_Autoneg) {
1453 *advertised |= OFPPF_AUTONEG;
1455 if (ecmd.advertising & ADVERTISED_Pause) {
1456 *advertised |= OFPPF_PAUSE;
1458 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1459 *advertised |= OFPPF_PAUSE_ASYM;
1462 /* Current settings. */
1463 if (ecmd.speed == SPEED_10) {
1464 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1465 } else if (ecmd.speed == SPEED_100) {
1466 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1467 } else if (ecmd.speed == SPEED_1000) {
1468 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1469 } else if (ecmd.speed == SPEED_10000) {
1470 *current = OFPPF_10GB_FD;
1475 if (ecmd.port == PORT_TP) {
1476 *current |= OFPPF_COPPER;
1477 } else if (ecmd.port == PORT_FIBRE) {
1478 *current |= OFPPF_FIBER;
1482 *current |= OFPPF_AUTONEG;
1485 /* Peer advertisements. */
1486 *peer = 0; /* XXX */
/* Implementation note: the current settings are fetched with ETHTOOL_GSET,
 * ecmd.advertising is cleared and rebuilt from the OFPPF_* bits set in
 * advertise, and the result is written back with ETHTOOL_SSET, whose return
 * value is returned.  Handling of an ETHTOOL_GSET failure is not visible in
 * this excerpt. */
1491 /* Set the features advertised by 'netdev' to 'advertise'. */
1493 netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1495 struct ethtool_cmd ecmd;
1498 memset(&ecmd, 0, sizeof ecmd);
1499 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1500 ETHTOOL_GSET, "ETHTOOL_GSET");
1505 ecmd.advertising = 0;
1506 if (advertise & OFPPF_10MB_HD) {
1507 ecmd.advertising |= ADVERTISED_10baseT_Half;
1509 if (advertise & OFPPF_10MB_FD) {
1510 ecmd.advertising |= ADVERTISED_10baseT_Full;
1512 if (advertise & OFPPF_100MB_HD) {
1513 ecmd.advertising |= ADVERTISED_100baseT_Half;
1515 if (advertise & OFPPF_100MB_FD) {
1516 ecmd.advertising |= ADVERTISED_100baseT_Full;
1518 if (advertise & OFPPF_1GB_HD) {
1519 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1521 if (advertise & OFPPF_1GB_FD) {
1522 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1524 if (advertise & OFPPF_10GB_FD) {
1525 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1527 if (advertise & OFPPF_COPPER) {
1528 ecmd.advertising |= ADVERTISED_TP;
1530 if (advertise & OFPPF_FIBER) {
1531 ecmd.advertising |= ADVERTISED_FIBRE;
1533 if (advertise & OFPPF_AUTONEG) {
1534 ecmd.advertising |= ADVERTISED_Autoneg;
1536 if (advertise & OFPPF_PAUSE) {
1537 ecmd.advertising |= ADVERTISED_Pause;
1539 if (advertise & OFPPF_PAUSE_ASYM) {
1540 ecmd.advertising |= ADVERTISED_Asym_Pause;
1542 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1543 ETHTOOL_SSET, "ETHTOOL_SSET");
/* Command templates that netdev_linux_set_policing() formats with snprintf()
 * and runs through system().  POLICE_ADD_CMD expects the device name.
 * POLICE_CONFIG_CMD expects the device name, the rate (formatted as kbit)
 * and the burst (formatted with a k suffix), in that order. */
1546 #define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress"
1547 #define POLICE_CONFIG_CMD "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 u32 match ip src 0.0.0.0/0 police rate %dkbit burst %dk mtu 65535 drop flowid :1"
/* NOTE(review): the end of the comment below is missing from this excerpt.
 * Visible behavior: an RTM_DELQDISC request for handle ffff:0 with parent
 * TC_H_INGRESS and kind ingress is sent with tc_transact().  Errors other
 * than ENOENT and EINVAL produce a rate-limited warning.  The cached
 * kbits_rate and kbits_burst are set to 0 and VALID_POLICING is set in
 * cache_valid. */
1549 /* Remove ingress policing from 'netdev'. Returns 0 if successful, otherwise a
1550 * positive errno value.
1552 * This function is equivalent to running
1553 * /sbin/tc qdisc del dev %s handle ffff: ingress
1554 * but it is much, much faster.
1557 netdev_linux_remove_policing(struct netdev *netdev)
1559 struct netdev_dev_linux *netdev_dev =
1560 netdev_dev_linux_cast(netdev_get_dev(netdev));
1561 const char *netdev_name = netdev_get_name(netdev);
1563 struct ofpbuf request;
1564 struct tcmsg *tcmsg;
1567 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
1571 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
1572 tcmsg->tcm_parent = TC_H_INGRESS;
1573 nl_msg_put_string(&request, TCA_KIND, "ingress");
1574 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
1576 error = tc_transact(&request, NULL);
1577 if (error && error != ENOENT && error != EINVAL) {
1578 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1579 netdev_name, strerror(error));
1583 netdev_dev->kbits_rate = 0;
1584 netdev_dev->kbits_burst = 0;
1585 netdev_dev->cache_valid |= VALID_POLICING;
/* Implementation note: a zero kbits_rate forces kbits_burst to 0, and a zero
 * kbits_burst with a nonzero rate defaults to 1000.  When the cached values
 * (VALID_POLICING set) already match the request, the cached-settings branch
 * is taken.  Otherwise any existing policing is removed, the POLICE_ADD_CMD
 * and POLICE_CONFIG_CMD shell commands are run, and the new values are
 * cached.  The conditions guarding the commands are not fully visible in
 * this excerpt. */
1589 /* Attempts to set input rate limiting (policing) policy. */
1591 netdev_linux_set_policing(struct netdev *netdev,
1592 uint32_t kbits_rate, uint32_t kbits_burst)
1594 struct netdev_dev_linux *netdev_dev =
1595 netdev_dev_linux_cast(netdev_get_dev(netdev));
1596 const char *netdev_name = netdev_get_name(netdev);
1599 COVERAGE_INC(netdev_set_policing);
1601 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1602 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1603 : kbits_burst); /* Stick with user-specified value. */
1605 if (netdev_dev->cache_valid & VALID_POLICING
1606 && netdev_dev->kbits_rate == kbits_rate
1607 && netdev_dev->kbits_burst == kbits_burst) {
1608 /* Assume that settings haven't changed since we last set them. */
1612 netdev_linux_remove_policing(netdev);
/* NOTE(review): netdev_name is interpolated into a shell command line that
 * is passed to system(); confirm that device names cannot contain shell
 * metacharacters.  The uint32_t rate and burst are formatted with %d. */
1614 snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
1615 if (system(command) != 0) {
1616 VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
1620 snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
1621 kbits_rate, kbits_burst);
1622 if (system(command) != 0) {
1623 VLOG_WARN_RL(&rl, "%s: problem configuring policing",
1628 netdev_dev->kbits_rate = kbits_rate;
1629 netdev_dev->kbits_burst = kbits_burst;
1630 netdev_dev->cache_valid |= VALID_POLICING;
/* Collects the names of the QoS types that can be installed: every entry of
 * the NULL-terminated tcs array that has a tc_install hook and a non-empty
 * ovs_name has its ovs_name added to types.  The netdev argument is
 * unused. */
1637 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1640 const struct tc_ops **opsp;
1642 for (opsp = tcs; *opsp != NULL; opsp++) {
1643 const struct tc_ops *ops = *opsp;
1644 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1645 sset_add(types, ops->ovs_name);
/* Scans the NULL-terminated tcs array for the tc_ops whose ovs_name equals
 * name.  The return statements are not visible in this excerpt (presumably
 * the match is returned, or NULL when there is none). */
1651 static const struct tc_ops *
1652 tc_lookup_ovs_name(const char *name)
1654 const struct tc_ops **opsp;
1656 for (opsp = tcs; *opsp != NULL; opsp++) {
1657 const struct tc_ops *ops = *opsp;
1658 if (!strcmp(name, ops->ovs_name)) {
/* Scans the NULL-terminated tcs array for the tc_ops whose linux_name equals
 * name.  Entries with a null linux_name never match.  The return statements
 * are not visible in this excerpt (presumably the match is returned, or NULL
 * when there is none). */
1665 static const struct tc_ops *
1666 tc_lookup_linux_name(const char *name)
1668 const struct tc_ops **opsp;
1670 for (opsp = tcs; *opsp != NULL; opsp++) {
1671 const struct tc_ops *ops = *opsp;
1672 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Searches the queues hmap of the tc state of netdev, restricted to the
 * bucket selected by hash, for the tc_queue whose queue_id equals queue_id.
 * The third parameter declaration and the return statements are not visible
 * in this excerpt. */
1679 static struct tc_queue *
1680 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1683 struct netdev_dev_linux *netdev_dev =
1684 netdev_dev_linux_cast(netdev_get_dev(netdev));
1685 struct tc_queue *queue;
1687 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1688 if (queue->queue_id == queue_id) {
/* Finds the queue with the given queue_id on netdev by delegating to
 * tc_find_queue__() with hash_int(queue_id, 0) as the hash. */
1695 static struct tc_queue *
1696 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1698 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Looks up the tc_ops registered under the OVS name type and reports its
 * n_queues through caps->n_queues.  The netdev argument is unused.  Handling
 * of an unknown type is not visible in this excerpt. */
1702 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1704 struct netdev_qos_capabilities *caps)
1706 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1710 caps->n_queues = ops->n_queues;
/* Reports the QoS type currently configured on netdev.  After
 * tc_query_qdisc(), *typep is set to the ovs_name of the active tc_ops and,
 * when those ops provide a qdisc_get hook, that hook is called to fill
 * details.  The result for ops without qdisc_get is not visible in this
 * excerpt. */
1715 netdev_linux_get_qos(const struct netdev *netdev,
1716 const char **typep, struct shash *details)
1718 struct netdev_dev_linux *netdev_dev =
1719 netdev_dev_linux_cast(netdev_get_dev(netdev));
1722 error = tc_query_qdisc(netdev);
1727 *typep = netdev_dev->tc->ops->ovs_name;
1728 return (netdev_dev->tc->ops->qdisc_get
1729 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
/* Switches netdev to the QoS type named type, configured by details.
 *
 * type must name a tc_ops that has a tc_install hook.  When the requested ops
 * are the ones already active, only their qdisc_set hook (if any) is invoked
 * and 0 is returned when there is no such hook.  Otherwise the existing qdisc
 * is deleted with tc_del_qdisc() and the new one is installed through
 * tc_install.  The assertions check that the tc pointer is NULL after
 * deletion and is non-NULL exactly when installation succeeded. */
1734 netdev_linux_set_qos(struct netdev *netdev,
1735 const char *type, const struct shash *details)
1737 struct netdev_dev_linux *netdev_dev =
1738 netdev_dev_linux_cast(netdev_get_dev(netdev));
1739 const struct tc_ops *new_ops;
1742 new_ops = tc_lookup_ovs_name(type);
1743 if (!new_ops || !new_ops->tc_install) {
1747 error = tc_query_qdisc(netdev);
1752 if (new_ops == netdev_dev->tc->ops) {
1753 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1755 /* Delete existing qdisc. */
1756 error = tc_del_qdisc(netdev);
1760 assert(netdev_dev->tc == NULL);
1762 /* Install new qdisc. */
1763 error = new_ops->tc_install(netdev, details);
1764 assert((error == 0) == (netdev_dev->tc != NULL));
/* Retrieves the configuration of queue queue_id on netdev into details.
 * After tc_query_qdisc(), the queue is looked up with tc_find_queue() and
 * handed to the class_get hook of the active tc_ops.  The conditions and
 * error values around these calls are not fully visible in this excerpt. */
1771 netdev_linux_get_queue(const struct netdev *netdev,
1772 unsigned int queue_id, struct shash *details)
1774 struct netdev_dev_linux *netdev_dev =
1775 netdev_dev_linux_cast(netdev_get_dev(netdev));
1778 error = tc_query_qdisc(netdev);
1782 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1784 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
/* Configures queue queue_id on netdev from details through the class_set
 * hook of the active tc_ops and returns the result of that hook.  A separate
 * branch (its result is not visible in this excerpt) is taken when queue_id
 * is not below n_queues or when the ops have no class_set hook. */
1790 netdev_linux_set_queue(struct netdev *netdev,
1791 unsigned int queue_id, const struct shash *details)
1793 struct netdev_dev_linux *netdev_dev =
1794 netdev_dev_linux_cast(netdev_get_dev(netdev));
1797 error = tc_query_qdisc(netdev);
1800 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1801 || !netdev_dev->tc->ops->class_set) {
1805 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
/* Deletes queue queue_id from netdev through the class_delete hook of the
 * active tc_ops, after looking the queue up with tc_find_queue().  A missing
 * class_delete hook takes a separate branch whose result is not visible in
 * this excerpt. */
1809 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1811 struct netdev_dev_linux *netdev_dev =
1812 netdev_dev_linux_cast(netdev_get_dev(netdev));
1815 error = tc_query_qdisc(netdev);
1818 } else if (!netdev_dev->tc->ops->class_delete) {
1821 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1823 ? netdev_dev->tc->ops->class_delete(netdev, queue)
/* Retrieves statistics for queue queue_id on netdev into stats through the
 * class_get_stats hook of the active tc_ops, after looking the queue up with
 * tc_find_queue().  A missing class_get_stats hook takes a separate branch
 * whose result is not visible in this excerpt. */
1829 netdev_linux_get_queue_stats(const struct netdev *netdev,
1830 unsigned int queue_id,
1831 struct netdev_queue_stats *stats)
1833 struct netdev_dev_linux *netdev_dev =
1834 netdev_dev_linux_cast(netdev_get_dev(netdev));
1837 error = tc_query_qdisc(netdev);
1840 } else if (!netdev_dev->tc->ops->class_get_stats) {
1843 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1845 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
/* Starts a Netlink dump of the traffic classes of netdev: builds an
 * RTM_GETTCLASS request with tcm_parent 0, starts dump on rtnl_sock and
 * releases the request buffer.  The return statements are not visible in
 * this excerpt; netdev_linux_dump_queue_stats() tests the result as a
 * boolean. */
1851 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1853 struct ofpbuf request;
1854 struct tcmsg *tcmsg;
1856 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1860 tcmsg->tcm_parent = 0;
1861 nl_dump_start(dump, rtnl_sock, &request);
1862 ofpbuf_uninit(&request);
/* Invokes cb for the queues in the tc state of netdev, passing the queue_id,
 * the details obtained from the class_get hook, and aux.  The details shash
 * is cleared before each queue and destroyed at the end.  How a class_get
 * failure affects the callback and the return value is not visible in this
 * excerpt. */
1867 netdev_linux_dump_queues(const struct netdev *netdev,
1868 netdev_dump_queues_cb *cb, void *aux)
1870 struct netdev_dev_linux *netdev_dev =
1871 netdev_dev_linux_cast(netdev_get_dev(netdev));
1872 struct tc_queue *queue;
1873 struct shash details;
1877 error = tc_query_qdisc(netdev);
1880 } else if (!netdev_dev->tc->ops->class_get) {
1885 shash_init(&details);
1886 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
1887 shash_clear(&details);
1889 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
1891 (*cb)(queue->queue_id, &details, aux);
1896 shash_destroy(&details);
/* Dumps per-queue statistics for netdev.  Each message of the Netlink class
 * dump started by start_queue_dump() is handed to the class_dump_stats hook
 * of the active tc_ops together with cb and aux.  An error from
 * nl_dump_done() takes precedence in the return value; otherwise last_error
 * is returned (its assignment is not visible in this excerpt). */
1902 netdev_linux_dump_queue_stats(const struct netdev *netdev,
1903 netdev_dump_queue_stats_cb *cb, void *aux)
1905 struct netdev_dev_linux *netdev_dev =
1906 netdev_dev_linux_cast(netdev_get_dev(netdev));
1907 struct nl_dump dump;
1912 error = tc_query_qdisc(netdev);
1915 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1920 if (!start_queue_dump(netdev, &dump)) {
1923 while (nl_dump_next(&dump, &msg)) {
1924 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1930 error = nl_dump_done(&dump);
1931 return error ? error : last_error;
/* Reports the IPv4 address and netmask of netdev_ through *address and
 * *netmask.  On a cache miss (VALID_IN4 clear) both are fetched with the
 * SIOCGIFADDR and SIOCGIFNETMASK ioctls and cached.  The final return value
 * is EADDRNOTAVAIL when the address is INADDR_ANY and 0 otherwise; the
 * handling of ioctl failures is not visible in this excerpt. */
1935 netdev_linux_get_in4(const struct netdev *netdev_,
1936 struct in_addr *address, struct in_addr *netmask)
1938 struct netdev_dev_linux *netdev_dev =
1939 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1941 if (!(netdev_dev->cache_valid & VALID_IN4)) {
1944 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
1945 SIOCGIFADDR, "SIOCGIFADDR");
1950 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
1951 SIOCGIFNETMASK, "SIOCGIFNETMASK");
1956 netdev_dev->cache_valid |= VALID_IN4;
1958 *address = netdev_dev->address;
1959 *netmask = netdev_dev->netmask;
1960 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
/* Assigns address to netdev_ with SIOCSIFADDR.  The cached address and
 * netmask are updated and VALID_IN4 is set; when address is not INADDR_ANY
 * the netmask is applied with SIOCSIFNETMASK as well.  The error checks
 * between these steps are not visible in this excerpt. */
1964 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
1965 struct in_addr netmask)
1967 struct netdev_dev_linux *netdev_dev =
1968 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1971 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
1973 netdev_dev->cache_valid |= VALID_IN4;
1974 netdev_dev->address = address;
1975 netdev_dev->netmask = netmask;
1976 if (address.s_addr != INADDR_ANY) {
1977 error = do_set_addr(netdev_, SIOCSIFNETMASK,
1978 "SIOCSIFNETMASK", netmask);
/* Parses one text line in the if_inet6 format: 16 two-digit hex bytes of the
 * address (stored in in6->s6_addr), four further hex fields that are
 * skipped, and an interface name of up to 16 characters stored in ifname.
 * The scanning call and the return expression are not visible in this
 * excerpt; netdev_linux_get_in6() tests the result as a boolean. */
1985 parse_if_inet6_line(const char *line,
1986 struct in6_addr *in6, char ifname[16 + 1])
1988 uint8_t *s6 = in6->s6_addr;
1989 #define X8 "%2"SCNx8
1991 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
1992 "%*x %*x %*x %*x %16s\n",
1993 &s6[0], &s6[1], &s6[2], &s6[3],
1994 &s6[4], &s6[5], &s6[6], &s6[7],
1995 &s6[8], &s6[9], &s6[10], &s6[11],
1996 &s6[12], &s6[13], &s6[14], &s6[15],
/* Implementation note: on a cache miss (VALID_IN6 clear) the cached address
 * is reset to in6addr_any and /proc/net/if_inet6 is scanned line by line for
 * an entry whose interface name equals the name of netdev_; a match is
 * stored in the cache.  VALID_IN6 is then set and the cached value is copied
 * to *in6.  Closing of the file and the return statements are not visible in
 * this excerpt. */
2000 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2001 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2003 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2005 struct netdev_dev_linux *netdev_dev =
2006 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2007 if (!(netdev_dev->cache_valid & VALID_IN6)) {
2011 netdev_dev->in6 = in6addr_any;
2013 file = fopen("/proc/net/if_inet6", "r");
2015 const char *name = netdev_get_name(netdev_);
2016 while (fgets(line, sizeof line, file)) {
2017 struct in6_addr in6_tmp;
2018 char ifname[16 + 1];
2019 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2020 && !strcmp(name, ifname))
2022 netdev_dev->in6 = in6_tmp;
2028 netdev_dev->cache_valid |= VALID_IN6;
2030 *in6 = netdev_dev->in6;
/* Fills *sa with an AF_INET socket address for addr.  A zeroed sockaddr_in
 * is populated first, then *sa is zeroed and the sockaddr_in is copied over
 * it with memcpy(). */
2035 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2037 struct sockaddr_in sin;
2038 memset(&sin, 0, sizeof sin);
2039 sin.sin_family = AF_INET;
2040 sin.sin_addr = addr;
2043 memset(sa, 0, sizeof *sa);
2044 memcpy(sa, &sin, sizeof sin);
/* Issues the address-setting ioctl ioctl_nr on netdev with addr as the
 * argument.  The ifreq is given the device name and an AF_INET address built
 * by make_in4_sockaddr(), and the result of netdev_linux_do_ioctl() is
 * returned.  ioctl_name is presumably passed on for logging (that argument
 * line is not visible in this excerpt). */
2048 do_set_addr(struct netdev *netdev,
2049 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2052 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2053 make_in4_sockaddr(&ifr.ifr_addr, addr);
2055 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
/* Implementation note: a route with destination and genmask INADDR_ANY,
 * gateway router and flags RTF_UP | RTF_GATEWAY is added with the SIOCADDRT
 * ioctl on af_inet_sock.  A failure is logged with the errno text.  The
 * netdev argument is unused. */
2059 /* Adds 'router' as a default IP gateway. */
2061 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2063 struct in_addr any = { INADDR_ANY };
2067 memset(&rt, 0, sizeof rt);
2068 make_in4_sockaddr(&rt.rt_dst, any);
2069 make_in4_sockaddr(&rt.rt_gateway, router);
2070 make_in4_sockaddr(&rt.rt_genmask, any);
2071 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2072 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2074 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
/* Looks up host in /proc/net/route.  *netdev_name starts out NULL.  Lines
 * that do not yield all 11 fields are reported with a rate-limited warning
 * and routes without RTF_UP are skipped.  For a route whose (dest & mask)
 * equals (host & mask), next_hop is set to 0 when the host is directly
 * reachable or to the gateway otherwise, and *netdev_name receives a copy of
 * the interface name made with xstrdup() (presumably owned by the caller;
 * confirm).  The return statements are not visible in this excerpt. */
2080 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2083 static const char fn[] = "/proc/net/route";
2088 *netdev_name = NULL;
2089 stream = fopen(fn, "r");
2090 if (stream == NULL) {
2091 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2096 while (fgets(line, sizeof line, stream)) {
2099 ovs_be32 dest, gateway, mask;
2100 int refcnt, metric, mtu;
2101 unsigned int flags, use, window, irtt;
2104 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2106 iface, &dest, &gateway, &flags, &refcnt,
2107 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2109 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2113 if (!(flags & RTF_UP)) {
2114 /* Skip routes that aren't up. */
2118 /* The output of 'dest', 'mask', and 'gateway' were given in
2119 * network byte order, so we don't need any endian
2120 * conversions here. */
2121 if ((dest & mask) == (host->s_addr & mask)) {
2123 /* The host is directly reachable. */
2124 next_hop->s_addr = 0;
2126 /* To reach the host, we must go through a gateway. */
2127 next_hop->s_addr = gateway;
2129 *netdev_name = xstrdup(iface);
/* Adds driver information about netdev to sh.  The ETHTOOL_GDRVINFO result
 * supplies the driver_name, driver_version and firmware_version entries,
 * each stored as a copy made with xstrdup().  The check of the ethtool
 * result is not visible in this excerpt. */
2141 netdev_linux_get_status(const struct netdev *netdev, struct shash *sh)
2143 struct ethtool_drvinfo drvinfo;
2146 memset(&drvinfo, 0, sizeof drvinfo);
2147 error = netdev_linux_do_ethtool(netdev_get_name(netdev),
2148 (struct ethtool_cmd *)&drvinfo,
2150 "ETHTOOL_GDRVINFO");
2152 shash_add(sh, "driver_name", xstrdup(drvinfo.driver));
2153 shash_add(sh, "driver_version", xstrdup(drvinfo.version));
2154 shash_add(sh, "firmware_version", xstrdup(drvinfo.fw_version));
/* Implementation note: the query is a SIOCGARP ioctl on af_inet_sock with
 * the protocol address set to ip, the hardware address family set to
 * ARPHRD_ETHER and the device name set to the name of netdev.  The MAC is
 * copied out of the reply, and failures other than ENXIO are logged with a
 * rate-limited warning. */
2160 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2161 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2162 * returns 0. Otherwise, it returns a positive errno value; in particular,
2163 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
2165 netdev_linux_arp_lookup(const struct netdev *netdev,
2166 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2169 struct sockaddr_in sin;
2172 memset(&r, 0, sizeof r);
2173 memset(&sin, 0, sizeof sin);
2174 sin.sin_family = AF_INET;
2175 sin.sin_addr.s_addr = ip;
2177 memcpy(&r.arp_pa, &sin, sizeof sin);
2178 r.arp_ha.sa_family = ARPHRD_ETHER;
2180 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2181 COVERAGE_INC(netdev_arp_lookup);
2182 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2184 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2185 } else if (retval != ENXIO) {
2186 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2187 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
/* Translates netdev_flags bits to another flag representation (presumably
 * Linux IFF_* interface flags, judging by the name; the assignments are not
 * visible in this excerpt).  Only the NETDEV_UP and NETDEV_PROMISC tests are
 * visible. */
2193 nd_to_iff_flags(enum netdev_flags nd)
2196 if (nd & NETDEV_UP) {
2199 if (nd & NETDEV_PROMISC) {
/* Translates Linux IFF_* interface flags to netdev_flags bits, starting from
 * 0.  IFF_PROMISC maps to NETDEV_PROMISC; other mappings are not visible in
 * this excerpt. */
2206 iff_to_nd_flags(int iff)
2208 enum netdev_flags nd = 0;
2212 if (iff & IFF_PROMISC) {
2213 nd |= NETDEV_PROMISC;
/* Clears the flags in off and sets the flags in on for netdev.  The previous
 * flags are read with get_flags() and reported through *old_flagsp in
 * netdev_flags form; set_flags() is called only when the resulting flag
 * value differs from the old one. */
2219 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2220 enum netdev_flags on, enum netdev_flags *old_flagsp)
2222 int old_flags, new_flags;
2225 error = get_flags(netdev, &old_flags);
2227 *old_flagsp = iff_to_nd_flags(old_flags);
2228 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2229 if (new_flags != old_flags) {
2230 error = set_flags(netdev, new_flags);
/* Returns the change_seq value stored in the netdev_dev_linux that backs
 * netdev. */
2237 netdev_linux_change_seq(const struct netdev *netdev)
2239 return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
/* Initializer template for the Linux-based netdev classes defined after this
 * macro (presumably expanding to a struct netdev_class initializer).  NAME,
 * CREATE, GET_STATS and SET_STATS are the per-class parts; the remaining
 * slots visible here are shared netdev_linux_* functions or NULL.  Several
 * lines of the macro are not visible in this excerpt. */
2242 #define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS) \
2246 netdev_linux_init, \
2248 netdev_linux_wait, \
2251 netdev_linux_destroy, \
2252 NULL, /* get_config */ \
2253 NULL, /* set_config */ \
2255 netdev_linux_open, \
2256 netdev_linux_close, \
2258 netdev_linux_listen, \
2259 netdev_linux_recv, \
2260 netdev_linux_recv_wait, \
2261 netdev_linux_drain, \
2263 netdev_linux_send, \
2264 netdev_linux_send_wait, \
2266 netdev_linux_set_etheraddr, \
2267 netdev_linux_get_etheraddr, \
2268 netdev_linux_get_mtu, \
2269 netdev_linux_set_mtu, \
2270 netdev_linux_get_ifindex, \
2271 netdev_linux_get_carrier, \
2272 netdev_linux_get_carrier_resets, \
2273 netdev_linux_set_miimon_interval, \
2277 netdev_linux_get_features, \
2278 netdev_linux_set_advertisements, \
2280 netdev_linux_set_policing, \
2281 netdev_linux_get_qos_types, \
2282 netdev_linux_get_qos_capabilities, \
2283 netdev_linux_get_qos, \
2284 netdev_linux_set_qos, \
2285 netdev_linux_get_queue, \
2286 netdev_linux_set_queue, \
2287 netdev_linux_delete_queue, \
2288 netdev_linux_get_queue_stats, \
2289 netdev_linux_dump_queues, \
2290 netdev_linux_dump_queue_stats, \
2292 netdev_linux_get_in4, \
2293 netdev_linux_set_in4, \
2294 netdev_linux_get_in6, \
2295 netdev_linux_add_router, \
2296 netdev_linux_get_next_hop, \
2297 netdev_linux_get_status, \
2298 netdev_linux_arp_lookup, \
2300 netdev_linux_update_flags, \
2302 netdev_linux_change_seq \
/* Netdev class whose visible arguments are netdev_linux_create as the create
 * function, netdev_linux_get_stats for statistics and no set_stats hook.
 * The class name argument is not visible in this excerpt. */
2305 const struct netdev_class netdev_linux_class =
2308 netdev_linux_create,
2309 netdev_linux_get_stats,
2310 NULL); /* set_stats */
/* Netdev class whose visible arguments are netdev_linux_create_tap as the
 * create function, netdev_pseudo_get_stats for statistics and no set_stats
 * hook.  The class name argument is not visible in this excerpt. */
2312 const struct netdev_class netdev_tap_class =
2315 netdev_linux_create_tap,
2316 netdev_pseudo_get_stats,
2317 NULL); /* set_stats */
/* Netdev class whose visible arguments are netdev_linux_create as the create
 * function, netdev_pseudo_get_stats for statistics and
 * netdev_vport_set_stats as the set_stats hook.  The class name argument is
 * not visible in this excerpt. */
2319 const struct netdev_class netdev_internal_class =
2322 netdev_linux_create,
2323 netdev_pseudo_get_stats,
2324 netdev_vport_set_stats);
2326 /* HTB traffic control class. */
/* Upper bound used by htb_parse_tcmsg__(): class minor numbers 1 through
 * HTB_N_QUEUES are accepted and mapped to queue ids starting at 0. */
2328 #define HTB_N_QUEUES 0xf000
/* NOTE(review): the struct declarations that contain the fields below are
 * not visible in this excerpt.  Judging by their use elsewhere in this file,
 * the first max_rate presumably belongs to struct htb and the remaining
 * fields to struct htb_class; confirm against the full file. */
2332 unsigned int max_rate; /* In bytes/s. */
2336 struct tc_queue tc_queue;
2337 unsigned int min_rate; /* In bytes/s. */
2338 unsigned int max_rate; /* In bytes/s. */
2339 unsigned int burst; /* In bytes. */
2340 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the struct htb that embeds the tc state of netdev, obtained with
 * CONTAINER_OF on the tc member.  Presumably only meaningful while the
 * active tc state of the device is an HTB one; confirm against callers. */
2344 htb_get__(const struct netdev *netdev)
2346 struct netdev_dev_linux *netdev_dev =
2347 netdev_dev_linux_cast(netdev_get_dev(netdev));
2348 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
/* Allocates a struct htb with xmalloc(), initializes its tc member for
 * tc_ops_htb, records max_rate and makes it the tc state of netdev.
 * NOTE(review): max_rate arrives as uint64_t; confirm the width of the
 * max_rate field it is stored in, since a narrower field would truncate. */
2352 htb_install__(struct netdev *netdev, uint64_t max_rate)
2354 struct netdev_dev_linux *netdev_dev =
2355 netdev_dev_linux_cast(netdev_get_dev(netdev));
2358 htb = xmalloc(sizeof *htb);
2359 tc_init(&htb->tc, &tc_ops_htb);
2360 htb->max_rate = max_rate;
2362 netdev_dev->tc = &htb->tc;
/* Implementation note: any existing qdisc is deleted first with
 * tc_del_qdisc() (its result is not checked here), then an RTM_NEWQDISC
 * request with NLM_F_EXCL | NLM_F_CREATE is sent for handle 1:0 under
 * TC_H_ROOT with kind htb and a nested TCA_HTB_INIT option whose
 * rate2quantum is 10.  The result of tc_transact() is returned. */
2365 /* Create an HTB qdisc.
2367 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2369 htb_setup_qdisc__(struct netdev *netdev)
2372 struct tc_htb_glob opt;
2373 struct ofpbuf request;
2374 struct tcmsg *tcmsg;
2376 tc_del_qdisc(netdev);
2378 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2379 NLM_F_EXCL | NLM_F_CREATE, &request);
2383 tcmsg->tcm_handle = tc_make_handle(1, 0);
2384 tcmsg->tcm_parent = TC_H_ROOT;
2386 nl_msg_put_string(&request, TCA_KIND, "htb");
2388 memset(&opt, 0, sizeof opt);
2389 opt.rate2quantum = 10;
2393 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2394 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2395 nl_msg_end_nested(&request, opt_offset);
2397 return tc_transact(&request, NULL);
/* Implementation note: the MTU of netdev is needed to fill the rate and ceil
 * structures and the buffer sizes; a device that lacks one is reported with
 * a rate-limited warning.  The request is RTM_NEWTCLASS with NLM_F_CREATE
 * for the given handle and parent, carrying TCA_HTB_PARMS plus the rate and
 * ceil tables.  A failed transaction is logged together with the class
 * parameters. */
2400 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2401 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2403 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2404 unsigned int parent, struct htb_class *class)
2407 struct tc_htb_opt opt;
2408 struct ofpbuf request;
2409 struct tcmsg *tcmsg;
2413 error = netdev_get_mtu(netdev, &mtu);
2415 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2416 netdev_get_name(netdev));
2420 memset(&opt, 0, sizeof opt);
2421 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2422 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2423 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2424 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2425 opt.prio = class->priority;
2427 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2431 tcmsg->tcm_handle = handle;
2432 tcmsg->tcm_parent = parent;
2434 nl_msg_put_string(&request, TCA_KIND, "htb");
2435 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2436 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2437 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2438 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2439 nl_msg_end_nested(&request, opt_offset);
2441 error = tc_transact(&request, NULL);
2443 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2444 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2445 netdev_get_name(netdev),
2446 tc_get_major(handle), tc_get_minor(handle),
2447 tc_get_major(parent), tc_get_minor(parent),
2448 class->min_rate, class->max_rate,
2449 class->burst, class->priority, strerror(error));
/* NOTE(review): the end of the comment below is missing from this excerpt,
 * and it speaks of details while the function fills class.
 * Visible behavior: nl_options is parsed as nested attributes that must
 * contain TCA_HTB_PARMS of at least sizeof(struct tc_htb_opt); a parse
 * failure is logged with a rate-limited warning.  min_rate, max_rate and
 * priority are copied from rate.rate, ceil.rate and prio, and burst is
 * computed with tc_ticks_to_bytes() from rate.rate and buffer. */
2454 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2455 * description of them into 'details'. The description complies with the
2456 * specification given in the vswitch database documentation for linux-htb
2459 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2461 static const struct nl_policy tca_htb_policy[] = {
2462 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2463 .min_len = sizeof(struct tc_htb_opt) },
2466 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2467 const struct tc_htb_opt *htb;
2469 if (!nl_parse_nested(nl_options, tca_htb_policy,
2470 attrs, ARRAY_SIZE(tca_htb_policy))) {
2471 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2475 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2476 class->min_rate = htb->rate.rate;
2477 class->max_rate = htb->ceil.rate;
2478 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2479 class->priority = htb->prio;
/* Parses the Netlink traffic-class message tcmsg with tc_parse_class(),
 * which also receives stats.  When queue_id is nonnull, a class handle with
 * major 1 and a minor in the range 1 through HTB_N_QUEUES yields
 * *queue_id = minor - 1; the handling of other handles is not visible in
 * this excerpt.  When options is nonnull, the HTB options are parsed into it
 * with htb_parse_tca_options__(). */
2484 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2485 struct htb_class *options,
2486 struct netdev_queue_stats *stats)
2488 struct nlattr *nl_options;
2489 unsigned int handle;
2492 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2493 if (!error && queue_id) {
2494 unsigned int major = tc_get_major(handle);
2495 unsigned int minor = tc_get_minor(handle);
2496 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2497 *queue_id = minor - 1;
2502 if (!error && options) {
2503 error = htb_parse_tca_options__(nl_options, options);
/* Fills hc with the parameters of the HTB root class described by details.
 *
 * The max-rate entry of details is read as a decimal number and divided by 8
 * (presumably converting bits/s to the bytes/s used by struct htb_class;
 * confirm).  When the entry is absent or yields 0, the rate is derived from
 * the current features of netdev through netdev_features_to_bps().  min_rate
 * is set equal to max_rate.  Several lines of this function are not visible
 * in this excerpt. */
2509 htb_parse_qdisc_details__(struct netdev *netdev,
2510 const struct shash *details, struct htb_class *hc)
2512 const char *max_rate_s;
2514 max_rate_s = shash_find_data(details, "max-rate");
2515 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2516 if (!hc->max_rate) {
/* The address of the local current is passed so that netdev_get_features()
 * can store the current feature bitmap, which the next line converts. */
2519 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2520 hc->max_rate = netdev_features_to_bps(current) / 8;
2522 hc->min_rate = hc->max_rate;
/* Fills hc with HTB class parameters taken from details.
 *
 * min-rate, max-rate and burst are read as decimal numbers and divided by 8;
 * priority is read unchanged.  min_rate is raised to at least the MTU and
 * capped at the max_rate of the qdisc.  max_rate is raised to at least
 * min_rate and capped at the max_rate of the qdisc.  burst is raised to at
 * least mtu + 64.  A device without an MTU is reported with a rate-limited
 * warning.
 * NOTE(review): the opening of the burst comment further down is missing
 * from this excerpt, and the values parsed with strtoull() are stored in
 * unsigned int fields; confirm that truncation is acceptable. */
2528 htb_parse_class_details__(struct netdev *netdev,
2529 const struct shash *details, struct htb_class *hc)
2531 const struct htb *htb = htb_get__(netdev);
2532 const char *min_rate_s = shash_find_data(details, "min-rate");
2533 const char *max_rate_s = shash_find_data(details, "max-rate");
2534 const char *burst_s = shash_find_data(details, "burst");
2535 const char *priority_s = shash_find_data(details, "priority");
2538 error = netdev_get_mtu(netdev, &mtu);
2540 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2541 netdev_get_name(netdev));
2545 /* HTB requires at least an mtu sized min-rate to send any traffic even
2546 * on uncongested links. */
2547 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2548 hc->min_rate = MAX(hc->min_rate, mtu);
2549 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2552 hc->max_rate = (max_rate_s
2553 ? strtoull(max_rate_s, NULL, 10) / 8
2555 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2556 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2560 * According to hints in the documentation that I've read, it is important
2561 * that 'burst' be at least as big as the largest frame that might be
2562 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2563 * but having it a bit too small is a problem. Since netdev_get_mtu()
2564 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2565 * the MTU. We actually add 64, instead of 14, as a guard against
2566 * additional headers get tacked on somewhere that we're not aware of. */
2567 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2568 hc->burst = MAX(hc->burst, mtu + 64);
2571 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
2577 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2578 unsigned int parent, struct htb_class *options,
2579 struct netdev_queue_stats *stats)
2581 struct ofpbuf *reply;
2584 error = tc_query_class(netdev, handle, parent, &reply);
2586 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2587 ofpbuf_delete(reply);
2593 htb_tc_install(struct netdev *netdev, const struct shash *details)
2597 error = htb_setup_qdisc__(netdev);
2599 struct htb_class hc;
2601 htb_parse_qdisc_details__(netdev, details, &hc);
2602 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2603 tc_make_handle(1, 0), &hc);
2605 htb_install__(netdev, hc.max_rate);
2611 static struct htb_class *
2612 htb_class_cast__(const struct tc_queue *queue)
2614 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2618 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2619 const struct htb_class *hc)
2621 struct htb *htb = htb_get__(netdev);
2622 size_t hash = hash_int(queue_id, 0);
2623 struct tc_queue *queue;
2624 struct htb_class *hcp;
2626 queue = tc_find_queue__(netdev, queue_id, hash);
2628 hcp = htb_class_cast__(queue);
2630 hcp = xmalloc(sizeof *hcp);
2631 queue = &hcp->tc_queue;
2632 queue->queue_id = queue_id;
2633 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2636 hcp->min_rate = hc->min_rate;
2637 hcp->max_rate = hc->max_rate;
2638 hcp->burst = hc->burst;
2639 hcp->priority = hc->priority;
2643 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2646 struct nl_dump dump;
2647 struct htb_class hc;
2649 /* Get qdisc options. */
2651 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2652 htb_install__(netdev, hc.max_rate);
2655 if (!start_queue_dump(netdev, &dump)) {
2658 while (nl_dump_next(&dump, &msg)) {
2659 unsigned int queue_id;
2661 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2662 htb_update_queue__(netdev, queue_id, &hc);
2665 nl_dump_done(&dump);
2671 htb_tc_destroy(struct tc *tc)
2673 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2674 struct htb_class *hc, *next;
2676 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2677 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2685 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2687 const struct htb *htb = htb_get__(netdev);
2688 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
2693 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2695 struct htb_class hc;
2698 htb_parse_qdisc_details__(netdev, details, &hc);
2699 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2700 tc_make_handle(1, 0), &hc);
2702 htb_get__(netdev)->max_rate = hc.max_rate;
2708 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2709 const struct tc_queue *queue, struct shash *details)
2711 const struct htb_class *hc = htb_class_cast__(queue);
2713 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2714 if (hc->min_rate != hc->max_rate) {
2715 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2717 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2719 shash_add(details, "priority", xasprintf("%u", hc->priority));
2725 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2726 const struct shash *details)
2728 struct htb_class hc;
2731 error = htb_parse_class_details__(netdev, details, &hc);
2736 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2737 tc_make_handle(1, 0xfffe), &hc);
2742 htb_update_queue__(netdev, queue_id, &hc);
2747 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2749 struct htb_class *hc = htb_class_cast__(queue);
2750 struct htb *htb = htb_get__(netdev);
2753 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2755 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2762 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2763 struct netdev_queue_stats *stats)
2765 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2766 tc_make_handle(1, 0xfffe), NULL, stats);
2770 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2771 const struct ofpbuf *nlmsg,
2772 netdev_dump_queue_stats_cb *cb, void *aux)
2774 struct netdev_queue_stats stats;
2775 unsigned int handle, major, minor;
2778 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2783 major = tc_get_major(handle);
2784 minor = tc_get_minor(handle);
2785 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2786 (*cb)(minor - 1, &stats, aux);
2791 static const struct tc_ops tc_ops_htb = {
2792 "htb", /* linux_name */
2793 "linux-htb", /* ovs_name */
2794 HTB_N_QUEUES, /* n_queues */
2803 htb_class_get_stats,
2804 htb_class_dump_stats
2807 /* "linux-hfsc" traffic control class. */
2809 #define HFSC_N_QUEUES 0xf000
2817 struct tc_queue tc_queue;
2822 static struct hfsc *
2823 hfsc_get__(const struct netdev *netdev)
2825 struct netdev_dev_linux *netdev_dev;
2826 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2827 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
2830 static struct hfsc_class *
2831 hfsc_class_cast__(const struct tc_queue *queue)
2833 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
2837 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2839 struct netdev_dev_linux * netdev_dev;
2842 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2843 hfsc = xmalloc(sizeof *hfsc);
2844 tc_init(&hfsc->tc, &tc_ops_hfsc);
2845 hfsc->max_rate = max_rate;
2846 netdev_dev->tc = &hfsc->tc;
2850 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2851 const struct hfsc_class *hc)
2855 struct hfsc_class *hcp;
2856 struct tc_queue *queue;
2858 hfsc = hfsc_get__(netdev);
2859 hash = hash_int(queue_id, 0);
2861 queue = tc_find_queue__(netdev, queue_id, hash);
2863 hcp = hfsc_class_cast__(queue);
2865 hcp = xmalloc(sizeof *hcp);
2866 queue = &hcp->tc_queue;
2867 queue->queue_id = queue_id;
2868 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
2871 hcp->min_rate = hc->min_rate;
2872 hcp->max_rate = hc->max_rate;
2876 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2878 const struct tc_service_curve *rsc, *fsc, *usc;
2879 static const struct nl_policy tca_hfsc_policy[] = {
2881 .type = NL_A_UNSPEC,
2883 .min_len = sizeof(struct tc_service_curve),
2886 .type = NL_A_UNSPEC,
2888 .min_len = sizeof(struct tc_service_curve),
2891 .type = NL_A_UNSPEC,
2893 .min_len = sizeof(struct tc_service_curve),
2896 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
2898 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
2899 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
2900 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
2904 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
2905 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
2906 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
2908 if (rsc->m1 != 0 || rsc->d != 0 ||
2909 fsc->m1 != 0 || fsc->d != 0 ||
2910 usc->m1 != 0 || usc->d != 0) {
2911 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2912 "Non-linear service curves are not supported.");
2916 if (rsc->m2 != fsc->m2) {
2917 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2918 "Real-time service curves are not supported ");
2922 if (rsc->m2 > usc->m2) {
2923 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2924 "Min-rate service curve is greater than "
2925 "the max-rate service curve.");
2929 class->min_rate = fsc->m2;
2930 class->max_rate = usc->m2;
2935 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2936 struct hfsc_class *options,
2937 struct netdev_queue_stats *stats)
2940 unsigned int handle;
2941 struct nlattr *nl_options;
2943 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2949 unsigned int major, minor;
2951 major = tc_get_major(handle);
2952 minor = tc_get_minor(handle);
2953 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
2954 *queue_id = minor - 1;
2961 error = hfsc_parse_tca_options__(nl_options, options);
2968 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
2969 unsigned int parent, struct hfsc_class *options,
2970 struct netdev_queue_stats *stats)
2973 struct ofpbuf *reply;
2975 error = tc_query_class(netdev, handle, parent, &reply);
2980 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
2981 ofpbuf_delete(reply);
2986 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
2987 struct hfsc_class *class)
2990 const char *max_rate_s;
2992 max_rate_s = shash_find_data(details, "max-rate");
2993 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2998 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2999 max_rate = netdev_features_to_bps(current) / 8;
3002 class->min_rate = max_rate;
3003 class->max_rate = max_rate;
3007 hfsc_parse_class_details__(struct netdev *netdev,
3008 const struct shash *details,
3009 struct hfsc_class * class)
3011 const struct hfsc *hfsc;
3012 uint32_t min_rate, max_rate;
3013 const char *min_rate_s, *max_rate_s;
3015 hfsc = hfsc_get__(netdev);
3016 min_rate_s = shash_find_data(details, "min-rate");
3017 max_rate_s = shash_find_data(details, "max-rate");
3019 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3020 min_rate = MAX(min_rate, 1);
3021 min_rate = MIN(min_rate, hfsc->max_rate);
3023 max_rate = (max_rate_s
3024 ? strtoull(max_rate_s, NULL, 10) / 8
3026 max_rate = MAX(max_rate, min_rate);
3027 max_rate = MIN(max_rate, hfsc->max_rate);
3029 class->min_rate = min_rate;
3030 class->max_rate = max_rate;
3035 /* Create an HFSC qdisc.
3037 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3039 hfsc_setup_qdisc__(struct netdev * netdev)
3041 struct tcmsg *tcmsg;
3042 struct ofpbuf request;
3043 struct tc_hfsc_qopt opt;
3045 tc_del_qdisc(netdev);
3047 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3048 NLM_F_EXCL | NLM_F_CREATE, &request);
3054 tcmsg->tcm_handle = tc_make_handle(1, 0);
3055 tcmsg->tcm_parent = TC_H_ROOT;
3057 memset(&opt, 0, sizeof opt);
3060 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3061 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3063 return tc_transact(&request, NULL);
3066 /* Create an HFSC class.
3068 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3069 * sc rate <min_rate> ul rate <max_rate>" */
3071 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3072 unsigned int parent, struct hfsc_class *class)
3076 struct tcmsg *tcmsg;
3077 struct ofpbuf request;
3078 struct tc_service_curve min, max;
3080 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3086 tcmsg->tcm_handle = handle;
3087 tcmsg->tcm_parent = parent;
3091 min.m2 = class->min_rate;
3095 max.m2 = class->max_rate;
3097 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3098 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3099 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3100 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3101 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3102 nl_msg_end_nested(&request, opt_offset);
3104 error = tc_transact(&request, NULL);
3106 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3107 "min-rate %ubps, max-rate %ubps (%s)",
3108 netdev_get_name(netdev),
3109 tc_get_major(handle), tc_get_minor(handle),
3110 tc_get_major(parent), tc_get_minor(parent),
3111 class->min_rate, class->max_rate, strerror(error));
3118 hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3121 struct hfsc_class class;
3123 error = hfsc_setup_qdisc__(netdev);
3129 hfsc_parse_qdisc_details__(netdev, details, &class);
3130 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3131 tc_make_handle(1, 0), &class);
3137 hfsc_install__(netdev, class.max_rate);
3142 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3145 struct nl_dump dump;
3146 struct hfsc_class hc;
3149 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3150 hfsc_install__(netdev, hc.max_rate);
3152 if (!start_queue_dump(netdev, &dump)) {
3156 while (nl_dump_next(&dump, &msg)) {
3157 unsigned int queue_id;
3159 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3160 hfsc_update_queue__(netdev, queue_id, &hc);
3164 nl_dump_done(&dump);
3169 hfsc_tc_destroy(struct tc *tc)
3172 struct hfsc_class *hc, *next;
3174 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3176 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3177 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3186 hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3188 const struct hfsc *hfsc;
3189 hfsc = hfsc_get__(netdev);
3190 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
3195 hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3198 struct hfsc_class class;
3200 hfsc_parse_qdisc_details__(netdev, details, &class);
3201 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3202 tc_make_handle(1, 0), &class);
3205 hfsc_get__(netdev)->max_rate = class.max_rate;
3212 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3213 const struct tc_queue *queue, struct shash *details)
3215 const struct hfsc_class *hc;
3217 hc = hfsc_class_cast__(queue);
3218 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3219 if (hc->min_rate != hc->max_rate) {
3220 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
3226 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3227 const struct shash *details)
3230 struct hfsc_class class;
3232 error = hfsc_parse_class_details__(netdev, details, &class);
3237 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3238 tc_make_handle(1, 0xfffe), &class);
3243 hfsc_update_queue__(netdev, queue_id, &class);
3248 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3252 struct hfsc_class *hc;
3254 hc = hfsc_class_cast__(queue);
3255 hfsc = hfsc_get__(netdev);
3257 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3259 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3266 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3267 struct netdev_queue_stats *stats)
3269 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3270 tc_make_handle(1, 0xfffe), NULL, stats);
3274 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3275 const struct ofpbuf *nlmsg,
3276 netdev_dump_queue_stats_cb *cb, void *aux)
3278 struct netdev_queue_stats stats;
3279 unsigned int handle, major, minor;
3282 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3287 major = tc_get_major(handle);
3288 minor = tc_get_minor(handle);
3289 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3290 (*cb)(minor - 1, &stats, aux);
3295 static const struct tc_ops tc_ops_hfsc = {
3296 "hfsc", /* linux_name */
3297 "linux-hfsc", /* ovs_name */
3298 HFSC_N_QUEUES, /* n_queues */
3299 hfsc_tc_install, /* tc_install */
3300 hfsc_tc_load, /* tc_load */
3301 hfsc_tc_destroy, /* tc_destroy */
3302 hfsc_qdisc_get, /* qdisc_get */
3303 hfsc_qdisc_set, /* qdisc_set */
3304 hfsc_class_get, /* class_get */
3305 hfsc_class_set, /* class_set */
3306 hfsc_class_delete, /* class_delete */
3307 hfsc_class_get_stats, /* class_get_stats */
3308 hfsc_class_dump_stats /* class_dump_stats */
3311 /* "linux-default" traffic control class.
3313 * This class represents the default, unnamed Linux qdisc. It corresponds to
3314 * the "" (empty string) QoS type in the OVS database. */
3317 default_install__(struct netdev *netdev)
3319 struct netdev_dev_linux *netdev_dev =
3320 netdev_dev_linux_cast(netdev_get_dev(netdev));
3321 static struct tc *tc;
3324 tc = xmalloc(sizeof *tc);
3325 tc_init(tc, &tc_ops_default);
3327 netdev_dev->tc = tc;
3331 default_tc_install(struct netdev *netdev,
3332 const struct shash *details OVS_UNUSED)
3334 default_install__(netdev);
3339 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3341 default_install__(netdev);
3345 static const struct tc_ops tc_ops_default = {
3346 NULL, /* linux_name */
3351 NULL, /* tc_destroy */
3352 NULL, /* qdisc_get */
3353 NULL, /* qdisc_set */
3354 NULL, /* class_get */
3355 NULL, /* class_set */
3356 NULL, /* class_delete */
3357 NULL, /* class_get_stats */
3358 NULL /* class_dump_stats */
/* "linux-other" traffic control class. */
3366 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3368 struct netdev_dev_linux *netdev_dev =
3369 netdev_dev_linux_cast(netdev_get_dev(netdev));
3370 static struct tc *tc;
3373 tc = xmalloc(sizeof *tc);
3374 tc_init(tc, &tc_ops_other);
3376 netdev_dev->tc = tc;
3380 static const struct tc_ops tc_ops_other = {
3381 NULL, /* linux_name */
3382 "linux-other", /* ovs_name */
3384 NULL, /* tc_install */
3386 NULL, /* tc_destroy */
3387 NULL, /* qdisc_get */
3388 NULL, /* qdisc_set */
3389 NULL, /* class_get */
3390 NULL, /* class_set */
3391 NULL, /* class_delete */
3392 NULL, /* class_get_stats */
3393 NULL /* class_dump_stats */
/* Traffic control. */

/* Number of kernel "tc" ticks per second. */
static double ticks_per_s;

/* Number of kernel "jiffies" per second.  This is used for the purpose of
 * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
 * one jiffy's worth of data.
 *
 * There are two possibilities here:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
 *      approximate range of 100 to 1024.  That means that we really need to
 *      make sure that the qdisc can buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
 *      has finely granular timers and there's no need to fudge additional room
 *      for buffers.  (There's no extra effort needed to implement that: the
 *      large 'buffer_hz' is used as a divisor, so practically any number will
 *      come out as 0 in the division.  Small integer results in the case of
 *      really high dividends won't have any real effect anyhow.)
 */
static unsigned int buffer_hz;
/* Returns tc handle 'major':'minor'.  'major' occupies the upper 16 bits and
 * 'minor' the lower 16 bits of the result. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    return TC_H_MAKE(major << 16, minor);
}
/* Returns the major number from 'handle', that is, its upper 16 bits. */
static unsigned int
tc_get_major(unsigned int handle)
{
    return TC_H_MAJ(handle) >> 16;
}
/* Returns the minor number from 'handle', that is, its lower 16 bits. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    return TC_H_MIN(handle);
}
3441 static struct tcmsg *
3442 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3443 struct ofpbuf *request)
3445 struct tcmsg *tcmsg;
3449 error = get_ifindex(netdev, &ifindex);
3454 ofpbuf_init(request, 512);
3455 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3456 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3457 tcmsg->tcm_family = AF_UNSPEC;
3458 tcmsg->tcm_ifindex = ifindex;
3459 /* Caller should fill in tcmsg->tcm_handle. */
3460 /* Caller should fill in tcmsg->tcm_parent. */
3466 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3468 int error = nl_sock_transact(rtnl_sock, request, replyp);
3469 ofpbuf_uninit(request);
3476 /* The values in psched are not individually very meaningful, but they are
3477 * important. The tables below show some values seen in the wild.
3481 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3482 * (Before that, there are hints that it was 1000000000.)
3484 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3488 * -----------------------------------
3489 * [1] 000c8000 000f4240 000f4240 00000064
3490 * [2] 000003e8 00000400 000f4240 3b9aca00
3491 * [3] 000003e8 00000400 000f4240 3b9aca00
3492 * [4] 000003e8 00000400 000f4240 00000064
3493 * [5] 000003e8 00000040 000f4240 3b9aca00
3494 * [6] 000003e8 00000040 000f4240 000000f9
3496 * a b c d ticks_per_s buffer_hz
3497 * ------- --------- ---------- ------------- ----------- -------------
3498 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3499 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3500 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3501 * [4] 1,000 1,024 1,000,000 100 976,562 100
3502 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3503 * [6] 1,000 64 1,000,000 249 15,625,000 249
3505 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3506 * [2] 2.6.26-1-686-bigmem from Debian lenny
3507 * [3] 2.6.26-2-sparc64 from Debian lenny
3508 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3509 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3510 * [6] 2.6.34 from kernel.org on KVM
3512 static const char fn[] = "/proc/net/psched";
3513 unsigned int a, b, c, d;
3519 stream = fopen(fn, "r");
3521 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3525 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3526 VLOG_WARN("%s: read failed", fn);
3530 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3534 VLOG_WARN("%s: invalid scheduler parameters", fn);
3538 ticks_per_s = (double) a * c / b;
3542 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3545 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3548 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3549 * rate of 'rate' bytes per second. */
3551 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3556 return (rate * ticks) / ticks_per_s;
3559 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3560 * rate of 'rate' bytes per second. */
3562 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3567 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3570 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3571 * a transmission rate of 'rate' bytes per second. */
3573 tc_buffer_per_jiffy(unsigned int rate)
3578 return rate / buffer_hz;
3581 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3582 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3583 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3584 * stores NULL into it if it is absent.
3586 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3589 * Returns 0 if successful, otherwise a positive errno value. */
3591 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3592 struct nlattr **options)
3594 static const struct nl_policy tca_policy[] = {
3595 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3596 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3598 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3600 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3601 tca_policy, ta, ARRAY_SIZE(ta))) {
3602 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3607 *kind = nl_attr_get_string(ta[TCA_KIND]);
3611 *options = ta[TCA_OPTIONS];
3626 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3627 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3628 * into '*options', and its queue statistics into '*stats'. Any of the output
3629 * arguments may be null.
3631 * Returns 0 if successful, otherwise a positive errno value. */
3633 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3634 struct nlattr **options, struct netdev_queue_stats *stats)
3636 static const struct nl_policy tca_policy[] = {
3637 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3638 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3640 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3642 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3643 tca_policy, ta, ARRAY_SIZE(ta))) {
3644 VLOG_WARN_RL(&rl, "failed to parse class message");
3649 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3650 *handlep = tc->tcm_handle;
3654 *options = ta[TCA_OPTIONS];
3658 const struct gnet_stats_queue *gsq;
3659 struct gnet_stats_basic gsb;
3661 static const struct nl_policy stats_policy[] = {
3662 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3663 .min_len = sizeof gsb },
3664 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3665 .min_len = sizeof *gsq },
3667 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3669 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3670 sa, ARRAY_SIZE(sa))) {
3671 VLOG_WARN_RL(&rl, "failed to parse class stats");
3675 /* Alignment issues screw up the length of struct gnet_stats_basic on
3676 * some arch/bitsize combinations. Newer versions of Linux have a
3677 * struct gnet_stats_basic_packed, but we can't depend on that. The
3678 * easiest thing to do is just to make a copy. */
3679 memset(&gsb, 0, sizeof gsb);
3680 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3681 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3682 stats->tx_bytes = gsb.bytes;
3683 stats->tx_packets = gsb.packets;
3685 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3686 stats->tx_errors = gsq->drops;
3696 memset(stats, 0, sizeof *stats);
3701 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3704 tc_query_class(const struct netdev *netdev,
3705 unsigned int handle, unsigned int parent,
3706 struct ofpbuf **replyp)
3708 struct ofpbuf request;
3709 struct tcmsg *tcmsg;
3712 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3716 tcmsg->tcm_handle = handle;
3717 tcmsg->tcm_parent = parent;
3719 error = tc_transact(&request, replyp);
3721 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3722 netdev_get_name(netdev),
3723 tc_get_major(handle), tc_get_minor(handle),
3724 tc_get_major(parent), tc_get_minor(parent),
3730 /* Equivalent to "tc class del dev <name> handle <handle>". */
3732 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3734 struct ofpbuf request;
3735 struct tcmsg *tcmsg;
3738 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3742 tcmsg->tcm_handle = handle;
3743 tcmsg->tcm_parent = 0;
3745 error = tc_transact(&request, NULL);
3747 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3748 netdev_get_name(netdev),
3749 tc_get_major(handle), tc_get_minor(handle),
3755 /* Equivalent to "tc qdisc del dev <name> root". */
3757 tc_del_qdisc(struct netdev *netdev)
3759 struct netdev_dev_linux *netdev_dev =
3760 netdev_dev_linux_cast(netdev_get_dev(netdev));
3761 struct ofpbuf request;
3762 struct tcmsg *tcmsg;
3765 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
3769 tcmsg->tcm_handle = tc_make_handle(1, 0);
3770 tcmsg->tcm_parent = TC_H_ROOT;
3772 error = tc_transact(&request, NULL);
3773 if (error == EINVAL) {
3774 /* EINVAL probably means that the default qdisc was in use, in which
3775 * case we've accomplished our purpose. */
3778 if (!error && netdev_dev->tc) {
3779 if (netdev_dev->tc->ops->tc_destroy) {
3780 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3782 netdev_dev->tc = NULL;
3787 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3788 * kernel to determine what they are. Returns 0 if successful, otherwise a
3789 * positive errno value. */
3791 tc_query_qdisc(const struct netdev *netdev)
3793 struct netdev_dev_linux *netdev_dev =
3794 netdev_dev_linux_cast(netdev_get_dev(netdev));
3795 struct ofpbuf request, *qdisc;
3796 const struct tc_ops *ops;
3797 struct tcmsg *tcmsg;
3801 if (netdev_dev->tc) {
3805 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3806 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3807 * 2.6.35 without that fix backported to it.
3809 * To avoid the OOPS, we must not make a request that would attempt to dump
3810 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3811 * few others. There are a few ways that I can see to do this, but most of
3812 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3813 * technique chosen here is to assume that any non-default qdisc that we
3814 * create will have a class with handle 1:0. The built-in qdiscs only have
3815 * a class with handle 0:0.
3817 * We could check for Linux 2.6.35+ and use a more straightforward method
3819 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
3823 tcmsg->tcm_handle = tc_make_handle(1, 0);
3824 tcmsg->tcm_parent = 0;
3826 /* Figure out what tc class to instantiate. */
3827 error = tc_transact(&request, &qdisc);
3831 error = tc_parse_qdisc(qdisc, &kind, NULL);
3833 ops = &tc_ops_other;
3835 ops = tc_lookup_linux_name(kind);
3837 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3838 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3840 ops = &tc_ops_other;
3843 } else if (error == ENOENT) {
3844 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3845 * other entity that doesn't have a handle 1:0. We will assume
3846 * that it's the system default qdisc. */
3847 ops = &tc_ops_default;
3850 /* Who knows? Maybe the device got deleted. */
3851 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3852 netdev_get_name(netdev), strerror(error));
3853 ops = &tc_ops_other;
3856 /* Instantiate it. */
3857 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
3858 assert((load_error == 0) == (netdev_dev->tc != NULL));
3859 ofpbuf_delete(qdisc);
3861 return error ? error : load_error;
3864 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3865 approximate the time to transmit packets of various lengths. For an MTU of
3866 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3867 represents two possible packet lengths; for a MTU of 513 through 1024, four
3868 possible lengths; and so on.
3870 Returns, for the specified 'mtu', the number of bits that packet lengths
3871 need to be shifted right to fit within such a 256-entry table. */
3873 tc_calc_cell_log(unsigned int mtu)
3878 mtu = ETH_PAYLOAD_MAX;
3880 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
3882 for (cell_log = 0; mtu >= 256; cell_log++) {
3889 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
3892 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
3894 memset(rate, 0, sizeof *rate);
3895 rate->cell_log = tc_calc_cell_log(mtu);
3896 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3897 /* rate->cell_align = 0; */ /* distro headers. */
3898 rate->mpu = ETH_TOTAL_MIN;
3902 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
3903 * attribute of the specified "type".
3905 * See tc_calc_cell_log() above for a description of "rtab"s. */
3907 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
3912 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
3913 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
3914 unsigned packet_size = (i + 1) << rate->cell_log;
3915 if (packet_size < rate->mpu) {
3916 packet_size = rate->mpu;
3918 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* Computes the value to use for 'buffer' or 'cbuffer' in HTB options, given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and the burst size
 * 'burst_bytes' that the user asked for.  (Pass 0 for 'burst_bytes' if the
 * user did not ask for any particular value.)
 *
 * The burst actually used is never smaller than tc_buffer_per_jiffy(Bps) plus
 * 'mtu'. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int floor_bytes = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t effective_burst = burst_bytes;

    if (effective_burst < floor_bytes) {
        effective_burst = floor_bytes;
    }
    return tc_bytes_to_ticks(Bps, effective_burst);
}

/* Linux-only functions declared in netdev-linux.h. */
3935 /* Returns a fd for an AF_INET socket or a negative errno value. */
3937 netdev_linux_get_af_inet_sock(void)
3939 int error = netdev_linux_init();
3940 return error ? -error : af_inet_sock;
3943 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
3944 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
3946 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
3947 const char *flag_name, bool enable)
3949 const char *netdev_name = netdev_get_name(netdev);
3950 struct ethtool_value evalue;
3954 memset(&evalue, 0, sizeof evalue);
3955 error = netdev_linux_do_ethtool(netdev_name,
3956 (struct ethtool_cmd *)&evalue,
3957 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
3962 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
3963 error = netdev_linux_do_ethtool(netdev_name,
3964 (struct ethtool_cmd *)&evalue,
3965 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
3970 memset(&evalue, 0, sizeof evalue);
3971 error = netdev_linux_do_ethtool(netdev_name,
3972 (struct ethtool_cmd *)&evalue,
3973 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
3978 if (new_flags != evalue.data) {
3979 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
3980 "device %s failed", enable ? "enable" : "disable",
3981 flag_name, netdev_name);
/* Utility functions. */
3990 /* Copies 'src' into 'dst', performing format conversion in the process. */
3992 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
3993 const struct rtnl_link_stats *src)
3995 dst->rx_packets = src->rx_packets;
3996 dst->tx_packets = src->tx_packets;
3997 dst->rx_bytes = src->rx_bytes;
3998 dst->tx_bytes = src->tx_bytes;
3999 dst->rx_errors = src->rx_errors;
4000 dst->tx_errors = src->tx_errors;
4001 dst->rx_dropped = src->rx_dropped;
4002 dst->tx_dropped = src->tx_dropped;
4003 dst->multicast = src->multicast;
4004 dst->collisions = src->collisions;
4005 dst->rx_length_errors = src->rx_length_errors;
4006 dst->rx_over_errors = src->rx_over_errors;
4007 dst->rx_crc_errors = src->rx_crc_errors;
4008 dst->rx_frame_errors = src->rx_frame_errors;
4009 dst->rx_fifo_errors = src->rx_fifo_errors;
4010 dst->rx_missed_errors = src->rx_missed_errors;
4011 dst->tx_aborted_errors = src->tx_aborted_errors;
4012 dst->tx_carrier_errors = src->tx_carrier_errors;
4013 dst->tx_fifo_errors = src->tx_fifo_errors;
4014 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4015 dst->tx_window_errors = src->tx_window_errors;
4019 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4021 /* Policy for RTNLGRP_LINK messages.
4023 * There are *many* more fields in these messages, but currently we only
4024 * care about these fields. */
4025 static const struct nl_policy rtnlgrp_link_policy[] = {
4026 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4027 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4028 .min_len = sizeof(struct rtnl_link_stats) },
4031 struct ofpbuf request;
4032 struct ofpbuf *reply;
4033 struct ifinfomsg *ifi;
4034 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
4037 ofpbuf_init(&request, 0);
4038 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4039 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4040 ifi->ifi_family = PF_UNSPEC;
4041 ifi->ifi_index = ifindex;
4042 error = nl_sock_transact(rtnl_sock, &request, &reply);
4043 ofpbuf_uninit(&request);
4048 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4049 rtnlgrp_link_policy,
4050 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4051 ofpbuf_delete(reply);
4055 if (!attrs[IFLA_STATS]) {
4056 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4057 ofpbuf_delete(reply);
4061 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4063 ofpbuf_delete(reply);
4069 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4071 static const char fn[] = "/proc/net/dev";
4076 stream = fopen(fn, "r");
4078 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4083 while (fgets(line, sizeof line, stream)) {
4086 #define X64 "%"SCNu64
4089 X64 X64 X64 X64 X64 X64 X64 "%*u"
4090 X64 X64 X64 X64 X64 X64 X64 "%*u",
4096 &stats->rx_fifo_errors,
4097 &stats->rx_frame_errors,
4103 &stats->tx_fifo_errors,
4105 &stats->tx_carrier_errors) != 15) {
4106 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4107 } else if (!strcmp(devname, netdev_name)) {
4108 stats->rx_length_errors = UINT64_MAX;
4109 stats->rx_over_errors = UINT64_MAX;
4110 stats->rx_crc_errors = UINT64_MAX;
4111 stats->rx_missed_errors = UINT64_MAX;
4112 stats->tx_aborted_errors = UINT64_MAX;
4113 stats->tx_heartbeat_errors = UINT64_MAX;
4114 stats->tx_window_errors = UINT64_MAX;
4120 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
4126 get_carrier_via_sysfs(const char *name, bool *carrier)
4137 fn = xasprintf("/sys/class/net/%s/carrier", name);
4138 fd = open(fn, O_RDONLY);
4141 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
4145 retval = read(fd, line, sizeof line);
4148 if (error == EINVAL) {
4149 /* This is the normal return value when we try to check carrier if
4150 * the network device is not up. */
4152 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
4155 } else if (retval == 0) {
4157 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
4161 if (line[0] != '0' && line[0] != '1') {
4163 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)", fn, line[0]);
4166 *carrier = line[0] != '0';
4178 get_flags(const struct netdev *netdev, int *flags)
4183 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
4185 *flags = ifr.ifr_flags;
4190 set_flags(struct netdev *netdev, int flags)
4194 ifr.ifr_flags = flags;
4195 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
4200 do_get_ifindex(const char *netdev_name)
4204 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4205 COVERAGE_INC(netdev_get_ifindex);
4206 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4207 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4208 netdev_name, strerror(errno));
4211 return ifr.ifr_ifindex;
4215 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4217 struct netdev_dev_linux *netdev_dev =
4218 netdev_dev_linux_cast(netdev_get_dev(netdev_));
4220 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4221 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4225 netdev_dev->cache_valid |= VALID_IFINDEX;
4226 netdev_dev->ifindex = ifindex;
4228 *ifindexp = netdev_dev->ifindex;
4233 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4238 memset(&ifr, 0, sizeof ifr);
4239 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4240 COVERAGE_INC(netdev_get_hwaddr);
4241 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4242 /* ENODEV probably means that a vif disappeared asynchronously and
4243 * hasn't been removed from the database yet, so reduce the log level
4244 * to INFO for that case. */
4245 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4246 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4247 netdev_name, strerror(errno));
4250 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4251 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4252 VLOG_WARN("%s device has unknown hardware address family %d",
4253 netdev_name, hwaddr_family);
4255 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4260 set_etheraddr(const char *netdev_name, int hwaddr_family,
4261 const uint8_t mac[ETH_ADDR_LEN])
4265 memset(&ifr, 0, sizeof ifr);
4266 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4267 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4268 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4269 COVERAGE_INC(netdev_set_hwaddr);
4270 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4271 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4272 netdev_name, strerror(errno));
4279 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4280 int cmd, const char *cmd_name)
4284 memset(&ifr, 0, sizeof ifr);
4285 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4286 ifr.ifr_data = (caddr_t) ecmd;
4289 COVERAGE_INC(netdev_ethtool);
4290 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4293 if (errno != EOPNOTSUPP) {
4294 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4295 "failed: %s", cmd_name, name, strerror(errno));
4297 /* The device doesn't support this operation. That's pretty
4298 * common, so there's no point in logging anything. */
4305 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4306 const char *cmd_name)
4308 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4309 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4310 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4318 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4319 int cmd, const char *cmd_name)
4324 ifr.ifr_addr.sa_family = AF_INET;
4325 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4327 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4328 *ip = sin->sin_addr;
/* Returns an AF_PACKET raw socket, or a negative errno value if one could not
 * be created.
 *
 * The socket is created on the first call and put into nonblocking mode.
 * The outcome, whether a descriptor or an error, is kept in a static variable
 * and returned again by every later call. */
static int
af_packet_sock(void)
{
    static int sock = INT_MIN;      /* INT_MIN means "not attempted yet". */

    if (sock != INT_MIN) {
        return sock;
    }

    sock = socket(AF_PACKET, SOCK_RAW, 0);
    if (sock < 0) {
        sock = -errno;
        VLOG_ERR("failed to create packet socket: %s", strerror(errno));
    } else {
        set_nonblocking(sock);
    }
    return sock;
}
