2 * Copyright (c) 2009, 2010, 2011 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
24 #include <arpa/inet.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
30 #include <linux/types.h>
31 #include <linux/ethtool.h>
32 #include <linux/mii.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <linux/version.h>
37 #include <sys/types.h>
38 #include <sys/ioctl.h>
39 #include <sys/socket.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
52 #include "dpif-linux.h"
53 #include "dynamic-string.h"
54 #include "fatal-signal.h"
57 #include "netdev-provider.h"
58 #include "netdev-vport.h"
60 #include "netlink-notifier.h"
61 #include "netlink-socket.h"
63 #include "openflow/openflow.h"
65 #include "poll-loop.h"
66 #include "rtnetlink-link.h"
67 #include "socket-util.h"
73 VLOG_DEFINE_THIS_MODULE(netdev_linux);
75 COVERAGE_DEFINE(netdev_set_policing);
76 COVERAGE_DEFINE(netdev_arp_lookup);
77 COVERAGE_DEFINE(netdev_get_ifindex);
78 COVERAGE_DEFINE(netdev_get_hwaddr);
79 COVERAGE_DEFINE(netdev_set_hwaddr);
80 COVERAGE_DEFINE(netdev_ethtool);
82 /* These were introduced in Linux 2.6.14, so they might be missing if we have
84 #ifndef ADVERTISED_Pause
85 #define ADVERTISED_Pause (1 << 13)
87 #ifndef ADVERTISED_Asym_Pause
88 #define ADVERTISED_Asym_Pause (1 << 14)
91 /* These were introduced in Linux 2.6.24, so they might be missing if we
92 * have old headers. */
93 #ifndef ETHTOOL_GFLAGS
94 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
96 #ifndef ETHTOOL_SFLAGS
97 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
100 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
103 #define TC_RTAB_SIZE 1024
106 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
107 static int cache_notifier_refcount;
110 VALID_IFINDEX = 1 << 0,
111 VALID_ETHERADDR = 1 << 1,
115 VALID_POLICING = 1 << 5,
116 VALID_HAVE_VPORT_STATS = 1 << 6
124 /* Traffic control. */
126 /* An instance of a traffic control class. Always associated with a particular
129 * Each TC implementation subclasses this with whatever additional data it
132 const struct tc_ops *ops;
133 struct hmap queues; /* Contains "struct tc_queue"s.
134 * Read by generic TC layer.
135 * Written only by TC implementation. */
138 /* One traffic control queue.
140 * Each TC implementation subclasses this with whatever additional data it
143 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
144 unsigned int queue_id; /* OpenFlow queue ID. */
147 /* A particular kind of traffic control. Each implementation generally maps to
148 * one particular Linux qdisc class.
150 * The functions below return 0 if successful or a positive errno value on
151 * failure, except where otherwise noted. All of them must be provided, except
152 * where otherwise noted. */
154 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
155 * This is null for tc_ops_default and tc_ops_other, for which there are no
156 * appropriate values. */
157 const char *linux_name;
159 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
160 const char *ovs_name;
162 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
163 * queues. The queues are numbered 0 through n_queues - 1. */
164 unsigned int n_queues;
166 /* Called to install this TC class on 'netdev'. The implementation should
167 * make the Netlink calls required to set up 'netdev' with the right qdisc
168 * and configure it according to 'details'. The implementation may assume
169 * that the current qdisc is the default; that is, there is no need for it
170 * to delete the current qdisc before installing itself.
172 * The contents of 'details' should be documented as valid for 'ovs_name'
173 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
174 * (which is built as ovs-vswitchd.conf.db(8)).
176 * This function must return 0 if and only if it sets 'netdev->tc' to an
177 * initialized 'struct tc'.
179 * (This function is null for tc_ops_other, which cannot be installed. For
180 * other TC classes it should always be nonnull.) */
181 int (*tc_install)(struct netdev *netdev, const struct shash *details);
183 /* Called when the netdev code determines (through a Netlink query) that
184 * this TC class's qdisc is installed on 'netdev', but we didn't install
185 * it ourselves and so don't know any of the details.
187 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
188 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
189 * implementation should parse the other attributes of 'nlmsg' as
190 * necessary to determine its configuration. If necessary it should also
191 * use Netlink queries to determine the configuration of queues on
194 * This function must return 0 if and only if it sets 'netdev->tc' to an
195 * initialized 'struct tc'. */
196 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
198 /* Destroys the data structures allocated by the implementation as part of
199 * 'tc'. (This includes destroying 'tc->queues' by calling
202 * The implementation should not need to perform any Netlink calls. If
203 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
204 * (But it may not be desirable.)
206 * This function may be null if 'tc' is trivial. */
207 void (*tc_destroy)(struct tc *tc);
209 /* Retrieves details of 'netdev->tc' configuration into 'details'.
211 * The implementation should not need to perform any Netlink calls, because
212 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
213 * cached the configuration.
215 * The contents of 'details' should be documented as valid for 'ovs_name'
216 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
217 * (which is built as ovs-vswitchd.conf.db(8)).
219 * This function may be null if 'tc' is not configurable.
221 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
223 /* Reconfigures 'netdev->tc' according to 'details', performing any
224 * required Netlink calls to complete the reconfiguration.
226 * The contents of 'details' should be documented as valid for 'ovs_name'
227 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
228 * (which is built as ovs-vswitchd.conf.db(8)).
230 * This function may be null if 'tc' is not configurable.
232 int (*qdisc_set)(struct netdev *, const struct shash *details);
234 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
235 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
237 * The contents of 'details' should be documented as valid for 'ovs_name'
238 * in the "other_config" column in the "Queue" table in
239 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
241 * The implementation should not need to perform any Netlink calls, because
242 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
243 * cached the queue configuration.
245 * This function may be null if 'tc' does not have queues ('n_queues' is
247 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
248 struct shash *details);
250 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
251 * 'details', performing any required Netlink calls to complete the
252 * reconfiguration. The caller ensures that 'queue_id' is less than
255 * The contents of 'details' should be documented as valid for 'ovs_name'
256 * in the "other_config" column in the "Queue" table in
257 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
259 * This function may be null if 'tc' does not have queues or its queues are
260 * not configurable. */
261 int (*class_set)(struct netdev *, unsigned int queue_id,
262 const struct shash *details);
264 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
265 * tc_queue's within 'netdev->tc->queues'.
267 * This function may be null if 'tc' does not have queues or its queues
268 * cannot be deleted. */
269 int (*class_delete)(struct netdev *, struct tc_queue *queue);
271 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
272 * 'struct tc_queue's within 'netdev->tc->queues'.
274 * On success, initializes '*stats'.
276 * This function may be null if 'tc' does not have queues or if it cannot
277 * report queue statistics. */
278 int (*class_get_stats)(const struct netdev *netdev,
279 const struct tc_queue *queue,
280 struct netdev_queue_stats *stats);
282 /* Extracts queue stats from 'nlmsg', which is a response to a
283 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
285 * This function may be null if 'tc' does not have queues or if it cannot
286 * report queue statistics. */
287 int (*class_dump_stats)(const struct netdev *netdev,
288 const struct ofpbuf *nlmsg,
289 netdev_dump_queue_stats_cb *cb, void *aux);
293 tc_init(struct tc *tc, const struct tc_ops *ops)
296 hmap_init(&tc->queues);
300 tc_destroy(struct tc *tc)
302 hmap_destroy(&tc->queues);
305 static const struct tc_ops tc_ops_htb;
306 static const struct tc_ops tc_ops_hfsc;
307 static const struct tc_ops tc_ops_default;
308 static const struct tc_ops tc_ops_other;
310 static const struct tc_ops *tcs[] = {
311 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
312 &tc_ops_hfsc, /* Hierarchical fair service curve. */
313 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
314 &tc_ops_other, /* Some other qdisc. */
318 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
319 static unsigned int tc_get_major(unsigned int handle);
320 static unsigned int tc_get_minor(unsigned int handle);
322 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
323 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
324 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
326 static struct tcmsg *tc_make_request(const struct netdev *, int type,
327 unsigned int flags, struct ofpbuf *);
328 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
330 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
331 struct nlattr **options);
332 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
333 struct nlattr **options,
334 struct netdev_queue_stats *);
335 static int tc_query_class(const struct netdev *,
336 unsigned int handle, unsigned int parent,
337 struct ofpbuf **replyp);
338 static int tc_delete_class(const struct netdev *, unsigned int handle);
340 static int tc_del_qdisc(struct netdev *netdev);
341 static int tc_query_qdisc(const struct netdev *netdev);
343 static int tc_calc_cell_log(unsigned int mtu);
344 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
345 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
346 const struct tc_ratespec *rate);
347 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
349 struct netdev_dev_linux {
350 struct netdev_dev netdev_dev;
352 struct shash_node *shash_node;
353 unsigned int cache_valid;
354 unsigned int change_seq;
356 bool miimon; /* Link status of last poll. */
357 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
358 struct timer miimon_timer;
360 /* The following are figured out "on demand" only. They are only valid
361 * when the corresponding VALID_* bit in 'cache_valid' is set. */
363 uint8_t etheraddr[ETH_ADDR_LEN];
364 struct in_addr address, netmask;
368 long long int carrier_resets;
369 uint32_t kbits_rate; /* Policing data. */
370 uint32_t kbits_burst;
371 bool have_vport_stats;
375 struct tap_state tap;
379 struct netdev_linux {
380 struct netdev netdev;
384 /* Sockets used for ioctl operations. */
385 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
387 /* A Netlink routing socket that is not subscribed to any multicast groups. */
388 static struct nl_sock *rtnl_sock;
390 /* This is set pretty low because we probably won't learn anything from the
391 * additional log messages. */
392 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
394 static int netdev_linux_init(void);
396 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
397 int cmd, const char *cmd_name);
398 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
399 const char *cmd_name);
400 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
401 int cmd, const char *cmd_name);
402 static int get_flags(const struct netdev *, int *flagsp);
403 static int set_flags(struct netdev *, int flags);
404 static int do_get_ifindex(const char *netdev_name);
405 static int get_ifindex(const struct netdev *, int *ifindexp);
406 static int do_set_addr(struct netdev *netdev,
407 int ioctl_nr, const char *ioctl_name,
408 struct in_addr addr);
409 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
410 static int set_etheraddr(const char *netdev_name, int hwaddr_family,
411 const uint8_t[ETH_ADDR_LEN]);
412 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
413 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
414 static int get_carrier_via_sysfs(const char *name, bool *carrier);
415 static int af_packet_sock(void);
416 static void netdev_linux_miimon_run(void);
417 static void netdev_linux_miimon_wait(void);
/* Tests whether 'netdev_class' uses netdev_linux_init() as its 'init'
 * callback.  The cast helpers in this file assert on this predicate before
 * converting generic objects to their netdev-linux representations. */
420 is_netdev_linux_class(const struct netdev_class *netdev_class)
422 return netdev_class->init == netdev_linux_init;
/* Converts generic 'netdev_dev' into the struct netdev_dev_linux that embeds
 * it, after asserting that its class satisfies is_netdev_linux_class(). */
425 static struct netdev_dev_linux *
426 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
428 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
429 assert(is_netdev_linux_class(netdev_class));
431 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
/* Converts generic 'netdev' into the struct netdev_linux that embeds it, after
 * asserting that the class of its underlying device satisfies
 * is_netdev_linux_class(). */
434 static struct netdev_linux *
435 netdev_linux_cast(const struct netdev *netdev)
437 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
438 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
439 assert(is_netdev_linux_class(netdev_class));
441 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
445 netdev_linux_init(void)
447 static int status = -1;
449 /* Create AF_INET socket. */
450 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
451 status = af_inet_sock >= 0 ? 0 : errno;
453 VLOG_ERR("failed to create inet socket: %s", strerror(status));
456 /* Create rtnetlink socket. */
458 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
460 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
469 netdev_linux_run(void)
471 rtnetlink_link_run();
472 netdev_linux_miimon_run();
476 netdev_linux_wait(void)
478 rtnetlink_link_wait();
479 netdev_linux_miimon_wait();
483 netdev_dev_linux_changed(struct netdev_dev_linux *dev)
486 if (!dev->change_seq) {
489 dev->cache_valid = 0;
/* Link-change callback installed by cache_notifier_ref().
 *
 * The first part looks up the device named by 'change->ifname' and, when its
 * class satisfies is_netdev_linux_class(), records a carrier transition and
 * calls netdev_dev_linux_changed() on it.  The second part walks every
 * netdev_linux_class device and refreshes its carrier from sysfs.
 *
 * NOTE(review): the branch that selects between the two parts is not visible
 * in this listing; presumably the second part runs when no specific change is
 * supplied -- confirm. */
493 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
494 void *aux OVS_UNUSED)
496 struct netdev_dev_linux *dev;
498 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
500 const struct netdev_class *netdev_class =
501 netdev_dev_get_class(base_dev);
503 if (is_netdev_linux_class(netdev_class)) {
504 dev = netdev_dev_linux_cast(base_dev);
/* Count a carrier reset only when 'change->running' differs from the cached
 * carrier value. */
506 if (dev->carrier != change->running) {
507 dev->carrier = change->running;
508 dev->carrier_resets++;
511 netdev_dev_linux_changed(dev);
515 struct shash device_shash;
516 struct shash_node *node;
518 shash_init(&device_shash);
519 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
520 SHASH_FOR_EACH (node, &device_shash) {
/* NOTE(review): the result of get_carrier_via_sysfs() is ignored here; verify
 * that it always stores a value in 'carrier', even when it fails. */
525 get_carrier_via_sysfs(node->name, &carrier);
526 if (dev->carrier != carrier) {
527 dev->carrier = carrier;
528 dev->carrier_resets++;
531 netdev_dev_linux_changed(dev);
533 shash_destroy(&device_shash);
/* Takes a reference on the shared rtnetlink link notifier.  When the reference
 * count is zero the notifier is created first, with netdev_linux_cache_cb() as
 * its callback; the count is then incremented.
 *
 * NOTE(review): the handling of a failed notifier creation is not visible in
 * this listing. */
538 cache_notifier_ref(void)
540 if (!cache_notifier_refcount) {
541 assert(!netdev_linux_cache_notifier);
543 netdev_linux_cache_notifier =
544 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
546 if (!netdev_linux_cache_notifier) {
550 cache_notifier_refcount++;
/* Drops one reference on the shared rtnetlink link notifier.  Asserts that at
 * least one reference is held; when the count reaches zero the notifier is
 * destroyed and the global pointer is reset to NULL. */
556 cache_notifier_unref(void)
558 assert(cache_notifier_refcount > 0);
559 if (!--cache_notifier_refcount) {
560 assert(netdev_linux_cache_notifier);
561 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
562 netdev_linux_cache_notifier = NULL;
566 /* Creates system and internal devices. */
568 netdev_linux_create(const struct netdev_class *class, const char *name,
569 struct netdev_dev **netdev_devp)
571 struct netdev_dev_linux *netdev_dev;
574 error = cache_notifier_ref();
579 netdev_dev = xzalloc(sizeof *netdev_dev);
580 netdev_dev->change_seq = 1;
581 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
582 get_carrier_via_sysfs(name, &netdev_dev->carrier);
584 *netdev_devp = &netdev_dev->netdev_dev;
588 /* For most types of netdevs we open the device for each call of
589 * netdev_open(). However, this is not the case with tap devices,
590 * since it is only possible to open the device once. In this
591 * situation we share a single file descriptor, and consequently
592 * buffers, across all readers. Therefore once data is read it will
593 * be unavailable to other reads for tap devices. */
595 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
596 const char *name, struct netdev_dev **netdev_devp)
598 struct netdev_dev_linux *netdev_dev;
599 struct tap_state *state;
600 static const char tap_dev[] = "/dev/net/tun";
/* The device object is allocated before any step that can fail, so every
 * failure path below has to account for it. */
604 netdev_dev = xzalloc(sizeof *netdev_dev);
605 state = &netdev_dev->state.tap;
607 error = cache_notifier_ref();
612 /* Open tap device. */
613 state->fd = open(tap_dev, O_RDWR);
616 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
617 goto error_unref_notifier;
620 /* Create tap device. */
621 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
622 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
623 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
624 VLOG_WARN("%s: creating tap device failed: %s", name,
627 goto error_unref_notifier;
630 /* Make non-blocking. */
631 error = set_nonblocking(state->fd);
633 goto error_unref_notifier;
/* On success the device is registered under netdev_tap_class (the 'class'
 * argument is unused) and handed back through '*netdev_devp'. */
636 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
637 *netdev_devp = &netdev_dev->netdev_dev;
/* NOTE(review): the TUNSETIFF and set_nonblocking() failures jump here with
 * 'state->fd' already open.  The lines visible in this listing do not show the
 * fd being closed or 'netdev_dev' being freed on this path -- confirm that
 * both are released. */
640 error_unref_notifier:
641 cache_notifier_unref();
648 destroy_tap(struct netdev_dev_linux *netdev_dev)
650 struct tap_state *state = &netdev_dev->state.tap;
652 if (state->fd >= 0) {
657 /* Destroys the netdev device 'netdev_dev_'. */
659 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
661 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
662 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
664 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
665 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
668 if (class == &netdev_tap_class) {
669 destroy_tap(netdev_dev);
673 cache_notifier_unref();
677 netdev_linux_open(struct netdev_dev *netdev_dev_, struct netdev **netdevp)
679 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
680 struct netdev_linux *netdev;
681 enum netdev_flags flags;
684 /* Allocate network device. */
685 netdev = xzalloc(sizeof *netdev);
687 netdev_init(&netdev->netdev, netdev_dev_);
689 /* Verify that the device really exists, by attempting to read its flags.
690 * (The flags might be cached, in which case this won't actually do an
693 * Don't do this for "internal" netdevs, though, because those have to be
694 * created as netdev objects before they exist in the kernel, because
695 * creating them in the kernel happens by passing a netdev object to
696 * dpif_port_add(). */
697 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
698 error = netdev_get_flags(&netdev->netdev, &flags);
699 if (error == ENODEV) {
704 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
705 !netdev_dev->state.tap.opened) {
707 /* We assume that the first user of the tap device is the primary user
708 * and give them the tap FD. Subsequent users probably just expect
709 * this to be a system device so open it normally to avoid send/receive
710 * directions appearing to be reversed. */
711 netdev->fd = netdev_dev->state.tap.fd;
712 netdev_dev->state.tap.opened = true;
715 *netdevp = &netdev->netdev;
719 netdev_uninit(&netdev->netdev, true);
723 /* Closes and destroys 'netdev'. */
/* Guard for releasing 'netdev_''s own file descriptor (the guarded statement
 * itself is not shown in this listing): only handles whose type is not "tap"
 * and that hold a valid descriptor qualify.
 *
 * Descriptor 0 is a valid fd, so the test is 'fd >= 0', matching the other fd
 * checks in this file (e.g. netdev_linux_listen(), netdev_linux_recv_wait()).
 * NOTE(review): this assumes netdev_linux_open() initializes 'fd' to -1, as
 * those checks imply -- confirm. */
725 netdev_linux_close(struct netdev *netdev_)
727 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
729 if (netdev->fd >= 0 && strcmp(netdev_get_type(netdev_), "tap")) {
736 netdev_linux_listen(struct netdev *netdev_)
738 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
739 struct sockaddr_ll sll;
744 if (netdev->fd >= 0) {
748 /* Create file descriptor. */
749 fd = socket(PF_PACKET, SOCK_RAW, 0);
752 VLOG_ERR("failed to create raw socket (%s)", strerror(error));
756 /* Set non-blocking mode. */
757 error = set_nonblocking(fd);
762 /* Get ethernet device index. */
763 error = get_ifindex(&netdev->netdev, &ifindex);
768 /* Bind to specific ethernet device. */
769 memset(&sll, 0, sizeof sll);
770 sll.sll_family = AF_PACKET;
771 sll.sll_ifindex = ifindex;
772 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
773 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
775 VLOG_ERR("%s: failed to bind raw socket (%s)",
776 netdev_get_name(netdev_), strerror(error));
/* Reads up to 'size' bytes from 'netdev_''s file descriptor into 'data'.  A
 * negative fd means the device is not listening.  Read errors other than
 * EINTR and EAGAIN are reported through a rate-limited warning of the form
 * "error receiving Ethernet packet on <device>: <error text>". */
791 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
793 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
795 if (netdev->fd < 0) {
796 /* Device is not listening. */
801 ssize_t retval = read(netdev->fd, data, size);
804 } else if (errno != EINTR) {
805 if (errno != EAGAIN) {
/* Arguments follow the format string: device name first, then the error
 * text, as in the matching warning in netdev_linux_send(). */
806 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
807 netdev_get_name(netdev_), strerror(errno));
814 /* Registers with the poll loop to wake up from the next call to poll_block()
815 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
817 netdev_linux_recv_wait(struct netdev *netdev_)
819 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
820 if (netdev->fd >= 0) {
821 poll_fd_wait(netdev->fd, POLLIN);
825 /* Discards all packets waiting to be received from 'netdev'. */
827 netdev_linux_drain(struct netdev *netdev_)
829 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
830 if (netdev->fd < 0) {
832 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
834 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
835 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
839 drain_fd(netdev->fd, ifr.ifr_qlen);
842 return drain_rcvbuf(netdev->fd);
846 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
847 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
848 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
849 * the packet is too big or too small to transmit on the device.
851 * The caller retains ownership of 'buffer' in all cases.
853 * The kernel maintains a packet transmission queue, so the caller is not
854 * expected to do additional queuing of packets. */
856 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
858 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
862 if (netdev->fd < 0) {
863 /* Use our AF_PACKET socket to send to this device. */
864 struct sockaddr_ll sll;
871 sock = af_packet_sock();
876 error = get_ifindex(netdev_, &ifindex);
881 /* We don't bother setting most fields in sockaddr_ll because the
882 * kernel ignores them for SOCK_RAW. */
883 memset(&sll, 0, sizeof sll);
884 sll.sll_family = AF_PACKET;
885 sll.sll_ifindex = ifindex;
887 iov.iov_base = (void *) data;
891 msg.msg_namelen = sizeof sll;
894 msg.msg_control = NULL;
895 msg.msg_controllen = 0;
898 retval = sendmsg(sock, &msg, 0);
900 /* Use the netdev's own fd to send to this device. This is
901 * essential for tap devices, because packets sent to a tap device
902 * with an AF_PACKET socket will loop back to be *received* again
903 * on the tap device. */
904 retval = write(netdev->fd, data, size);
908 /* The Linux AF_PACKET implementation never blocks waiting for room
909 * for packets, instead returning ENOBUFS. Translate this into
910 * EAGAIN for the caller. */
911 if (errno == ENOBUFS) {
913 } else if (errno == EINTR) {
915 } else if (errno != EAGAIN) {
916 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
917 netdev_get_name(netdev_), strerror(errno));
920 } else if (retval != size) {
921 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
922 "%zu) on %s", retval, size, netdev_get_name(netdev_));
930 /* Registers with the poll loop to wake up from the next call to poll_block()
931 * when the packet transmission queue has sufficient room to transmit a packet
932 * with netdev_send().
934 * The kernel maintains a packet transmission queue, so the client is not
935 * expected to do additional queuing of packets. Thus, this function is
936 * unlikely to ever be used. It is included for completeness. */
938 netdev_linux_send_wait(struct netdev *netdev_)
940 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
941 if (netdev->fd < 0) {
943 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
944 poll_fd_wait(netdev->fd, POLLOUT);
946 /* TAP device always accepts packets.*/
947 poll_immediate_wake();
951 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
952 * otherwise a positive errno value. */
954 netdev_linux_set_etheraddr(struct netdev *netdev_,
955 const uint8_t mac[ETH_ADDR_LEN])
957 struct netdev_dev_linux *netdev_dev =
958 netdev_dev_linux_cast(netdev_get_dev(netdev_));
961 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
962 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
963 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
965 netdev_dev->cache_valid |= VALID_ETHERADDR;
966 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
974 /* Copies 'netdev''s MAC address into 'mac', querying the kernel with
975 * get_etheraddr() only when no valid cached address is available. */
977 netdev_linux_get_etheraddr(const struct netdev *netdev_,
978 uint8_t mac[ETH_ADDR_LEN])
980 struct netdev_dev_linux *netdev_dev =
981 netdev_dev_linux_cast(netdev_get_dev(netdev_));
982 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
983 int error = get_etheraddr(netdev_get_name(netdev_),
984 netdev_dev->etheraddr);
988 netdev_dev->cache_valid |= VALID_ETHERADDR;
990 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
994 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
995 * in bytes, not including the hardware header; thus, this is typically 1500
996 * bytes for Ethernet devices. */
998 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1000 struct netdev_dev_linux *netdev_dev =
1001 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1002 if (!(netdev_dev->cache_valid & VALID_MTU)) {
1006 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1007 SIOCGIFMTU, "SIOCGIFMTU");
1011 netdev_dev->mtu = ifr.ifr_mtu;
1012 netdev_dev->cache_valid |= VALID_MTU;
1014 *mtup = netdev_dev->mtu;
1018 /* Sets the maximum transmission unit (MTU) of the given device using the
1019 * Linux networking ioctl interface.
1022 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1024 struct netdev_dev_linux *netdev_dev =
1025 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1030 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1031 SIOCSIFMTU, "SIOCSIFMTU");
1036 netdev_dev->mtu = ifr.ifr_mtu;
1037 netdev_dev->cache_valid |= VALID_MTU;
1041 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1042 * On failure, returns a negative errno value. */
1044 netdev_linux_get_ifindex(const struct netdev *netdev)
1048 error = get_ifindex(netdev, &ifindex);
1049 return error ? -error : ifindex;
1053 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1055 struct netdev_dev_linux *netdev_dev =
1056 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1058 if (netdev_dev->miimon_interval > 0) {
1059 *carrier = netdev_dev->miimon;
1061 *carrier = netdev_dev->carrier;
1067 static long long int
1068 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1070 return netdev_dev_linux_cast(netdev_get_dev(netdev))->carrier_resets;
1074 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1075 struct mii_ioctl_data *data)
1080 memset(&ifr, 0, sizeof ifr);
1081 memcpy(&ifr.ifr_data, data, sizeof *data);
1082 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1083 memcpy(data, &ifr.ifr_data, sizeof *data);
1089 netdev_linux_get_miimon(const char *name, bool *miimon)
1091 struct mii_ioctl_data data;
1096 memset(&data, 0, sizeof data);
1097 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1099 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1100 data.reg_num = MII_BMSR;
1101 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1105 *miimon = !!(data.val_out & BMSR_LSTATUS);
1107 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1110 struct ethtool_cmd ecmd;
1112 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1115 memset(&ecmd, 0, sizeof ecmd);
1116 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1119 struct ethtool_value eval;
1121 memcpy(&eval, &ecmd, sizeof eval);
1122 *miimon = !!eval.data;
1124 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
/* Sets the miimon polling interval of 'netdev_''s device.  A positive
 * 'interval' is raised to at least 100; any other value is stored as 0, which
 * disables miimon.  When the stored interval actually changes, the miimon
 * timer is marked expired. */
1132 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1133 long long int interval)
1135 struct netdev_dev_linux *netdev_dev;
1137 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
1139 interval = interval > 0 ? MAX(interval, 100) : 0;
1140 if (netdev_dev->miimon_interval != interval) {
1141 netdev_dev->miimon_interval = interval;
1142 timer_set_expired(&netdev_dev->miimon_timer);
/* Walks every netdev_linux_class device and queries its link state with
 * netdev_linux_get_miimon().  A device whose reported state differs from the
 * cached 'miimon' value is updated and passed to netdev_dev_linux_changed(),
 * and its timer is re-armed for 'miimon_interval'. */
1149 netdev_linux_miimon_run(void)
1151 struct shash device_shash;
1152 struct shash_node *node;
1154 shash_init(&device_shash);
1155 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1156 SHASH_FOR_EACH (node, &device_shash) {
1157 struct netdev_dev_linux *dev = node->data;
/* Tests for miimon disabled or timer not yet expired; the statement this
 * guards is not shown in this listing (presumably the device is skipped). */
1160 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
/* NOTE(review): the return value is ignored; confirm that
 * netdev_linux_get_miimon() always assigns '*miimon', even on failure. */
1164 netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
1165 if (miimon != dev->miimon) {
1166 dev->miimon = miimon;
1167 netdev_dev_linux_changed(dev);
1170 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1173 shash_destroy(&device_shash);
/* For every netdev_linux_class device with miimon enabled
 * ('miimon_interval' > 0), calls timer_wait() on its miimon timer. */
1177 netdev_linux_miimon_wait(void)
1179 struct shash device_shash;
1180 struct shash_node *node;
1182 shash_init(&device_shash);
1183 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1184 SHASH_FOR_EACH (node, &device_shash) {
1185 struct netdev_dev_linux *dev = node->data;
1187 if (dev->miimon_interval > 0) {
1188 timer_wait(&dev->miimon_timer);
1191 shash_destroy(&device_shash);
1194 /* Check whether we can use RTM_GETLINK to get network device statistics.
1195 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1198 check_for_working_netlink_stats(void)
1200 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1201 * preferable, so if that works, we'll use it. */
1202 int ifindex = do_get_ifindex("lo");
1204 VLOG_WARN("failed to get ifindex for lo, "
1205 "obtaining netdev stats from proc");
1208 struct netdev_stats stats;
1209 int error = get_stats_via_netlink(ifindex, &stats);
1211 VLOG_DBG("obtaining netdev stats via rtnetlink");
1214 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1215 "via proc (you are probably running a pre-2.6.19 "
1216 "kernel)", strerror(error));
/* Helper taking two uint64_t pointers.  Only the signature is visible in this
 * listing; netdev_pseudo_get_stats() calls it on rx/tx counter pairs, so it
 * presumably exchanges '*a' and '*b' -- confirm against the body. */
1223 swap_uint64(uint64_t *a, uint64_t *b)
/* Tries to fill 'stats' for 'netdev_' from the vport layer.
 *
 * The query is attempted when vport stats are already known to work
 * (have_vport_stats) or when that has not been determined yet
 * (VALID_HAVE_VPORT_STATS not set in cache_valid).  The result of the attempt
 * is cached: have_vport_stats becomes true only if the query returned 0. */
1231 get_stats_via_vport(const struct netdev *netdev_,
1232 struct netdev_stats *stats)
1234 struct netdev_dev_linux *netdev_dev =
1235 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1237 if (netdev_dev->have_vport_stats ||
1238 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1241 error = netdev_vport_get_stats(netdev_, stats);
1243 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed %d",
1244 netdev_get_name(netdev_), error);
/* Remember whether the vport query works so later calls can skip it. */
1246 netdev_dev->have_vport_stats = !error;
1247 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
/* Fetches kernel-side statistics for 'netdev_' into 'stats'.
 *
 * The first call decides, via check_for_working_netlink_stats(), whether
 * rtnetlink can be used; the decision is kept in a function-local static.
 * Afterwards stats come either from get_stats_via_netlink() (keyed by
 * ifindex) or from get_stats_via_proc() (keyed by device name).  A failure
 * is logged with the error number. */
1252 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1253 struct netdev_stats *stats)
/* -1 means "not probed yet". */
1255 static int use_netlink_stats = -1;
1258 if (use_netlink_stats < 0) {
1259 use_netlink_stats = check_for_working_netlink_stats();
1262 if (use_netlink_stats) {
1265 error = get_ifindex(netdev_, &ifindex);
1267 error = get_stats_via_netlink(ifindex, stats);
1270 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1274 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1275 netdev_get_name(netdev_), error);
1281 /* Retrieves current device stats for 'netdev-linux'. */
/* Two sources are consulted: the vport layer (written straight into 'stats')
 * and the kernel (written into the local 'dev_stats').  Which fields end up
 * in 'stats' depends on have_vport_stats; the visible tail of the function
 * adds the kernel's error/drop/collision counters onto 'stats'. */
1283 netdev_linux_get_stats(const struct netdev *netdev_,
1284 struct netdev_stats *stats)
1286 struct netdev_dev_linux *netdev_dev =
1287 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1288 struct netdev_stats dev_stats;
1291 get_stats_via_vport(netdev_, stats);
1293 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1296 if (!netdev_dev->have_vport_stats) {
1303 if (!netdev_dev->have_vport_stats) {
1304 /* stats not available from OVS then use ioctl stats. */
/* Accumulate the kernel-reported error counters into the result. */
1307 stats->rx_errors += dev_stats.rx_errors;
1308 stats->tx_errors += dev_stats.tx_errors;
1309 stats->rx_dropped += dev_stats.rx_dropped;
1310 stats->tx_dropped += dev_stats.tx_dropped;
1311 stats->multicast += dev_stats.multicast;
1312 stats->collisions += dev_stats.collisions;
1313 stats->rx_length_errors += dev_stats.rx_length_errors;
1314 stats->rx_over_errors += dev_stats.rx_over_errors;
1315 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1316 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1317 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1318 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1319 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1320 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1321 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1322 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1323 stats->tx_window_errors += dev_stats.tx_window_errors;
1328 /* Retrieves current device stats for 'netdev-tap' netdev or
1329 * netdev-internal. */
/* Like netdev_linux_get_stats(), this consults both the vport layer ('stats')
 * and the kernel ('dev_stats').  The visible code swaps the rx/tx packet,
 * byte, error and drop counters and zeroes the detailed error counters, and
 * it adds the kernel's drop/error counters crosswise (kernel tx onto rx and
 * vice versa) plus multicast and collisions. */
1331 netdev_pseudo_get_stats(const struct netdev *netdev_,
1332 struct netdev_stats *stats)
1334 struct netdev_dev_linux *netdev_dev =
1335 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1336 struct netdev_stats dev_stats;
1339 get_stats_via_vport(netdev_, stats);
1341 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1343 if (!netdev_dev->have_vport_stats) {
1350 /* If this port is an internal port then the transmit and receive stats
1351 * will appear to be swapped relative to the other ports since we are the
1352 * one sending the data, not a remote computer. For consistency, we swap
1353 * them back here. This does not apply if we are getting stats from the
1354 * vport layer because it always tracks stats from the perspective of the
1356 if (!netdev_dev->have_vport_stats) {
1358 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1359 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1360 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1361 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1362 stats->rx_length_errors = 0;
1363 stats->rx_over_errors = 0;
1364 stats->rx_crc_errors = 0;
1365 stats->rx_frame_errors = 0;
1366 stats->rx_fifo_errors = 0;
1367 stats->rx_missed_errors = 0;
1368 stats->tx_aborted_errors = 0;
1369 stats->tx_carrier_errors = 0;
1370 stats->tx_fifo_errors = 0;
1371 stats->tx_heartbeat_errors = 0;
1372 stats->tx_window_errors = 0;
1374 stats->rx_dropped += dev_stats.tx_dropped;
1375 stats->tx_dropped += dev_stats.rx_dropped;
1377 stats->rx_errors += dev_stats.tx_errors;
1378 stats->tx_errors += dev_stats.rx_errors;
1380 stats->multicast += dev_stats.multicast;
1381 stats->collisions += dev_stats.collisions;
1386 /* Stores the features supported by 'netdev' into each of '*current',
1387 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1388 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
1389 * successful, otherwise a positive errno value. */
/* NOTE(review): every assignment visible below writes through 'current',
 * 'advertised', 'supported' and 'peer' without a NULL check, which does not
 * match the "that are non-null" wording above.  Confirm that callers always
 * pass valid pointers (or that a wrapper substitutes dummies). */
1391 netdev_linux_get_features(const struct netdev *netdev,
1392 uint32_t *current, uint32_t *advertised,
1393 uint32_t *supported, uint32_t *peer)
1395 struct ethtool_cmd ecmd;
/* Query the driver's link settings with ETHTOOL_GSET. */
1398 memset(&ecmd, 0, sizeof ecmd);
1399 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1400 ETHTOOL_GSET, "ETHTOOL_GSET");
1405 /* Supported features. */
/* Translate each ethtool SUPPORTED_* bit into its OFPPF_* counterpart. */
1407 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1408 *supported |= OFPPF_10MB_HD;
1410 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1411 *supported |= OFPPF_10MB_FD;
1413 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1414 *supported |= OFPPF_100MB_HD;
1416 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1417 *supported |= OFPPF_100MB_FD;
1419 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1420 *supported |= OFPPF_1GB_HD;
1422 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1423 *supported |= OFPPF_1GB_FD;
1425 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1426 *supported |= OFPPF_10GB_FD;
1428 if (ecmd.supported & SUPPORTED_TP) {
1429 *supported |= OFPPF_COPPER;
1431 if (ecmd.supported & SUPPORTED_FIBRE) {
1432 *supported |= OFPPF_FIBER;
1434 if (ecmd.supported & SUPPORTED_Autoneg) {
1435 *supported |= OFPPF_AUTONEG;
1437 if (ecmd.supported & SUPPORTED_Pause) {
1438 *supported |= OFPPF_PAUSE;
1440 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1441 *supported |= OFPPF_PAUSE_ASYM;
1444 /* Advertised features. */
/* Same translation, this time from the ADVERTISED_* bits. */
1446 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1447 *advertised |= OFPPF_10MB_HD;
1449 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1450 *advertised |= OFPPF_10MB_FD;
1452 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1453 *advertised |= OFPPF_100MB_HD;
1455 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1456 *advertised |= OFPPF_100MB_FD;
1458 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1459 *advertised |= OFPPF_1GB_HD;
1461 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1462 *advertised |= OFPPF_1GB_FD;
1464 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1465 *advertised |= OFPPF_10GB_FD;
1467 if (ecmd.advertising & ADVERTISED_TP) {
1468 *advertised |= OFPPF_COPPER;
1470 if (ecmd.advertising & ADVERTISED_FIBRE) {
1471 *advertised |= OFPPF_FIBER;
1473 if (ecmd.advertising & ADVERTISED_Autoneg) {
1474 *advertised |= OFPPF_AUTONEG;
1476 if (ecmd.advertising & ADVERTISED_Pause) {
1477 *advertised |= OFPPF_PAUSE;
1479 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1480 *advertised |= OFPPF_PAUSE_ASYM;
1483 /* Current settings. */
/* Speed and duplex pick exactly one rate bit; 10G is reported as full duplex
 * regardless of ecmd.duplex. */
1484 if (ecmd.speed == SPEED_10) {
1485 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1486 } else if (ecmd.speed == SPEED_100) {
1487 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1488 } else if (ecmd.speed == SPEED_1000) {
1489 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1490 } else if (ecmd.speed == SPEED_10000) {
1491 *current = OFPPF_10GB_FD;
/* The port type adds the medium bit. */
1496 if (ecmd.port == PORT_TP) {
1497 *current |= OFPPF_COPPER;
1498 } else if (ecmd.port == PORT_FIBRE) {
1499 *current |= OFPPF_FIBER;
1503 *current |= OFPPF_AUTONEG;
1506 /* Peer advertisements. */
1507 *peer = 0; /* XXX */
1512 /* Set the features advertised by 'netdev' to 'advertise'. */
/* 'advertise' is a bitmap of OFPPF_* bits.  The current settings are read
 * with ETHTOOL_GSET, the advertising mask is rebuilt from scratch from
 * 'advertise', and the result is written back with ETHTOOL_SSET, whose return
 * value is returned. */
1514 netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1516 struct ethtool_cmd ecmd;
1519 memset(&ecmd, 0, sizeof ecmd);
1520 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1521 ETHTOOL_GSET, "ETHTOOL_GSET");
/* Start from an empty mask so bits absent from 'advertise' are cleared. */
1526 ecmd.advertising = 0;
1527 if (advertise & OFPPF_10MB_HD) {
1528 ecmd.advertising |= ADVERTISED_10baseT_Half;
1530 if (advertise & OFPPF_10MB_FD) {
1531 ecmd.advertising |= ADVERTISED_10baseT_Full;
1533 if (advertise & OFPPF_100MB_HD) {
1534 ecmd.advertising |= ADVERTISED_100baseT_Half;
1536 if (advertise & OFPPF_100MB_FD) {
1537 ecmd.advertising |= ADVERTISED_100baseT_Full;
1539 if (advertise & OFPPF_1GB_HD) {
1540 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1542 if (advertise & OFPPF_1GB_FD) {
1543 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1545 if (advertise & OFPPF_10GB_FD) {
1546 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1548 if (advertise & OFPPF_COPPER) {
1549 ecmd.advertising |= ADVERTISED_TP;
1551 if (advertise & OFPPF_FIBER) {
1552 ecmd.advertising |= ADVERTISED_FIBRE;
1554 if (advertise & OFPPF_AUTONEG) {
1555 ecmd.advertising |= ADVERTISED_Autoneg;
1557 if (advertise & OFPPF_PAUSE) {
1558 ecmd.advertising |= ADVERTISED_Pause;
1560 if (advertise & OFPPF_PAUSE_ASYM) {
1561 ecmd.advertising |= ADVERTISED_Asym_Pause;
1563 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1564 ETHTOOL_SSET, "ETHTOOL_SSET");
/* Shell command templates used by netdev_linux_set_policing().
 *
 * POLICE_ADD_CMD takes one argument: the device name (%s).
 * POLICE_CONFIG_CMD takes three: the device name (%s), the rate (%d, emitted
 * with a "kbit" suffix) and the burst (%d, emitted with a "k" suffix). */
1567 #define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress"
1568 #define POLICE_CONFIG_CMD "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 u32 match ip src 0.0.0.0/0 police rate %dkbit burst %dk mtu 65535 drop flowid :1"
/* Implementation outline: an RTM_DELQDISC request is built for handle ffff:0
 * with parent TC_H_INGRESS and kind "ingress", then sent with tc_transact().
 * ENOENT and EINVAL are excluded from the warning path.  The cached policing
 * rate and burst are reset to 0 and marked valid in the visible tail. */
1570 /* Remove ingress policing from 'netdev'. Returns 0 if successful, otherwise a
1571 * positive errno value.
1573 * This function is equivalent to running
1574 * /sbin/tc qdisc del dev %s handle ffff: ingress
1575 * but it is much, much faster.
1578 netdev_linux_remove_policing(struct netdev *netdev)
1580 struct netdev_dev_linux *netdev_dev =
1581 netdev_dev_linux_cast(netdev_get_dev(netdev));
1582 const char *netdev_name = netdev_get_name(netdev);
1584 struct ofpbuf request;
1585 struct tcmsg *tcmsg;
1588 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
1592 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
1593 tcmsg->tcm_parent = TC_H_INGRESS;
1594 nl_msg_put_string(&request, TCA_KIND, "ingress");
1595 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
1597 error = tc_transact(&request, NULL);
1598 if (error && error != ENOENT && error != EINVAL) {
1599 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1600 netdev_name, strerror(error));
1604 netdev_dev->kbits_rate = 0;
1605 netdev_dev->kbits_burst = 0;
1606 netdev_dev->cache_valid |= VALID_POLICING;
1610 /* Attempts to set input rate limiting (policing) policy. */
/* 'kbits_rate' and 'kbits_burst' are normalised first (see below) and
 * compared against the cached values; the cached-match branch carries the
 * comment "Assume that settings haven't changed".  Otherwise any existing
 * policing is removed and two /sbin/tc commands are run through system() to
 * install the ingress qdisc and the police filter.  On the visible success
 * path the new rate and burst are cached.
 *
 * NOTE(review): the commands are built by formatting the device name into a
 * shell string and passing it to system(); if a device name could ever
 * contain shell metacharacters this is an injection vector.  Also,
 * POLICE_CONFIG_CMD formats the two uint32_t values with %d. */
1612 netdev_linux_set_policing(struct netdev *netdev,
1613 uint32_t kbits_rate, uint32_t kbits_burst)
1615 struct netdev_dev_linux *netdev_dev =
1616 netdev_dev_linux_cast(netdev_get_dev(netdev));
1617 const char *netdev_name = netdev_get_name(netdev);
1620 COVERAGE_INC(netdev_set_policing);
1622 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1623 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1624 : kbits_burst); /* Stick with user-specified value. */
1626 if (netdev_dev->cache_valid & VALID_POLICING
1627 && netdev_dev->kbits_rate == kbits_rate
1628 && netdev_dev->kbits_burst == kbits_burst) {
1629 /* Assume that settings haven't changed since we last set them. */
/* Clear whatever policing is currently installed before adding the new one. */
1633 netdev_linux_remove_policing(netdev);
1635 snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
1636 if (system(command) != 0) {
1637 VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
1641 snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
1642 kbits_rate, kbits_burst);
1643 if (system(command) != 0) {
1644 VLOG_WARN_RL(&rl, "%s: problem configuring policing",
/* Cache the values that were just applied. */
1649 netdev_dev->kbits_rate = kbits_rate;
1650 netdev_dev->kbits_burst = kbits_burst;
1651 netdev_dev->cache_valid |= VALID_POLICING;
/* Adds to 'types' the OVS name of every entry in the NULL-terminated 'tcs'
 * table that has a tc_install callback and a non-empty ovs_name.  'netdev'
 * is unused. */
1658 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1661 const struct tc_ops **opsp;
1663 for (opsp = tcs; *opsp != NULL; opsp++) {
1664 const struct tc_ops *ops = *opsp;
1665 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1666 sset_add(types, ops->ovs_name);
/* Searches the NULL-terminated 'tcs' table for an entry whose ovs_name
 * equals 'name' (exact strcmp() match). */
1672 static const struct tc_ops *
1673 tc_lookup_ovs_name(const char *name)
1675 const struct tc_ops **opsp;
1677 for (opsp = tcs; *opsp != NULL; opsp++) {
1678 const struct tc_ops *ops = *opsp;
1679 if (!strcmp(name, ops->ovs_name)) {
/* Searches the NULL-terminated 'tcs' table for an entry whose linux_name
 * equals 'name'.  Entries with a null linux_name are never matched. */
1686 static const struct tc_ops *
1687 tc_lookup_linux_name(const char *name)
1689 const struct tc_ops **opsp;
1691 for (opsp = tcs; *opsp != NULL; opsp++) {
1692 const struct tc_ops *ops = *opsp;
1693 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Looks for the queue numbered 'queue_id' among 'netdev''s tc queues, scanning
 * only the hmap bucket selected by 'hash' (a precomputed hash passed in by
 * the caller). */
1700 static struct tc_queue *
1701 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1704 struct netdev_dev_linux *netdev_dev =
1705 netdev_dev_linux_cast(netdev_get_dev(netdev));
1706 struct tc_queue *queue;
1708 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1709 if (queue->queue_id == queue_id) {
/* Convenience wrapper around tc_find_queue__() that computes the hash as
 * hash_int(queue_id, 0). */
1716 static struct tc_queue *
1717 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1719 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Reports the capabilities of the QoS type named 'type' in 'caps'.  The type
 * is resolved with tc_lookup_ovs_name() and its n_queues is copied into
 * caps->n_queues.  'netdev' is unused. */
1723 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1725 struct netdev_qos_capabilities *caps)
1727 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1731 caps->n_queues = ops->n_queues;
/* Reports the QoS configuration of 'netdev'.  After tc_query_qdisc(), the OVS
 * name of the active tc implementation is stored in '*typep', and its
 * qdisc_get callback, when one exists, is invoked with 'details'. */
1736 netdev_linux_get_qos(const struct netdev *netdev,
1737 const char **typep, struct shash *details)
1739 struct netdev_dev_linux *netdev_dev =
1740 netdev_dev_linux_cast(netdev_get_dev(netdev));
1743 error = tc_query_qdisc(netdev);
1748 *typep = netdev_dev->tc->ops->ovs_name;
1749 return (netdev_dev->tc->ops->qdisc_get
1750 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
/* Configures QoS of kind 'type' on 'netdev' using 'details'.
 *
 * 'type' must resolve to a tc implementation that has a tc_install callback.
 * If that implementation is already the active one, only its qdisc_set
 * callback (if any) is applied; otherwise the existing qdisc is deleted and
 * the new one installed.  The asserts document the invariant that
 * netdev_dev->tc is NULL after deletion and non-NULL exactly when
 * installation succeeds. */
1755 netdev_linux_set_qos(struct netdev *netdev,
1756 const char *type, const struct shash *details)
1758 struct netdev_dev_linux *netdev_dev =
1759 netdev_dev_linux_cast(netdev_get_dev(netdev));
1760 const struct tc_ops *new_ops;
1763 new_ops = tc_lookup_ovs_name(type);
1764 if (!new_ops || !new_ops->tc_install) {
1768 error = tc_query_qdisc(netdev);
1773 if (new_ops == netdev_dev->tc->ops) {
1774 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1776 /* Delete existing qdisc. */
1777 error = tc_del_qdisc(netdev);
1781 assert(netdev_dev->tc == NULL);
1783 /* Install new qdisc. */
1784 error = new_ops->tc_install(netdev, details);
1785 assert((error == 0) == (netdev_dev->tc != NULL));
/* Retrieves the configuration of queue 'queue_id' on 'netdev' into 'details'.
 * After tc_query_qdisc(), the queue is located with tc_find_queue() and the
 * active implementation's class_get callback is used to fill 'details'. */
1792 netdev_linux_get_queue(const struct netdev *netdev,
1793 unsigned int queue_id, struct shash *details)
1795 struct netdev_dev_linux *netdev_dev =
1796 netdev_dev_linux_cast(netdev_get_dev(netdev));
1799 error = tc_query_qdisc(netdev);
1803 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1805 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
/* Configures queue 'queue_id' on 'netdev' from 'details'.  The request is
 * only forwarded to the active implementation's class_set callback when that
 * callback exists and 'queue_id' is below the implementation's n_queues. */
1811 netdev_linux_set_queue(struct netdev *netdev,
1812 unsigned int queue_id, const struct shash *details)
1814 struct netdev_dev_linux *netdev_dev =
1815 netdev_dev_linux_cast(netdev_get_dev(netdev));
1818 error = tc_query_qdisc(netdev);
1821 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1822 || !netdev_dev->tc->ops->class_set) {
1826 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
/* Deletes queue 'queue_id' from 'netdev'.  Requires the active tc
 * implementation to provide class_delete; the queue is located with
 * tc_find_queue() and handed to that callback. */
1830 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1832 struct netdev_dev_linux *netdev_dev =
1833 netdev_dev_linux_cast(netdev_get_dev(netdev));
1836 error = tc_query_qdisc(netdev);
1839 } else if (!netdev_dev->tc->ops->class_delete) {
1842 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1844 ? netdev_dev->tc->ops->class_delete(netdev, queue)
/* Retrieves statistics for queue 'queue_id' on 'netdev' into 'stats'.
 * Requires the active tc implementation to provide class_get_stats; the queue
 * is located with tc_find_queue() and handed to that callback. */
1850 netdev_linux_get_queue_stats(const struct netdev *netdev,
1851 unsigned int queue_id,
1852 struct netdev_queue_stats *stats)
1854 struct netdev_dev_linux *netdev_dev =
1855 netdev_dev_linux_cast(netdev_get_dev(netdev));
1858 error = tc_query_qdisc(netdev);
1861 } else if (!netdev_dev->tc->ops->class_get_stats) {
1864 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1866 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
/* Begins a Netlink dump of the traffic classes on 'netdev': builds an
 * RTM_GETTCLASS request with tcm_parent 0, starts 'dump' on rtnl_sock, and
 * releases the request buffer. */
1872 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1874 struct ofpbuf request;
1875 struct tcmsg *tcmsg;
1877 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1881 tcmsg->tcm_parent = 0;
1882 nl_dump_start(dump, rtnl_sock, &request);
/* The request buffer is no longer needed once the dump has started. */
1883 ofpbuf_uninit(&request);
/* Calls 'cb' once per queue configured on 'netdev', passing the queue id, the
 * queue's details (obtained through the active implementation's class_get
 * callback) and 'aux'.  A single 'details' shash is reused and cleared
 * between queues. */
1888 netdev_linux_dump_queues(const struct netdev *netdev,
1889 netdev_dump_queues_cb *cb, void *aux)
1891 struct netdev_dev_linux *netdev_dev =
1892 netdev_dev_linux_cast(netdev_get_dev(netdev));
1893 struct tc_queue *queue;
1894 struct shash details;
1898 error = tc_query_qdisc(netdev);
1901 } else if (!netdev_dev->tc->ops->class_get) {
1906 shash_init(&details);
1907 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
/* Drop the previous queue's entries before filling in this one. */
1908 shash_clear(&details);
1910 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
1912 (*cb)(queue->queue_id, &details, aux);
1917 shash_destroy(&details);
/* Dumps per-queue statistics for 'netdev'.  Requires the active tc
 * implementation to provide class_dump_stats; each message of a traffic-class
 * Netlink dump is passed to that callback together with 'cb' and 'aux'.
 * The final return prefers the error from nl_dump_done() and otherwise
 * returns 'last_error'. */
1923 netdev_linux_dump_queue_stats(const struct netdev *netdev,
1924 netdev_dump_queue_stats_cb *cb, void *aux)
1926 struct netdev_dev_linux *netdev_dev =
1927 netdev_dev_linux_cast(netdev_get_dev(netdev));
1928 struct nl_dump dump;
1933 error = tc_query_qdisc(netdev);
1936 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1941 if (!start_queue_dump(netdev, &dump)) {
1944 while (nl_dump_next(&dump, &msg)) {
1945 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1951 error = nl_dump_done(&dump);
1952 return error ? error : last_error;
/* Stores the IPv4 address and netmask of 'netdev_' in '*address' and
 * '*netmask'.
 *
 * The values are cached: when VALID_IN4 is not set they are fetched with
 * SIOCGIFADDR and SIOCGIFNETMASK and the flag is then set.  Returns
 * EADDRNOTAVAIL when the resulting address is INADDR_ANY, otherwise 0 on the
 * visible return path. */
1956 netdev_linux_get_in4(const struct netdev *netdev_,
1957 struct in_addr *address, struct in_addr *netmask)
1959 struct netdev_dev_linux *netdev_dev =
1960 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1962 if (!(netdev_dev->cache_valid & VALID_IN4)) {
1965 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
1966 SIOCGIFADDR, "SIOCGIFADDR");
1971 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
1972 SIOCGIFNETMASK, "SIOCGIFNETMASK");
1977 netdev_dev->cache_valid |= VALID_IN4;
1979 *address = netdev_dev->address;
1980 *netmask = netdev_dev->netmask;
1981 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
/* Assigns IPv4 'address' and 'netmask' to 'netdev_'.
 *
 * The address is set with SIOCSIFADDR; the cache is then updated with both
 * values, and the netmask is applied with SIOCSIFNETMASK only when 'address'
 * is not INADDR_ANY.
 *
 * NOTE(review): the cache is updated before the netmask ioctl runs, so a
 * failing SIOCSIFNETMASK appears to leave a cached netmask that was never
 * applied -- confirm whether that is intended. */
1985 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
1986 struct in_addr netmask)
1988 struct netdev_dev_linux *netdev_dev =
1989 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1992 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
1994 netdev_dev->cache_valid |= VALID_IN4;
1995 netdev_dev->address = address;
1996 netdev_dev->netmask = netmask;
1997 if (address.s_addr != INADDR_ANY) {
1998 error = do_set_addr(netdev_, SIOCSIFNETMASK,
1999 "SIOCSIFNETMASK", netmask);
/* Parses one text 'line' into the 16 bytes of 'in6' and an interface name in
 * 'ifname'.  netdev_linux_get_in6() feeds it lines read from
 * /proc/net/if_inet6.  The scan format reads sixteen 2-digit hex bytes,
 * skips four hex fields, and reads a name of at most 16 characters. */
2006 parse_if_inet6_line(const char *line,
2007 struct in6_addr *in6, char ifname[16 + 1])
2009 uint8_t *s6 = in6->s6_addr;
/* X8 is the conversion for one address byte: two hex digits into a uint8_t. */
2010 #define X8 "%2"SCNx8
2012 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2013 "%*x %*x %*x %*x %16s\n",
2014 &s6[0], &s6[1], &s6[2], &s6[3],
2015 &s6[4], &s6[5], &s6[6], &s6[7],
2016 &s6[8], &s6[9], &s6[10], &s6[11],
2017 &s6[12], &s6[13], &s6[14], &s6[15],
2021 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2022 * 'in6' is non-null) and returns true. Otherwise, returns false. */
/* NOTE(review): the visible code stores through 'in6' unconditionally (last
 * line of this block), so the "(if 'in6' is non-null)" wording above looks
 * stale; the return statement is not visible here, so the true/false wording
 * should also be checked against the actual return type.
 *
 * The address is cached under VALID_IN6.  On a cache miss it defaults to
 * in6addr_any and /proc/net/if_inet6 is scanned for a line whose interface
 * name equals this device's name. */
2024 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2026 struct netdev_dev_linux *netdev_dev =
2027 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2028 if (!(netdev_dev->cache_valid & VALID_IN6)) {
2032 netdev_dev->in6 = in6addr_any;
2034 file = fopen("/proc/net/if_inet6", "r");
2036 const char *name = netdev_get_name(netdev_);
2037 while (fgets(line, sizeof line, file)) {
2038 struct in6_addr in6_tmp;
2039 char ifname[16 + 1];
2040 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2041 && !strcmp(name, ifname))
2043 netdev_dev->in6 = in6_tmp;
2049 netdev_dev->cache_valid |= VALID_IN6;
2051 *in6 = netdev_dev->in6;
/* Initialises the generic socket address '*sa' as an AF_INET address holding
 * 'addr'.  A zeroed sockaddr_in is filled in locally and then copied over a
 * zeroed '*sa'. */
2056 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2058 struct sockaddr_in sin;
2059 memset(&sin, 0, sizeof sin);
2060 sin.sin_family = AF_INET;
2061 sin.sin_addr = addr;
/* Zero the destination first so any bytes not covered by the copy are 0. */
2064 memset(sa, 0, sizeof *sa);
2065 memcpy(sa, &sin, sizeof sin);
/* Issues the address-setting ioctl 'ioctl_nr' (named 'ioctl_name' for
 * logging) on 'netdev' with IPv4 address 'addr'.  The ifreq is filled with
 * the device name and the address, and the call is delegated to
 * netdev_linux_do_ioctl(), whose result is returned. */
2069 do_set_addr(struct netdev *netdev,
2070 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2073 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2074 make_in4_sockaddr(&ifr.ifr_addr, addr);
2076 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2080 /* Adds 'router' as a default IP gateway. */
/* The route is described with destination and genmask INADDR_ANY, gateway
 * 'router', and flags RTF_UP | RTF_GATEWAY, then installed with the
 * SIOCADDRT ioctl on af_inet_sock.  A failure is logged.  'netdev' is
 * unused. */
2082 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2084 struct in_addr any = { INADDR_ANY };
2088 memset(&rt, 0, sizeof rt);
2089 make_in4_sockaddr(&rt.rt_dst, any);
2090 make_in4_sockaddr(&rt.rt_gateway, router);
2091 make_in4_sockaddr(&rt.rt_genmask, any);
2092 rt.rt_flags = RTF_UP | RTF_GATEWAY;
/* ioctl() reports failure as a negative return; convert that to errno. */
2093 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2095 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
/* Looks up the route towards '*host' by scanning /proc/net/route.
 *
 * '*netdev_name' starts out NULL.  For the matching route (destination and
 * host agree under the route's mask, and the route is up), '*next_hop' is
 * set to 0 for a directly reachable host or to the gateway otherwise, and
 * '*netdev_name' receives an xstrdup()'d copy of the interface name, which
 * the caller presumably must free -- confirm. */
2101 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2104 static const char fn[] = "/proc/net/route";
2109 *netdev_name = NULL;
2110 stream = fopen(fn, "r");
2111 if (stream == NULL) {
2112 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2117 while (fgets(line, sizeof line, stream)) {
2120 ovs_be32 dest, gateway, mask;
2121 int refcnt, metric, mtu;
2122 unsigned int flags, use, window, irtt;
/* A well-formed route line yields exactly 11 converted fields. */
2125 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2127 iface, &dest, &gateway, &flags, &refcnt,
2128 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2130 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2134 if (!(flags & RTF_UP)) {
2135 /* Skip routes that aren't up. */
2139 /* The output of 'dest', 'mask', and 'gateway' were given in
2140 * network byte order, so we don't need any endian
2141 * conversions here. */
2142 if ((dest & mask) == (host->s_addr & mask)) {
2144 /* The host is directly reachable. */
2145 next_hop->s_addr = 0;
2147 /* To reach the host, we must go through a gateway. */
2148 next_hop->s_addr = gateway;
2150 *netdev_name = xstrdup(iface);
/* Adds driver information for 'netdev' to 'sh'.  The data comes from an
 * ETHTOOL_GDRVINFO query; the keys added are "driver_name", "driver_version"
 * and "firmware_version", each mapped to an xstrdup()'d string. */
2162 netdev_linux_get_status(const struct netdev *netdev, struct shash *sh)
2164 struct ethtool_drvinfo drvinfo;
2167 memset(&drvinfo, 0, sizeof drvinfo);
/* netdev_linux_do_ethtool() takes a struct ethtool_cmd pointer, hence the
 * cast of the drvinfo buffer. */
2168 error = netdev_linux_do_ethtool(netdev_get_name(netdev),
2169 (struct ethtool_cmd *)&drvinfo,
2171 "ETHTOOL_GDRVINFO");
2173 shash_add(sh, "driver_name", xstrdup(drvinfo.driver));
2174 shash_add(sh, "driver_version", xstrdup(drvinfo.version));
2175 shash_add(sh, "firmware_version", xstrdup(drvinfo.fw_version));
2181 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2182 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2183 * returns 0. Otherwise, it returns a positive errno value; in particular,
2184 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
2186 netdev_linux_arp_lookup(const struct netdev *netdev,
2187 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2190 struct sockaddr_in sin;
/* Build the SIOCGARP request: protocol address = 'ip', hardware type =
 * Ethernet, device = this netdev's name. */
2193 memset(&r, 0, sizeof r);
2194 memset(&sin, 0, sizeof sin);
2195 sin.sin_family = AF_INET;
2196 sin.sin_addr.s_addr = ip;
2198 memcpy(&r.arp_pa, &sin, sizeof sin);
2199 r.arp_ha.sa_family = ARPHRD_ETHER;
2201 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2202 COVERAGE_INC(netdev_arp_lookup);
2203 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2205 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
/* ENXIO ("no such entry") is an expected outcome and is not logged. */
2206 } else if (retval != ENXIO) {
2207 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2208 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
/* Converts a set of NETDEV_* flags in 'nd' to interface flags.  NETDEV_UP and
 * NETDEV_PROMISC are the two flags tested in the visible code; the values
 * they map to are not visible in this listing. */
2214 nd_to_iff_flags(enum netdev_flags nd)
2217 if (nd & NETDEV_UP) {
2220 if (nd & NETDEV_PROMISC) {
/* Converts interface flags in 'iff' to a set of NETDEV_* flags, starting from
 * an empty set.  The visible code maps IFF_PROMISC to NETDEV_PROMISC. */
2227 iff_to_nd_flags(int iff)
2229 enum netdev_flags nd = 0;
2233 if (iff & IFF_PROMISC) {
2234 nd |= NETDEV_PROMISC;
/* Clears the flags in 'off' and sets the flags in 'on' on 'netdev'.  The
 * flags in effect before the change are reported through '*old_flagsp'
 * (converted to NETDEV_* form).  set_flags() is only called when the
 * computed interface flags differ from the current ones. */
2240 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2241 enum netdev_flags on, enum netdev_flags *old_flagsp)
2243 int old_flags, new_flags;
2246 error = get_flags(netdev, &old_flags);
2248 *old_flagsp = iff_to_nd_flags(old_flags);
/* Remove the 'off' bits first, then add the 'on' bits, so a flag present in
 * both ends up set. */
2249 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2250 if (new_flags != old_flags) {
2251 error = set_flags(netdev, new_flags);
/* Returns the change_seq counter stored in 'netdev''s underlying device. */
2258 netdev_linux_change_seq(const struct netdev *netdev)
2260 return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
/* Expands to an initializer for a netdev class whose callbacks are the
 * netdev_linux_* functions listed below.  The macro parameters (NAME, CREATE,
 * GET_STATS, SET_STATS) are the parts that vary between the classes defined
 * after it; the lines that consume them are not visible in this listing. */
2263 #define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS) \
2267 netdev_linux_init, \
2269 netdev_linux_wait, \
2272 netdev_linux_destroy, \
2273 NULL, /* get_config */ \
2274 NULL, /* set_config */ \
2276 netdev_linux_open, \
2277 netdev_linux_close, \
2279 netdev_linux_listen, \
2280 netdev_linux_recv, \
2281 netdev_linux_recv_wait, \
2282 netdev_linux_drain, \
2284 netdev_linux_send, \
2285 netdev_linux_send_wait, \
2287 netdev_linux_set_etheraddr, \
2288 netdev_linux_get_etheraddr, \
2289 netdev_linux_get_mtu, \
2290 netdev_linux_set_mtu, \
2291 netdev_linux_get_ifindex, \
2292 netdev_linux_get_carrier, \
2293 netdev_linux_get_carrier_resets, \
2294 netdev_linux_set_miimon_interval, \
2298 netdev_linux_get_features, \
2299 netdev_linux_set_advertisements, \
2301 netdev_linux_set_policing, \
2302 netdev_linux_get_qos_types, \
2303 netdev_linux_get_qos_capabilities, \
2304 netdev_linux_get_qos, \
2305 netdev_linux_set_qos, \
2306 netdev_linux_get_queue, \
2307 netdev_linux_set_queue, \
2308 netdev_linux_delete_queue, \
2309 netdev_linux_get_queue_stats, \
2310 netdev_linux_dump_queues, \
2311 netdev_linux_dump_queue_stats, \
2313 netdev_linux_get_in4, \
2314 netdev_linux_set_in4, \
2315 netdev_linux_get_in6, \
2316 netdev_linux_add_router, \
2317 netdev_linux_get_next_hop, \
2318 netdev_linux_get_status, \
2319 netdev_linux_arp_lookup, \
2321 netdev_linux_update_flags, \
2323 netdev_linux_change_seq \
/* Netdev class created with netdev_linux_create(); statistics are read with
 * netdev_linux_get_stats() and cannot be set. */
2326 const struct netdev_class netdev_linux_class =
2329 netdev_linux_create,
2330 netdev_linux_get_stats,
2331 NULL); /* set_stats */
/* Netdev class created with netdev_linux_create_tap(); statistics are read
 * with netdev_pseudo_get_stats() and cannot be set. */
2333 const struct netdev_class netdev_tap_class =
2336 netdev_linux_create_tap,
2337 netdev_pseudo_get_stats,
2338 NULL); /* set_stats */
/* Netdev class created with netdev_linux_create(); statistics are read with
 * netdev_pseudo_get_stats() and set with netdev_vport_set_stats(). */
2340 const struct netdev_class netdev_internal_class =
2343 netdev_linux_create,
2344 netdev_pseudo_get_stats,
2345 netdev_vport_set_stats);
2347 /* HTB traffic control class. */
/* HTB_N_QUEUES bounds the class minor numbers accepted as queues by
 * htb_parse_tcmsg__() (minor in 1..HTB_N_QUEUES maps to queue minor - 1). */
2349 #define HTB_N_QUEUES 0xf000
/* The struct headers are not visible in this listing.  The first max_rate
 * below presumably belongs to the qdisc-level struct ('struct htb') and the
 * remaining fields to the per-queue 'struct htb_class' -- confirm. */
2353 unsigned int max_rate; /* In bytes/s. */
2357 struct tc_queue tc_queue;
2358 unsigned int min_rate; /* In bytes/s. */
2359 unsigned int max_rate; /* In bytes/s. */
2360 unsigned int burst; /* In bytes. */
2361 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the 'struct htb' that embeds 'netdev''s current tc object as its
 * 'tc' member.  The result is only meaningful when the active tc object
 * really is embedded in a struct htb (the code does not check). */
2365 htb_get__(const struct netdev *netdev)
2367 struct netdev_dev_linux *netdev_dev =
2368 netdev_dev_linux_cast(netdev_get_dev(netdev));
2369 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
/* Allocates a new HTB state object for 'netdev', initialises its tc part with
 * tc_ops_htb, records 'max_rate', and makes it the device's active tc object.
 *
 * NOTE(review): 'max_rate' is a uint64_t parameter, while the max_rate
 * fields declared for the HTB structs above are unsigned int, so the
 * assignment below may narrow large values -- confirm. */
2373 htb_install__(struct netdev *netdev, uint64_t max_rate)
2375 struct netdev_dev_linux *netdev_dev =
2376 netdev_dev_linux_cast(netdev_get_dev(netdev));
2379 htb = xmalloc(sizeof *htb);
2380 tc_init(&htb->tc, &tc_ops_htb);
2381 htb->max_rate = max_rate;
2383 netdev_dev->tc = &htb->tc;
2386 /* Create an HTB qdisc.
2388 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
/* Any existing qdisc is deleted first (the result of that call is not used in
 * the visible code).  The new qdisc is requested with RTM_NEWQDISC and
 * NLM_F_EXCL | NLM_F_CREATE, handle 1:0, parent TC_H_ROOT, kind "htb", and a
 * nested TCA_OPTIONS attribute carrying a tc_htb_glob as TCA_HTB_INIT.
 * Returns the result of tc_transact(). */
2390 htb_setup_qdisc__(struct netdev *netdev)
2393 struct tc_htb_glob opt;
2394 struct ofpbuf request;
2395 struct tcmsg *tcmsg;
2397 tc_del_qdisc(netdev);
2399 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2400 NLM_F_EXCL | NLM_F_CREATE, &request);
2404 tcmsg->tcm_handle = tc_make_handle(1, 0);
2405 tcmsg->tcm_parent = TC_H_ROOT;
2407 nl_msg_put_string(&request, TCA_KIND, "htb");
2409 memset(&opt, 0, sizeof opt);
2410 opt.rate2quantum = 10;
2414 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2415 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2416 nl_msg_end_nested(&request, opt_offset);
2418 return tc_transact(&request, NULL);
2421 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2422 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
/* The device MTU is required because the rate tables and buffer sizes are
 * computed from it; a device without an MTU produces a warning.  The class
 * is sent as RTM_NEWTCLASS with NLM_F_CREATE, kind "htb", and a nested
 * TCA_OPTIONS attribute holding the tc_htb_opt plus the rate and ceil tables.
 * A failed transaction is logged with all class parameters. */
2424 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2425 unsigned int parent, struct htb_class *class)
2428 struct tc_htb_opt opt;
2429 struct ofpbuf request;
2430 struct tcmsg *tcmsg;
2434 error = netdev_get_mtu(netdev, &mtu);
2436 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2437 netdev_get_name(netdev));
/* Translate the class parameters into the kernel's tc_htb_opt layout. */
2441 memset(&opt, 0, sizeof opt);
2442 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2443 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2444 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2445 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2446 opt.prio = class->priority;
2448 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2452 tcmsg->tcm_handle = handle;
2453 tcmsg->tcm_parent = parent;
2455 nl_msg_put_string(&request, TCA_KIND, "htb");
2456 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2457 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2458 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2459 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2460 nl_msg_end_nested(&request, opt_offset);
2462 error = tc_transact(&request, NULL);
2464 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2465 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2466 netdev_get_name(netdev),
2467 tc_get_major(handle), tc_get_minor(handle),
2468 tc_get_major(parent), tc_get_minor(parent),
2469 class->min_rate, class->max_rate,
2470 class->burst, class->priority, strerror(error));
/* Summary of the visible code: the nested attributes in 'nl_options' are
 * parsed against a policy that requires TCA_HTB_PARMS to be present and at
 * least sizeof(struct tc_htb_opt) bytes long; a parse failure is logged.
 * 'class' is then filled from the tc_htb_opt: min_rate from rate.rate,
 * max_rate from ceil.rate, burst via tc_ticks_to_bytes(), priority from prio.
 *
 * NOTE(review): the comment below speaks of storing into 'details', but the
 * output parameter visible in the signature is 'class'. */
2475 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2476 * description of them into 'details'. The description complies with the
2477 * specification given in the vswitch database documentation for linux-htb
2480 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2482 static const struct nl_policy tca_htb_policy[] = {
2483 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2484 .min_len = sizeof(struct tc_htb_opt) },
2487 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2488 const struct tc_htb_opt *htb;
2490 if (!nl_parse_nested(nl_options, tca_htb_policy,
2491 attrs, ARRAY_SIZE(tca_htb_policy))) {
2492 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2496 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2497 class->min_rate = htb->rate.rate;
2498 class->max_rate = htb->ceil.rate;
2499 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2500 class->priority = htb->prio;
/* Parses the traffic-class message 'tcmsg'.
 *
 * tc_parse_class() extracts the class handle, its options attribute and
 * (through 'stats') its statistics.  When 'queue_id' is non-null, a handle
 * with major 1 and minor in 1..HTB_N_QUEUES is translated to queue number
 * minor - 1.  When 'options' is non-null, the options attribute is decoded
 * into it with htb_parse_tca_options__(). */
2505 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2506 struct htb_class *options,
2507 struct netdev_queue_stats *stats)
2509 struct nlattr *nl_options;
2510 unsigned int handle;
2513 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2514 if (!error && queue_id) {
2515 unsigned int major = tc_get_major(handle);
2516 unsigned int minor = tc_get_minor(handle);
/* Queue ids are zero-based while class minors start at 1. */
2517 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2518 *queue_id = minor - 1;
2523 if (!error && options) {
2524 error = htb_parse_tca_options__(nl_options, options);
/* Derives the qdisc-level HTB class parameters for 'netdev' from 'details'
 * and stores them in 'hc'.
 *
 * The "max-rate" entry of 'details' is parsed as a base-10 number and divided
 * by 8 (the HTB struct fields are documented as bytes/s, so the entry is
 * presumably in bits/s -- confirm).  When the entry is missing or yields 0,
 * the rate falls back to the speed implied by the device's current features.
 * hc->min_rate is set equal to hc->max_rate. */
2530 htb_parse_qdisc_details__(struct netdev *netdev,
2531 const struct shash *details, struct htb_class *hc)
2533 const char *max_rate_s;
2535 max_rate_s = shash_find_data(details, "max-rate");
2536 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2537 if (!hc->max_rate) {
/* Only the 'current' feature bitmap is needed; the other outputs are passed
 * as NULL.  (This argument had been corrupted to a mis-decoded character
 * sequence; it is the address of the local 'current' used on the next
 * line.) */
2540 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2541 hc->max_rate = netdev_features_to_bps(current) / 8;
2543 hc->min_rate = hc->max_rate;
/* Derives per-queue HTB class parameters for 'netdev' from 'details' and
 * stores them in 'hc'.
 *
 * The keys read are "min-rate", "max-rate", "burst" and "priority".  The
 * first three are parsed as base-10 numbers and divided by 8; priority is
 * parsed as-is.  Missing keys default to 0 before clamping, except
 * "max-rate", whose default expression is not visible in this listing.  The
 * device MTU is needed for the clamps below; a device without one produces a
 * warning.
 *
 * NOTE(review): the strtoull() results are stored into the htb_class fields
 * (declared unsigned int above), so very large inputs may be narrowed. */
2549 htb_parse_class_details__(struct netdev *netdev,
2550 const struct shash *details, struct htb_class *hc)
2552 const struct htb *htb = htb_get__(netdev);
2553 const char *min_rate_s = shash_find_data(details, "min-rate");
2554 const char *max_rate_s = shash_find_data(details, "max-rate");
2555 const char *burst_s = shash_find_data(details, "burst");
2556 const char *priority_s = shash_find_data(details, "priority");
2559 error = netdev_get_mtu(netdev, &mtu);
2561 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2562 netdev_get_name(netdev));
2566 /* HTB requires at least an mtu sized min-rate to send any traffic even
2567 * on uncongested links. */
2568 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2569 hc->min_rate = MAX(hc->min_rate, mtu);
2570 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
/* max_rate is kept within [min_rate, qdisc max_rate]. */
2573 hc->max_rate = (max_rate_s
2574 ? strtoull(max_rate_s, NULL, 10) / 8
2576 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2577 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2581 * According to hints in the documentation that I've read, it is important
2582 * that 'burst' be at least as big as the largest frame that might be
2583 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2584 * but having it a bit too small is a problem. Since netdev_get_mtu()
2585 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2586 * the MTU. We actually add 64, instead of 14, as a guard against
2587 * additional headers get tacked on somewhere that we're not aware of. */
2588 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2589 hc->burst = MAX(hc->burst, mtu + 64);
2592 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Asks the kernel, via tc_query_class(), for class 'handle' under 'parent' on
 * 'netdev', decodes the reply with htb_parse_tcmsg__() into 'options' and
 * 'stats', and then frees the reply.  No queue ID is requested from the parser
 * (NULL is passed for it).
 *
 * NOTE(review): the check on the query's result and the return statement are
 * on lines missing from this listing. */
2598 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2599 unsigned int parent, struct htb_class *options,
2600 struct netdev_queue_stats *stats)
2602 struct ofpbuf *reply;
2605 error = tc_query_class(netdev, handle, parent, &reply);
2607 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2608 ofpbuf_delete(reply);
/* Installs HTB on 'netdev': creates the qdisc with htb_setup_qdisc__(),
 * derives the qdisc-wide class parameters from 'details', creates class
 * 1:fffe under 1:0 with them, and records the new state with
 * htb_install__(), passing the parsed maximum rate.
 *
 * NOTE(review): the error checks that gate these steps and the return
 * statement are on lines missing from this listing. */
2614 htb_tc_install(struct netdev *netdev, const struct shash *details)
2618 error = htb_setup_qdisc__(netdev);
2620 struct htb_class hc;
2622 htb_parse_qdisc_details__(netdev, details, &hc);
2623 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2624 tc_make_handle(1, 0), &hc);
2626 htb_install__(netdev, hc.max_rate);
/* Returns the struct htb_class that embeds 'queue' as its 'tc_queue' member.
 * The caller must only pass queues that really live inside an htb_class;
 * nothing here checks that. */
2632 static struct htb_class *
2633 htb_class_cast__(const struct tc_queue *queue)
2635 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Records the parameters in 'hc' for queue 'queue_id' in the htb state
 * attached to 'netdev'.
 *
 * The queue is looked up with tc_find_queue__().  The code below contains
 * both a path that reuses the existing htb_class and a path that allocates a
 * new one and inserts it into 'htb->tc.queues'; either way min_rate, max_rate,
 * burst and priority are then copied from 'hc'.
 *
 * NOTE(review): the condition that selects between the two paths is on a line
 * missing from this listing (presumably whether the lookup found a queue). */
2639 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2640 const struct htb_class *hc)
2642 struct htb *htb = htb_get__(netdev);
2643 size_t hash = hash_int(queue_id, 0);
2644 struct tc_queue *queue;
2645 struct htb_class *hcp;
2647 queue = tc_find_queue__(netdev, queue_id, hash);
2649 hcp = htb_class_cast__(queue);
2651 hcp = xmalloc(sizeof *hcp);
2652 queue = &hcp->tc_queue;
2653 queue->queue_id = queue_id;
2654 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2657 hcp->min_rate = hc->min_rate;
2658 hcp->max_rate = hc->max_rate;
2659 hcp->burst = hc->burst;
2660 hcp->priority = hc->priority;
/* Builds the in-memory HTB state for 'netdev' from what is already configured
 * in the kernel.  'nlmsg' is unused.
 *
 * Class 1:fffe is queried for the qdisc-wide maximum rate, which is handed to
 * htb_install__().  The classes are then dumped, and every message that
 * htb_parse_tcmsg__() accepts (returns 0 for) is recorded with
 * htb_update_queue__().
 *
 * NOTE(review): the result of the 1:fffe query is not checked on the visible
 * lines; how 'hc.max_rate' is initialized beforehand, and what is returned,
 * are on lines missing from this listing. */
2664 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2667 struct nl_dump dump;
2668 struct htb_class hc;
2670 /* Get qdisc options. */
2672 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2673 htb_install__(netdev, hc.max_rate);
2676 if (!start_queue_dump(netdev, &dump)) {
2679 while (nl_dump_next(&dump, &msg)) {
2680 unsigned int queue_id;
2682 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2683 htb_update_queue__(netdev, queue_id, &hc);
2686 nl_dump_done(&dump);
/* Tears down the htb that embeds 'tc': every htb_class is unlinked from
 * 'htb->tc.queues' using the deletion-safe iterator.
 *
 * NOTE(review): whatever releases the classes and the htb itself is on lines
 * missing from this listing. */
2692 htb_tc_destroy(struct tc *tc)
2694 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2695 struct htb_class *hc, *next;
2697 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2698 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Reports the qdisc-wide HTB configuration of 'netdev' by adding a "max-rate"
 * entry to 'details'.  The value is a newly allocated decimal string holding
 * 8 times the stored 'max_rate'. */
2706 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2708 const struct htb *htb = htb_get__(netdev);
2709 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
/* Reconfigures the qdisc-wide HTB parameters of 'netdev' from 'details':
 * class 1:fffe under 1:0 is set up again with the parsed values, and the
 * cached 'max_rate' in the htb state is updated.
 *
 * NOTE(review): the check that guards the cache update and the return
 * statement are on lines missing from this listing. */
2714 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2716 struct htb_class hc;
2719 htb_parse_qdisc_details__(netdev, details, &hc);
2720 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2721 tc_make_handle(1, 0), &hc);
2723 htb_get__(netdev)->max_rate = hc.max_rate;
/* Reports the configuration of the HTB class behind 'queue' by adding entries
 * to 'details'; 'netdev' is unused.
 *
 * "min-rate" is always added.  "max-rate" is added only when it differs from
 * the minimum rate.  "burst" and "priority" are also added below.  Rates and
 * burst are reported as 8 times the stored value; every value is a newly
 * allocated decimal string.
 *
 * NOTE(review): any conditions guarding "burst" and "priority" are on lines
 * missing from this listing. */
2729 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2730 const struct tc_queue *queue, struct shash *details)
2732 const struct htb_class *hc = htb_class_cast__(queue);
2734 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2735 if (hc->min_rate != hc->max_rate) {
2736 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2738 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2740 shash_add(details, "priority", xasprintf("%u", hc->priority));
/* Creates or reconfigures HTB queue 'queue_id' on 'netdev' from 'details'.
 *
 * The details are parsed with htb_parse_class_details__(), the kernel class
 * 1:<queue_id + 1> is set up under parent 1:fffe, and the in-memory queue is
 * updated with htb_update_queue__().
 *
 * NOTE(review): the error checks between these steps and the return statement
 * are on lines missing from this listing. */
2746 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2747 const struct shash *details)
2749 struct htb_class hc;
2752 error = htb_parse_class_details__(netdev, details, &hc);
2757 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2758 tc_make_handle(1, 0xfffe), &hc);
2763 htb_update_queue__(netdev, queue_id, &hc);
/* Deletes the HTB class behind 'queue' on 'netdev': the kernel class
 * 1:<queue_id + 1> is removed with tc_delete_class() and the queue is unlinked
 * from 'htb->tc.queues'.
 *
 * NOTE(review): the check that guards the unlink, any freeing of the class,
 * and the return statement are on lines missing from this listing. */
2768 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2770 struct htb_class *hc = htb_class_cast__(queue);
2771 struct htb *htb = htb_get__(netdev);
2774 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2776 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches the statistics of the HTB class behind 'queue' on 'netdev' into
 * 'stats' by querying class 1:<queue_id + 1> under parent 1:fffe.  Class
 * options are not requested (NULL is passed for them).  Returns whatever
 * htb_query_class__() returns. */
2783 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2784 struct netdev_queue_stats *stats)
2786 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2787 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one class message 'nlmsg' from a statistics dump; 'netdev' is
 * unused.
 *
 * The handle and statistics are extracted with tc_parse_class().  If the
 * handle is 1:<minor> with <minor> in 1...HTB_N_QUEUES, 'cb' is invoked with
 * queue ID <minor> - 1, the statistics, and 'aux'.
 *
 * NOTE(review): the error check after parsing and the return statement are on
 * lines missing from this listing. */
2791 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2792 const struct ofpbuf *nlmsg,
2793 netdev_dump_queue_stats_cb *cb, void *aux)
2795 struct netdev_queue_stats stats;
2796 unsigned int handle, major, minor;
2799 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2804 major = tc_get_major(handle);
2805 minor = tc_get_minor(handle);
2806 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2807 (*cb)(minor - 1, &stats, aux);
/* Operations table for the HTB qdisc: Linux name "htb", OVS name "linux-htb",
 * supporting HTB_N_QUEUES queues.
 *
 * NOTE(review): the entries between 'n_queues' and htb_class_get_stats are on
 * lines missing from this listing; the initializer is positional, so their
 * order must match struct tc_ops. */
2812 static const struct tc_ops tc_ops_htb = {
2813 "htb", /* linux_name */
2814 "linux-htb", /* ovs_name */
2815 HTB_N_QUEUES, /* n_queues */
2824 htb_class_get_stats,
2825 htb_class_dump_stats
2828 /* "linux-hfsc" traffic control class. */
/* Number of HFSC queues: the 'n_queues' value in tc_ops_hfsc and the largest
 * class minor number that the HFSC parsers in this file accept as a queue
 * (queue ID = minor - 1). */
2830 #define HFSC_N_QUEUES 0xf000
/* Member through which hfsc_class_cast__() recovers the enclosing
 * struct hfsc_class.  NOTE(review): the surrounding struct declarations are
 * on lines missing from this listing. */
2838 struct tc_queue tc_queue;
/* Returns the struct hfsc that embeds the 'tc' currently attached to
 * 'netdev''s device.  Nothing here checks that the attached tc really is an
 * hfsc, so the caller must know that it is. */
2843 static struct hfsc *
2844 hfsc_get__(const struct netdev *netdev)
2846 struct netdev_dev_linux *netdev_dev;
2847 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2848 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
/* Returns the struct hfsc_class that embeds 'queue' as its 'tc_queue' member.
 * The caller must only pass queues that really live inside an hfsc_class;
 * nothing here checks that. */
2851 static struct hfsc_class *
2852 hfsc_class_cast__(const struct tc_queue *queue)
2854 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates a new hfsc state object, initializes its embedded tc with
 * tc_ops_hfsc, stores 'max_rate' in it, and attaches it to 'netdev''s device
 * as the current tc.  Any tc previously attached is simply overwritten by the
 * visible code. */
2858 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2860 struct netdev_dev_linux * netdev_dev;
2863 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2864 hfsc = xmalloc(sizeof *hfsc);
2865 tc_init(&hfsc->tc, &tc_ops_hfsc);
2866 hfsc->max_rate = max_rate;
2867 netdev_dev->tc = &hfsc->tc;
/* Records the rates in 'hc' for queue 'queue_id' in the hfsc state attached
 * to 'netdev'.
 *
 * The queue is looked up with tc_find_queue__().  The code below contains
 * both a path that reuses the existing hfsc_class and a path that allocates a
 * new one and inserts it into 'hfsc->tc.queues'; either way min_rate and
 * max_rate are then copied from 'hc'.
 *
 * NOTE(review): the condition that selects between the two paths is on a line
 * missing from this listing (presumably whether the lookup found a queue). */
2871 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2872 const struct hfsc_class *hc)
2876 struct hfsc_class *hcp;
2877 struct tc_queue *queue;
2879 hfsc = hfsc_get__(netdev);
2880 hash = hash_int(queue_id, 0);
2882 queue = tc_find_queue__(netdev, queue_id, hash);
2884 hcp = hfsc_class_cast__(queue);
2886 hcp = xmalloc(sizeof *hcp);
2887 queue = &hcp->tc_queue;
2888 queue->queue_id = queue_id;
2889 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
2892 hcp->min_rate = hc->min_rate;
2893 hcp->max_rate = hc->max_rate;
/* Decodes the nested HFSC class options 'nl_options' into 'class'.
 *
 * Three service curves (TCA_HFSC_RSC, TCA_HFSC_FSC, TCA_HFSC_USC) are
 * required, each at least sizeof(struct tc_service_curve) bytes.  Only the
 * shape that this file itself generates is accepted:
 *
 *   - every curve must have m1 == 0 and d == 0 (i.e. be linear);
 *   - the RSC and FSC curves must have the same m2;
 *   - the RSC m2 must not exceed the USC m2.
 *
 * On acceptance 'class->min_rate' is the FSC m2 and 'class->max_rate' is the
 * USC m2.  Each rejection logs a rate-limited warning.
 *
 * NOTE(review): the attribute indexes of the policy entries and the values
 * returned on each path are on lines missing from this listing. */
2897 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2899 const struct tc_service_curve *rsc, *fsc, *usc;
2900 static const struct nl_policy tca_hfsc_policy[] = {
2902 .type = NL_A_UNSPEC,
2904 .min_len = sizeof(struct tc_service_curve),
2907 .type = NL_A_UNSPEC,
2909 .min_len = sizeof(struct tc_service_curve),
2912 .type = NL_A_UNSPEC,
2914 .min_len = sizeof(struct tc_service_curve),
2917 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
2919 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
2920 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
2921 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
2925 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
2926 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
2927 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
2929 if (rsc->m1 != 0 || rsc->d != 0 ||
2930 fsc->m1 != 0 || fsc->d != 0 ||
2931 usc->m1 != 0 || usc->d != 0) {
2932 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2933 "Non-linear service curves are not supported.");
2937 if (rsc->m2 != fsc->m2) {
2938 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2939 "Real-time service curves are not supported ");
2943 if (rsc->m2 > usc->m2) {
2944 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2945 "Min-rate service curve is greater than "
2946 "the max-rate service curve.");
2950 class->min_rate = fsc->m2;
2951 class->max_rate = usc->m2;
/* Decodes the tc class message 'tcmsg' with tc_parse_class(), which also fills
 * in 'stats'.  A class handle of 1:<minor> with <minor> in 1...HFSC_N_QUEUES
 * is turned into queue ID <minor> - 1 in '*queue_id', and the class's nested
 * options are decoded into 'options' with hfsc_parse_tca_options__().
 *
 * NOTE(review): the checks that make the queue ID and option decoding
 * conditional, the handling of out-of-range handles, and the return
 * statements are on lines missing from this listing. */
2956 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2957 struct hfsc_class *options,
2958 struct netdev_queue_stats *stats)
2961 unsigned int handle;
2962 struct nlattr *nl_options;
2964 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2970 unsigned int major, minor;
2972 major = tc_get_major(handle);
2973 minor = tc_get_minor(handle);
2974 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
2975 *queue_id = minor - 1;
2982 error = hfsc_parse_tca_options__(nl_options, options);
/* Asks the kernel, via tc_query_class(), for class 'handle' under 'parent' on
 * 'netdev', decodes the reply with hfsc_parse_tcmsg__() into 'options' and
 * 'stats', and then frees the reply.  No queue ID is requested from the parser
 * (NULL is passed for it).
 *
 * NOTE(review): the check on the query's result and the return statement are
 * on lines missing from this listing. */
2989 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
2990 unsigned int parent, struct hfsc_class *options,
2991 struct netdev_queue_stats *stats)
2994 struct ofpbuf *reply;
2996 error = tc_query_class(netdev, handle, parent, &reply);
3001 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3002 ofpbuf_delete(reply);
3007 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
3008 struct hfsc_class *class)
3011 const char *max_rate_s;
3013 max_rate_s = shash_find_data(details, "max-rate");
3014 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3019 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
3020 max_rate = netdev_features_to_bps(current) / 8;
3023 class->min_rate = max_rate;
3024 class->max_rate = max_rate;
/* Fills in 'class' from the "min-rate" and "max-rate" strings in 'details'
 * for a class on 'netdev'.
 *
 * Both rates are parsed as decimal numbers and divided by 8 (presumably
 * converting bits to bytes -- the getters in this file multiply by 8 again).
 * The minimum rate is raised to at least 1 and capped at the qdisc's
 * 'max_rate'; the maximum rate is clamped to the range
 * [min_rate, qdisc max_rate].
 *
 * NOTE(review): the default used when "max-rate" is absent and the return
 * statement are on lines missing from this listing. */
3028 hfsc_parse_class_details__(struct netdev *netdev,
3029 const struct shash *details,
3030 struct hfsc_class * class)
3032 const struct hfsc *hfsc;
3033 uint32_t min_rate, max_rate;
3034 const char *min_rate_s, *max_rate_s;
3036 hfsc = hfsc_get__(netdev);
3037 min_rate_s = shash_find_data(details, "min-rate");
3038 max_rate_s = shash_find_data(details, "max-rate");
3040 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3041 min_rate = MAX(min_rate, 1);
3042 min_rate = MIN(min_rate, hfsc->max_rate);
3044 max_rate = (max_rate_s
3045 ? strtoull(max_rate_s, NULL, 10) / 8
3047 max_rate = MAX(max_rate, min_rate);
3048 max_rate = MIN(max_rate, hfsc->max_rate);
3050 class->min_rate = min_rate;
3051 class->max_rate = max_rate;
/* Replaces whatever root qdisc 'netdev' has with an HFSC qdisc with handle
 * 1:0: the existing qdisc is removed with tc_del_qdisc() first, then an
 * RTM_NEWQDISC request (NLM_F_EXCL | NLM_F_CREATE) is sent with kind "hfsc"
 * and a struct tc_hfsc_qopt as its options.  Returns what tc_transact()
 * returns.
 *
 * NOTE(review): the result of tc_del_qdisc() is not checked on the visible
 * lines; the check on the request construction and any field set in 'opt'
 * after the memset are on lines missing from this listing. */
3056 /* Create an HFSC qdisc.
3058 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3060 hfsc_setup_qdisc__(struct netdev * netdev)
3062 struct tcmsg *tcmsg;
3063 struct ofpbuf request;
3064 struct tc_hfsc_qopt opt;
3066 tc_del_qdisc(netdev);
3068 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3069 NLM_F_EXCL | NLM_F_CREATE, &request);
3075 tcmsg->tcm_handle = tc_make_handle(1, 0);
3076 tcmsg->tcm_parent = TC_H_ROOT;
3078 memset(&opt, 0, sizeof opt);
3081 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3082 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3084 return tc_transact(&request, NULL);
/* Creates or replaces HFSC class 'handle' under 'parent' on 'netdev' with the
 * rates in 'class'.
 *
 * An RTM_NEWTCLASS request (NLM_F_CREATE) is built with kind "hfsc" and three
 * nested service curves: the same 'min' curve is sent as both TCA_HFSC_RSC
 * and TCA_HFSC_FSC, and the 'max' curve as TCA_HFSC_USC.  The m2 field of
 * each curve carries the corresponding rate.  A failure is logged with a
 * rate-limited warning.
 *
 * NOTE(review): the initialization of the curves' other fields, the error
 * checks, and the return statement are on lines missing from this listing. */
3087 /* Create an HFSC class.
3089 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3090 * sc rate <min_rate> ul rate <max_rate>" */
3092 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3093 unsigned int parent, struct hfsc_class *class)
3097 struct tcmsg *tcmsg;
3098 struct ofpbuf request;
3099 struct tc_service_curve min, max;
3101 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3107 tcmsg->tcm_handle = handle;
3108 tcmsg->tcm_parent = parent;
3112 min.m2 = class->min_rate;
3116 max.m2 = class->max_rate;
3118 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3119 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3120 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3121 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3122 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3123 nl_msg_end_nested(&request, opt_offset);
3125 error = tc_transact(&request, NULL);
3127 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3128 "min-rate %ubps, max-rate %ubps (%s)",
3129 netdev_get_name(netdev),
3130 tc_get_major(handle), tc_get_minor(handle),
3131 tc_get_major(parent), tc_get_minor(parent),
3132 class->min_rate, class->max_rate, strerror(error));
/* Installs HFSC on 'netdev': creates the qdisc with hfsc_setup_qdisc__(),
 * derives the qdisc-wide class parameters from 'details', creates class
 * 1:fffe under 1:0 with them, and records the new state with
 * hfsc_install__(), passing the parsed maximum rate.
 *
 * NOTE(review): the error checks between these steps and the return statement
 * are on lines missing from this listing. */
3139 hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3142 struct hfsc_class class;
3144 error = hfsc_setup_qdisc__(netdev);
3150 hfsc_parse_qdisc_details__(netdev, details, &class);
3151 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3152 tc_make_handle(1, 0), &class);
3158 hfsc_install__(netdev, class.max_rate);
/* Builds the in-memory HFSC state for 'netdev' from what is already
 * configured in the kernel.  'nlmsg' is unused.
 *
 * Class 1:fffe is queried for the qdisc-wide maximum rate, which is handed to
 * hfsc_install__().  The classes are then dumped, and every message that
 * hfsc_parse_tcmsg__() accepts (returns 0 for) is recorded with
 * hfsc_update_queue__().
 *
 * NOTE(review): the result of the 1:fffe query is not checked on the visible
 * lines; how 'hc.max_rate' is initialized beforehand, and what is returned,
 * are on lines missing from this listing. */
3163 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3166 struct nl_dump dump;
3167 struct hfsc_class hc;
3170 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3171 hfsc_install__(netdev, hc.max_rate);
3173 if (!start_queue_dump(netdev, &dump)) {
3177 while (nl_dump_next(&dump, &msg)) {
3178 unsigned int queue_id;
3180 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3181 hfsc_update_queue__(netdev, queue_id, &hc);
3185 nl_dump_done(&dump);
/* Tears down the hfsc that embeds 'tc': every hfsc_class is unlinked from
 * 'hfsc->tc.queues' using the deletion-safe iterator.
 *
 * NOTE(review): whatever releases the classes and the hfsc itself is on lines
 * missing from this listing. */
3190 hfsc_tc_destroy(struct tc *tc)
3193 struct hfsc_class *hc, *next;
3195 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3197 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3198 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Reports the qdisc-wide HFSC configuration of 'netdev' by adding a
 * "max-rate" entry to 'details'.  The value is a newly allocated decimal
 * string holding 8 times the stored 'max_rate'. */
3207 hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3209 const struct hfsc *hfsc;
3210 hfsc = hfsc_get__(netdev);
3211 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
/* Reconfigures the qdisc-wide HFSC parameters of 'netdev' from 'details':
 * class 1:fffe under 1:0 is set up again with the parsed values, and the
 * cached 'max_rate' in the hfsc state is updated.
 *
 * NOTE(review): the check that guards the cache update and the return
 * statement are on lines missing from this listing. */
3216 hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3219 struct hfsc_class class;
3221 hfsc_parse_qdisc_details__(netdev, details, &class);
3222 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3223 tc_make_handle(1, 0), &class);
3226 hfsc_get__(netdev)->max_rate = class.max_rate;
/* Reports the configuration of the HFSC class behind 'queue' by adding
 * entries to 'details'; 'netdev' is unused.
 *
 * "min-rate" is always added; "max-rate" is added only when it differs from
 * the minimum rate.  Both are reported as 8 times the stored value, as newly
 * allocated decimal strings. */
3233 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3234 const struct tc_queue *queue, struct shash *details)
3236 const struct hfsc_class *hc;
3238 hc = hfsc_class_cast__(queue);
3239 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3240 if (hc->min_rate != hc->max_rate) {
3241 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
/* Creates or reconfigures HFSC queue 'queue_id' on 'netdev' from 'details'.
 *
 * The details are parsed with hfsc_parse_class_details__(), the kernel class
 * 1:<queue_id + 1> is set up under parent 1:fffe, and the in-memory queue is
 * updated with hfsc_update_queue__().
 *
 * NOTE(review): the error checks between these steps and the return statement
 * are on lines missing from this listing. */
3247 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3248 const struct shash *details)
3251 struct hfsc_class class;
3253 error = hfsc_parse_class_details__(netdev, details, &class);
3258 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3259 tc_make_handle(1, 0xfffe), &class);
3264 hfsc_update_queue__(netdev, queue_id, &class);
/* Deletes the HFSC class behind 'queue' on 'netdev': the kernel class
 * 1:<queue_id + 1> is removed with tc_delete_class() and the queue is unlinked
 * from 'hfsc->tc.queues'.
 *
 * NOTE(review): the check that guards the unlink, any freeing of the class,
 * and the return statement are on lines missing from this listing. */
3269 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3273 struct hfsc_class *hc;
3275 hc = hfsc_class_cast__(queue);
3276 hfsc = hfsc_get__(netdev);
3278 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3280 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches the statistics of the HFSC class behind 'queue' on 'netdev' into
 * 'stats' by querying class 1:<queue_id + 1> under parent 1:fffe.  Class
 * options are not requested (NULL is passed for them).  Returns whatever
 * hfsc_query_class__() returns. */
3287 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3288 struct netdev_queue_stats *stats)
3290 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3291 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one class message 'nlmsg' from a statistics dump; 'netdev' is
 * unused.
 *
 * The handle and statistics are extracted with tc_parse_class().  If the
 * handle is 1:<minor> with <minor> in 1...HFSC_N_QUEUES, 'cb' is invoked with
 * queue ID <minor> - 1, the statistics, and 'aux'.
 *
 * NOTE(review): the error check after parsing and the return statement are on
 * lines missing from this listing. */
3295 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3296 const struct ofpbuf *nlmsg,
3297 netdev_dump_queue_stats_cb *cb, void *aux)
3299 struct netdev_queue_stats stats;
3300 unsigned int handle, major, minor;
3303 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3308 major = tc_get_major(handle);
3309 minor = tc_get_minor(handle);
3310 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3311 (*cb)(minor - 1, &stats, aux);
/* Operations table for the HFSC qdisc: Linux name "hfsc", OVS name
 * "linux-hfsc", supporting HFSC_N_QUEUES queues, with every operation
 * implemented by the hfsc_*() functions in this file.  The initializer is
 * positional, so the entries must stay in struct tc_ops member order. */
3316 static const struct tc_ops tc_ops_hfsc = {
3317 "hfsc", /* linux_name */
3318 "linux-hfsc", /* ovs_name */
3319 HFSC_N_QUEUES, /* n_queues */
3320 hfsc_tc_install, /* tc_install */
3321 hfsc_tc_load, /* tc_load */
3322 hfsc_tc_destroy, /* tc_destroy */
3323 hfsc_qdisc_get, /* qdisc_get */
3324 hfsc_qdisc_set, /* qdisc_set */
3325 hfsc_class_get, /* class_get */
3326 hfsc_class_set, /* class_set */
3327 hfsc_class_delete, /* class_delete */
3328 hfsc_class_get_stats, /* class_get_stats */
3329 hfsc_class_dump_stats /* class_dump_stats */
3332 /* "linux-default" traffic control class.
3334 * This class represents the default, unnamed Linux qdisc. It corresponds to
3335 * the "" (empty string) QoS type in the OVS database. */
/* Attaches the "default qdisc" tc object to 'netdev''s device.
 *
 * The object is held in a function-local static pointer and is initialized
 * with tc_ops_default, so every device that goes through here ends up
 * pointing at the same struct tc.
 *
 * NOTE(review): the condition guarding the allocation is on a line missing
 * from this listing (presumably it allocates only on the first call). */
3338 default_install__(struct netdev *netdev)
3340 struct netdev_dev_linux *netdev_dev =
3341 netdev_dev_linux_cast(netdev_get_dev(netdev));
3342 static struct tc *tc;
3345 tc = xmalloc(sizeof *tc);
3346 tc_init(tc, &tc_ops_default);
3348 netdev_dev->tc = tc;
/* 'tc_install' hook for the default qdisc: ignores 'details' and just attaches
 * the default tc object to 'netdev' with default_install__().
 * NOTE(review): the return statement is on a line missing from this listing. */
3352 default_tc_install(struct netdev *netdev,
3353 const struct shash *details OVS_UNUSED)
3355 default_install__(netdev);
/* 'tc_load' hook for the default qdisc: ignores 'nlmsg' and just attaches the
 * default tc object to 'netdev' with default_install__().
 * NOTE(review): the return statement is on a line missing from this listing. */
3360 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3362 default_install__(netdev);
/* Operations table for the default qdisc.  It has no Linux qdisc name, and
 * every hook from 'tc_destroy' onward is NULL, so callers must check a hook
 * for NULL before invoking it (tc_del_qdisc() does so for 'tc_destroy').
 *
 * NOTE(review): the 'ovs_name', 'n_queues', 'tc_install' and 'tc_load'
 * entries are on lines missing from this listing. */
3366 static const struct tc_ops tc_ops_default = {
3367 NULL, /* linux_name */
3372 NULL, /* tc_destroy */
3373 NULL, /* qdisc_get */
3374 NULL, /* qdisc_set */
3375 NULL, /* class_get */
3376 NULL, /* class_set */
3377 NULL, /* class_delete */
3378 NULL, /* class_get_stats */
3379 NULL /* class_dump_stats */
/* other_tc_load(): 'tc_load' hook for qdiscs that this file does not model.
 * It ignores 'nlmsg' and attaches a tc object initialized with tc_ops_other
 * to 'netdev''s device.  The object is held in a function-local static
 * pointer, so every such device ends up pointing at the same struct tc.
 *
 * NOTE(review): the condition guarding the allocation and the return
 * statement are on lines missing from this listing. */
3382 /* "linux-other" traffic control class.
3387 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3389 struct netdev_dev_linux *netdev_dev =
3390 netdev_dev_linux_cast(netdev_get_dev(netdev));
3391 static struct tc *tc;
3394 tc = xmalloc(sizeof *tc);
3395 tc_init(tc, &tc_ops_other);
3397 netdev_dev->tc = tc;
3401 static const struct tc_ops tc_ops_other = {
3402 NULL, /* linux_name */
/* Operations table for qdiscs that this file does not model: OVS name
 * "linux-other", no Linux qdisc name, no 'tc_install' hook, and NULL for
 * every hook from 'tc_destroy' onward.
 * NOTE(review): the 'n_queues' and 'tc_load' entries are on lines missing
 * from this listing. */
3403 "linux-other", /* ovs_name */
3405 NULL, /* tc_install */
3407 NULL, /* tc_destroy */
3408 NULL, /* qdisc_get */
3409 NULL, /* qdisc_set */
3410 NULL, /* class_get */
3411 NULL, /* class_set */
3412 NULL, /* class_delete */
3413 NULL, /* class_get_stats */
3414 NULL /* class_dump_stats */
3417 /* Traffic control. */
/* Scheduler timing parameters shared by the tc_*() conversion helpers in this
 * file.  'ticks_per_s' is computed as a * c / b from the four hexadecimal
 * fields of /proc/net/psched; it is the divisor and multiplier used when
 * converting between tc ticks and bytes. */
3419 /* Number of kernel "tc" ticks per second. */
3420 static double ticks_per_s;
3422 /* Number of kernel "jiffies" per second. This is used for the purpose of
3423 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3424 * one jiffy's worth of data.
3426 * There are two possibilities here:
3428 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3429 * approximate range of 100 to 1024. That means that we really need to
3430 * make sure that the qdisc can buffer that much data.
3432 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3433 * has finely granular timers and there's no need to fudge additional room
3434 * for buffers. (There's no extra effort needed to implement that: the
3435 * large 'buffer_hz' is used as a divisor, so practically any number will
3436 * come out as 0 in the division. Small integer results in the case of
3437 * really high dividends won't have any real effect anyhow.)
3439 static unsigned int buffer_hz;
3441 /* Returns tc handle 'major':'minor'. */
/* 'major' is shifted into the upper 16 bits before being combined with
 * 'minor' by TC_H_MAKE, e.g. (1, 0xfffe) is the handle written "1:fffe". */
3443 tc_make_handle(unsigned int major, unsigned int minor)
3445 return TC_H_MAKE(major << 16, minor);
/* Inverse of the 'major' half of tc_make_handle(): the major part selected by
 * TC_H_MAJ is shifted back down by 16 bits. */
3448 /* Returns the major number from 'handle'. */
3450 tc_get_major(unsigned int handle)
3452 return TC_H_MAJ(handle) >> 16;
/* Inverse of the 'minor' half of tc_make_handle(): returns the part of
 * 'handle' selected by TC_H_MIN, with no shift applied. */
3455 /* Returns the minor number from 'handle'. */
3457 tc_get_minor(unsigned int handle)
3459 return TC_H_MIN(handle);
/* Starts a traffic-control Netlink request for 'netdev' in 'request'.
 *
 * 'request' is initialized with a 512-byte initial size, then given a Netlink
 * header of message type 'type' with NLM_F_REQUEST | 'flags', followed by a
 * zeroed struct tcmsg whose family is AF_UNSPEC and whose ifindex is that of
 * 'netdev'.  Returns a pointer to that tcmsg, which lives inside 'request';
 * the caller still has to fill in its handle and parent.
 *
 * NOTE(review): what is returned when get_ifindex() fails is on lines missing
 * from this listing; callers in this file test the result, so presumably
 * NULL. */
3462 static struct tcmsg *
3463 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3464 struct ofpbuf *request)
3466 struct tcmsg *tcmsg;
3470 error = get_ifindex(netdev, &ifindex);
3475 ofpbuf_init(request, 512);
3476 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3477 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3478 tcmsg->tcm_family = AF_UNSPEC;
3479 tcmsg->tcm_ifindex = ifindex;
3480 /* Caller should fill in tcmsg->tcm_handle. */
3481 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on the rtnetlink socket with nl_sock_transact(), passing
 * 'replyp' through for the reply, and then releases 'request' with
 * ofpbuf_uninit() whether or not the transaction succeeded.  The caller must
 * therefore not use 'request' afterward.
 * NOTE(review): the return statement is on a line missing from this listing
 * (presumably it returns 'error'). */
3487 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3489 int error = nl_sock_transact(rtnl_sock, request, replyp);
3490 ofpbuf_uninit(request);
/* Body of the routine that reads the kernel's packet-scheduler timing
 * parameters (presumably read_psched(); its header is on lines missing from
 * this listing).
 *
 * It opens /proc/net/psched, reads four hexadecimal words a, b, c and d, logs
 * them, and sets 'ticks_per_s' to a * c / b.  Failures to open or read the
 * file, and parameters it considers invalid or unexpected, are each logged as
 * warnings.
 *
 * NOTE(review): the validity test on the parameters, the assignment of
 * 'buffer_hz', the fallback values, and the closing of 'stream' are on lines
 * missing from this listing.  In particular, confirm that 'b' is checked for
 * zero before the division below. */
3497 /* The values in psched are not individually very meaningful, but they are
3498 * important. The tables below show some values seen in the wild.
3502 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3503 * (Before that, there are hints that it was 1000000000.)
3505 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3509 * -----------------------------------
3510 * [1] 000c8000 000f4240 000f4240 00000064
3511 * [2] 000003e8 00000400 000f4240 3b9aca00
3512 * [3] 000003e8 00000400 000f4240 3b9aca00
3513 * [4] 000003e8 00000400 000f4240 00000064
3514 * [5] 000003e8 00000040 000f4240 3b9aca00
3515 * [6] 000003e8 00000040 000f4240 000000f9
3517 * a b c d ticks_per_s buffer_hz
3518 * ------- --------- ---------- ------------- ----------- -------------
3519 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3520 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3521 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3522 * [4] 1,000 1,024 1,000,000 100 976,562 100
3523 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3524 * [6] 1,000 64 1,000,000 249 15,625,000 249
3526 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3527 * [2] 2.6.26-1-686-bigmem from Debian lenny
3528 * [3] 2.6.26-2-sparc64 from Debian lenny
3529 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3530 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3531 * [6] 2.6.34 from kernel.org on KVM
3533 static const char fn[] = "/proc/net/psched";
3534 unsigned int a, b, c, d;
3540 stream = fopen(fn, "r");
3542 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3546 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3547 VLOG_WARN("%s: read failed", fn);
3551 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3555 VLOG_WARN("%s: invalid scheduler parameters", fn);
3559 ticks_per_s = (double) a * c / b;
3563 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3566 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3569 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3570 * rate of 'rate' bytes per second. */
3572 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3577 return (rate * ticks) / ticks_per_s;
/* Notes on the computation below: a 'rate' of 0 yields 0 rather than dividing
 * by zero; 'ticks_per_s' is truncated to an integer before the multiplication,
 * and the product is formed in unsigned long long so that it cannot wrap. */
3580 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3581 * rate of 'rate' bytes per second. */
3583 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3588 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
/* This is an integer division of 'rate' by 'buffer_hz', so a very large
 * 'buffer_hz' makes the result 0.
 * NOTE(review): a 'buffer_hz' of 0 would divide by zero here; whatever
 * guarantees it is nonzero is on lines missing from this listing. */
3591 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3592 * a transmission rate of 'rate' bytes per second. */
3594 tc_buffer_per_jiffy(unsigned int rate)
3599 return rate / buffer_hz;
/* Parsing details for the function below: attributes are parsed starting
 * after the Netlink header and the struct tcmsg.  TCA_KIND is a required
 * string and TCA_OPTIONS an optional nested attribute.  A parse failure is
 * logged with a rate-limited warning.
 * NOTE(review): the null checks on the output arguments and the values stored
 * and returned on failure are on lines missing from this listing. */
3602 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3603 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3604 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3605 * stores NULL into it if it is absent.
3607 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3610 * Returns 0 if successful, otherwise a positive errno value. */
3612 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3613 struct nlattr **options)
3615 static const struct nl_policy tca_policy[] = {
3616 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3617 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3619 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3621 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3622 tca_policy, ta, ARRAY_SIZE(ta))) {
3623 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3628 *kind = nl_attr_get_string(ta[TCA_KIND]);
3632 *options = ta[TCA_OPTIONS];
/* Correction to the description below: the output parameter is named
 * 'handlep', not 'queue_id', and what is stored in it is the class's whole
 * tc handle (tcm_handle), not just the minor number.  Callers derive the
 * queue ID from the handle themselves.
 *
 * Both TCA_OPTIONS and TCA_STATS2 are required nested attributes.  From the
 * statistics, 'tx_bytes' and 'tx_packets' come from TCA_STATS_BASIC and
 * 'tx_errors' from the 'drops' counter of TCA_STATS_QUEUE.
 *
 * NOTE(review): the null checks on the output arguments and the error paths
 * (including when '*stats' is zeroed) are on lines missing from this
 * listing. */
3647 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3648 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3649 * into '*options', and its queue statistics into '*stats'. Any of the output
3650 * arguments may be null.
3652 * Returns 0 if successful, otherwise a positive errno value. */
3654 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3655 struct nlattr **options, struct netdev_queue_stats *stats)
3657 static const struct nl_policy tca_policy[] = {
3658 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3659 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3661 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3663 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3664 tca_policy, ta, ARRAY_SIZE(ta))) {
3665 VLOG_WARN_RL(&rl, "failed to parse class message");
3670 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3671 *handlep = tc->tcm_handle;
3675 *options = ta[TCA_OPTIONS];
3679 const struct gnet_stats_queue *gsq;
3680 struct gnet_stats_basic gsb;
3682 static const struct nl_policy stats_policy[] = {
3683 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3684 .min_len = sizeof gsb },
3685 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3686 .min_len = sizeof *gsq },
3688 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3690 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3691 sa, ARRAY_SIZE(sa))) {
3692 VLOG_WARN_RL(&rl, "failed to parse class stats");
3696 /* Alignment issues screw up the length of struct gnet_stats_basic on
3697 * some arch/bitsize combinations. Newer versions of Linux have a
3698 * struct gnet_stats_basic_packed, but we can't depend on that. The
3699 * easiest thing to do is just to make a copy. */
3700 memset(&gsb, 0, sizeof gsb);
3701 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3702 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3703 stats->tx_bytes = gsb.bytes;
3704 stats->tx_packets = gsb.packets;
3706 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3707 stats->tx_errors = gsq->drops;
3717 memset(stats, 0, sizeof *stats);
/* tc_query_class(): sends an RTM_GETTCLASS request (with NLM_F_ECHO) for the
 * class 'handle' under 'parent' on 'netdev' and hands the reply back through
 * 'replyp'.  A failed transaction is logged with a rate-limited warning that
 * names the device, the class and its parent.
 * NOTE(review): the check on the request construction and the return
 * statement are on lines missing from this listing. */
3722 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3725 tc_query_class(const struct netdev *netdev,
3726 unsigned int handle, unsigned int parent,
3727 struct ofpbuf **replyp)
3729 struct ofpbuf request;
3730 struct tcmsg *tcmsg;
3733 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3737 tcmsg->tcm_handle = handle;
3738 tcmsg->tcm_parent = parent;
3740 error = tc_transact(&request, replyp);
3742 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3743 netdev_get_name(netdev),
3744 tc_get_major(handle), tc_get_minor(handle),
3745 tc_get_major(parent), tc_get_minor(parent),
3751 /* Equivalent to "tc class del dev <name> handle <handle>". */
/* Sends an RTM_DELTCLASS request for class 'handle' on 'netdev' with the
 * parent left as 0 and no reply requested.  A failed transaction is logged
 * with a rate-limited warning that names the device and the class.
 * NOTE(review): the check on the request construction and the return
 * statement are on lines missing from this listing. */
3753 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3755 struct ofpbuf request;
3756 struct tcmsg *tcmsg;
3759 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3763 tcmsg->tcm_handle = handle;
3764 tcmsg->tcm_parent = 0;
3766 error = tc_transact(&request, NULL);
3768 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3769 netdev_get_name(netdev),
3770 tc_get_major(handle), tc_get_minor(handle),
/* Besides asking the kernel to delete the root qdisc with handle 1:0
 * (RTM_DELQDISC), the function below also discards the cached tc state: when
 * the deletion is considered successful and a tc object is attached to the
 * device, its 'tc_destroy' hook is called if it has one, and the device's
 * 'tc' pointer is cleared.
 * NOTE(review): how an EINVAL result is rewritten and what is returned are on
 * lines missing from this listing. */
3776 /* Equivalent to "tc qdisc del dev <name> root". */
3778 tc_del_qdisc(struct netdev *netdev)
3780 struct netdev_dev_linux *netdev_dev =
3781 netdev_dev_linux_cast(netdev_get_dev(netdev));
3782 struct ofpbuf request;
3783 struct tcmsg *tcmsg;
3786 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
3790 tcmsg->tcm_handle = tc_make_handle(1, 0);
3791 tcmsg->tcm_parent = TC_H_ROOT;
3793 error = tc_transact(&request, NULL);
3794 if (error == EINVAL) {
3795 /* EINVAL probably means that the default qdisc was in use, in which
3796 * case we've accomplished our purpose. */
3799 if (!error && netdev_dev->tc) {
3800 if (netdev_dev->tc->ops->tc_destroy) {
3801 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3803 netdev_dev->tc = NULL;
/* Outline of the function below:
 *
 *   1. If a tc object is already attached to the device there is nothing to
 *      query (the body of that early exit is on a line missing from this
 *      listing).
 *   2. An RTM_GETQDISC request for handle 1:0 is sent.
 *   3. The tc_ops to use is chosen: on a successful reply the qdisc kind is
 *      parsed and looked up by its Linux name, falling back to tc_ops_other
 *      for an unparsable or unknown kind; ENOENT selects tc_ops_default; any
 *      other error is logged and selects tc_ops_other.
 *   4. The chosen ops' 'tc_load' hook is run.  The assertion checks that the
 *      hook attached a tc object exactly when it reported success.
 *
 * The transaction error takes precedence over the load error in the value
 * returned. */
3808 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3809 * kernel to determine what they are. Returns 0 if successful, otherwise a
3810 * positive errno value. */
3812 tc_query_qdisc(const struct netdev *netdev)
3814 struct netdev_dev_linux *netdev_dev =
3815 netdev_dev_linux_cast(netdev_get_dev(netdev));
3816 struct ofpbuf request, *qdisc;
3817 const struct tc_ops *ops;
3818 struct tcmsg *tcmsg;
3822 if (netdev_dev->tc) {
3826 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3827 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3828 * 2.6.35 without that fix backported to it.
3830 * To avoid the OOPS, we must not make a request that would attempt to dump
3831 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3832 * few others. There are a few ways that I can see to do this, but most of
3833 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3834 * technique chosen here is to assume that any non-default qdisc that we
3835 * create will have a class with handle 1:0. The built-in qdiscs only have
3836 * a class with handle 0:0.
3838 * We could check for Linux 2.6.35+ and use a more straightforward method
3840 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
3844 tcmsg->tcm_handle = tc_make_handle(1, 0);
3845 tcmsg->tcm_parent = 0;
3847 /* Figure out what tc class to instantiate. */
3848 error = tc_transact(&request, &qdisc);
3852 error = tc_parse_qdisc(qdisc, &kind, NULL);
3854 ops = &tc_ops_other;
3856 ops = tc_lookup_linux_name(kind);
3858 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3859 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3861 ops = &tc_ops_other;
3864 } else if (error == ENOENT) {
3865 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3866 * other entity that doesn't have a handle 1:0. We will assume
3867 * that it's the system default qdisc. */
3868 ops = &tc_ops_default;
3871 /* Who knows? Maybe the device got deleted. */
3872 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3873 netdev_get_name(netdev), strerror(error));
3874 ops = &tc_ops_other;
3877 /* Instantiate it. */
3878 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
3879 assert((load_error == 0) == (netdev_dev->tc != NULL));
3880 ofpbuf_delete(qdisc);
3882 return error ? error : load_error;
/* Implementation notes for the function below: the Ethernet and VLAN header
 * lengths are added to 'mtu' before the shift count is searched for, and the
 * search counts iterations until the value drops below 256.
 * NOTE(review): the condition under which 'mtu' is replaced by
 * ETH_PAYLOAD_MAX, the loop body and the return statement are on lines
 * missing from this listing. */
3885 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3886 approximate the time to transmit packets of various lengths. For an MTU of
3887 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3888 represents two possible packet lengths; for a MTU of 513 through 1024, four
3889 possible lengths; and so on.
3891 Returns, for the specified 'mtu', the number of bits that packet lengths
3892 need to be shifted right to fit within such a 256-entry table. */
3894 tc_calc_cell_log(unsigned int mtu)
3899 mtu = ETH_PAYLOAD_MAX;
3901 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
3903 for (cell_log = 0; mtu >= 256; cell_log++) {
/* tc_fill_rate(): zeroes '*rate', then sets its 'cell_log' from
 * tc_calc_cell_log('mtu') and its 'mpu' (minimum packet unit) to
 * ETH_TOTAL_MIN.  'overhead' and 'cell_align' are deliberately left to the
 * memset because some distribution headers lack those members.
 * NOTE(review): the assignment of the rate itself from 'Bps' is on a line
 * missing from this listing. */
3910 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
3913 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
3915 memset(rate, 0, sizeof *rate);
3916 rate->cell_log = tc_calc_cell_log(mtu);
3917 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3918 /* rate->cell_align = 0; */ /* distro headers. */
3919 rate->mpu = ETH_TOTAL_MIN;
/* How the table below is filled: entry i stands for a packet of
 * (i + 1) << cell_log bytes, raised to 'rate->mpu' if smaller, and holds the
 * number of ticks needed to send that many bytes at 'rate->rate' as computed
 * by tc_bytes_to_ticks().  The table occupies TC_RTAB_SIZE bytes in 'msg'. */
3923 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
3924 * attribute of the specified "type".
3926 * See tc_calc_cell_log() above for a description of "rtab"s. */
3928 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
3933 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
3934 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
3935 unsigned packet_size = (i + 1) << rate->cell_log;
3936 if (packet_size < rate->mpu) {
3937 packet_size = rate->mpu;
3939 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* Returns the value to use for 'buffer' or 'cbuffer' in HTB options, given a
 * rate of 'Bps' bytes per second, an MTU of 'mtu', and a burst size of
 * 'burst_bytes' requested by the user.
 *
 * The burst actually used is never smaller than tc_buffer_per_jiffy(Bps) plus
 * 'mtu', so a caller with no particular burst size in mind may simply pass 0
 * for 'burst_bytes'. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int floor_bytes = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t effective_burst = MAX(burst_bytes, floor_bytes);

    return tc_bytes_to_ticks(Bps, effective_burst);
}
3954 /* Linux-only functions declared in netdev-linux.h */
3956 /* Returns a fd for an AF_INET socket or a negative errno value. */
3958 netdev_linux_get_af_inet_sock(void)
3960 int error = netdev_linux_init();
3961 return error ? -error : af_inet_sock;
3964 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
3965 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
3967 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
3968 const char *flag_name, bool enable)
3970 const char *netdev_name = netdev_get_name(netdev);
3971 struct ethtool_value evalue;
3975 memset(&evalue, 0, sizeof evalue);
3976 error = netdev_linux_do_ethtool(netdev_name,
3977 (struct ethtool_cmd *)&evalue,
3978 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
3983 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
3984 error = netdev_linux_do_ethtool(netdev_name,
3985 (struct ethtool_cmd *)&evalue,
3986 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
3991 memset(&evalue, 0, sizeof evalue);
3992 error = netdev_linux_do_ethtool(netdev_name,
3993 (struct ethtool_cmd *)&evalue,
3994 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
3999 if (new_flags != evalue.data) {
4000 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4001 "device %s failed", enable ? "enable" : "disable",
4002 flag_name, netdev_name);
4009 /* Utility functions. */
4011 /* Copies 'src' into 'dst', performing format conversion in the process. */
4013 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4014 const struct rtnl_link_stats *src)
4016 dst->rx_packets = src->rx_packets;
4017 dst->tx_packets = src->tx_packets;
4018 dst->rx_bytes = src->rx_bytes;
4019 dst->tx_bytes = src->tx_bytes;
4020 dst->rx_errors = src->rx_errors;
4021 dst->tx_errors = src->tx_errors;
4022 dst->rx_dropped = src->rx_dropped;
4023 dst->tx_dropped = src->tx_dropped;
4024 dst->multicast = src->multicast;
4025 dst->collisions = src->collisions;
4026 dst->rx_length_errors = src->rx_length_errors;
4027 dst->rx_over_errors = src->rx_over_errors;
4028 dst->rx_crc_errors = src->rx_crc_errors;
4029 dst->rx_frame_errors = src->rx_frame_errors;
4030 dst->rx_fifo_errors = src->rx_fifo_errors;
4031 dst->rx_missed_errors = src->rx_missed_errors;
4032 dst->tx_aborted_errors = src->tx_aborted_errors;
4033 dst->tx_carrier_errors = src->tx_carrier_errors;
4034 dst->tx_fifo_errors = src->tx_fifo_errors;
4035 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4036 dst->tx_window_errors = src->tx_window_errors;
4040 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4042 /* Policy for RTNLGRP_LINK messages.
4044 * There are *many* more fields in these messages, but currently we only
4045 * care about these fields. */
4046 static const struct nl_policy rtnlgrp_link_policy[] = {
4047 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4048 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4049 .min_len = sizeof(struct rtnl_link_stats) },
4052 struct ofpbuf request;
4053 struct ofpbuf *reply;
4054 struct ifinfomsg *ifi;
4055 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
4058 ofpbuf_init(&request, 0);
4059 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4060 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4061 ifi->ifi_family = PF_UNSPEC;
4062 ifi->ifi_index = ifindex;
4063 error = nl_sock_transact(rtnl_sock, &request, &reply);
4064 ofpbuf_uninit(&request);
4069 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4070 rtnlgrp_link_policy,
4071 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4072 ofpbuf_delete(reply);
4076 if (!attrs[IFLA_STATS]) {
4077 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4078 ofpbuf_delete(reply);
4082 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4084 ofpbuf_delete(reply);
4090 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4092 static const char fn[] = "/proc/net/dev";
4097 stream = fopen(fn, "r");
4099 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4104 while (fgets(line, sizeof line, stream)) {
4107 #define X64 "%"SCNu64
4110 X64 X64 X64 X64 X64 X64 X64 "%*u"
4111 X64 X64 X64 X64 X64 X64 X64 "%*u",
4117 &stats->rx_fifo_errors,
4118 &stats->rx_frame_errors,
4124 &stats->tx_fifo_errors,
4126 &stats->tx_carrier_errors) != 15) {
4127 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4128 } else if (!strcmp(devname, netdev_name)) {
4129 stats->rx_length_errors = UINT64_MAX;
4130 stats->rx_over_errors = UINT64_MAX;
4131 stats->rx_crc_errors = UINT64_MAX;
4132 stats->rx_missed_errors = UINT64_MAX;
4133 stats->tx_aborted_errors = UINT64_MAX;
4134 stats->tx_heartbeat_errors = UINT64_MAX;
4135 stats->tx_window_errors = UINT64_MAX;
4141 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
4147 get_carrier_via_sysfs(const char *name, bool *carrier)
4158 fn = xasprintf("/sys/class/net/%s/carrier", name);
4159 fd = open(fn, O_RDONLY);
4162 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
4166 retval = read(fd, line, sizeof line);
4169 if (error == EINVAL) {
4170 /* This is the normal return value when we try to check carrier if
4171 * the network device is not up. */
4173 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
4176 } else if (retval == 0) {
4178 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
4182 if (line[0] != '0' && line[0] != '1') {
4184 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)", fn, line[0]);
4187 *carrier = line[0] != '0';
4199 get_flags(const struct netdev *netdev, int *flags)
4204 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
4206 *flags = ifr.ifr_flags;
4211 set_flags(struct netdev *netdev, int flags)
4215 ifr.ifr_flags = flags;
4216 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
4221 do_get_ifindex(const char *netdev_name)
4225 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4226 COVERAGE_INC(netdev_get_ifindex);
4227 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4228 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4229 netdev_name, strerror(errno));
4232 return ifr.ifr_ifindex;
4236 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4238 struct netdev_dev_linux *netdev_dev =
4239 netdev_dev_linux_cast(netdev_get_dev(netdev_));
4241 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4242 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4246 netdev_dev->cache_valid |= VALID_IFINDEX;
4247 netdev_dev->ifindex = ifindex;
4249 *ifindexp = netdev_dev->ifindex;
4254 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4259 memset(&ifr, 0, sizeof ifr);
4260 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4261 COVERAGE_INC(netdev_get_hwaddr);
4262 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4263 /* ENODEV probably means that a vif disappeared asynchronously and
4264 * hasn't been removed from the database yet, so reduce the log level
4265 * to INFO for that case. */
4266 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4267 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4268 netdev_name, strerror(errno));
4271 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4272 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4273 VLOG_WARN("%s device has unknown hardware address family %d",
4274 netdev_name, hwaddr_family);
4276 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4281 set_etheraddr(const char *netdev_name, int hwaddr_family,
4282 const uint8_t mac[ETH_ADDR_LEN])
4286 memset(&ifr, 0, sizeof ifr);
4287 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4288 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4289 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4290 COVERAGE_INC(netdev_set_hwaddr);
4291 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4292 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4293 netdev_name, strerror(errno));
4300 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4301 int cmd, const char *cmd_name)
4305 memset(&ifr, 0, sizeof ifr);
4306 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4307 ifr.ifr_data = (caddr_t) ecmd;
4310 COVERAGE_INC(netdev_ethtool);
4311 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4314 if (errno != EOPNOTSUPP) {
4315 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4316 "failed: %s", cmd_name, name, strerror(errno));
4318 /* The device doesn't support this operation. That's pretty
4319 * common, so there's no point in logging anything. */
4326 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4327 const char *cmd_name)
4329 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4330 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4331 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4339 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4340 int cmd, const char *cmd_name)
4345 ifr.ifr_addr.sa_family = AF_INET;
4346 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4348 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4349 *ip = sin->sin_addr;
4354 /* Returns an AF_PACKET raw socket or a negative errno value. */
4356 af_packet_sock(void)
4358 static int sock = INT_MIN;
4360 if (sock == INT_MIN) {
4361 sock = socket(AF_PACKET, SOCK_RAW, 0);
4363 set_nonblocking(sock);
4366 VLOG_ERR("failed to create packet socket: %s", strerror(errno));