2 * Copyright (c) 2009, 2010, 2011 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
24 #include <arpa/inet.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_sched.h>
33 #include <linux/rtnetlink.h>
34 #include <linux/sockios.h>
35 #include <linux/version.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <netpacket/packet.h>
40 #include <net/ethernet.h>
42 #include <linux/if_tunnel.h>
43 #include <net/if_arp.h>
44 #include <net/if_packet.h>
45 #include <net/route.h>
46 #include <netinet/in.h>
53 #include "dpif-linux.h"
54 #include "dynamic-string.h"
55 #include "fatal-signal.h"
58 #include "netdev-provider.h"
59 #include "netdev-vport.h"
61 #include "netlink-notifier.h"
62 #include "netlink-socket.h"
64 #include "openflow/openflow.h"
66 #include "poll-loop.h"
67 #include "rtnetlink-link.h"
68 #include "socket-util.h"
74 VLOG_DEFINE_THIS_MODULE(netdev_linux);
76 COVERAGE_DEFINE(netdev_get_vlan_vid);
77 COVERAGE_DEFINE(netdev_set_policing);
78 COVERAGE_DEFINE(netdev_arp_lookup);
79 COVERAGE_DEFINE(netdev_get_ifindex);
80 COVERAGE_DEFINE(netdev_get_hwaddr);
81 COVERAGE_DEFINE(netdev_set_hwaddr);
82 COVERAGE_DEFINE(netdev_ethtool);
84 /* These were introduced in Linux 2.6.14, so they might be missing if we have
86 #ifndef ADVERTISED_Pause
87 #define ADVERTISED_Pause (1 << 13)
89 #ifndef ADVERTISED_Asym_Pause
90 #define ADVERTISED_Asym_Pause (1 << 14)
93 /* These were introduced in Linux 2.6.24, so they might be missing if we
94 * have old headers. */
95 #ifndef ETHTOOL_GFLAGS
96 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
98 #ifndef ETHTOOL_SFLAGS
99 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
102 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
105 #define TC_RTAB_SIZE 1024
108 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
109 static int cache_notifier_refcount;
112 VALID_IFINDEX = 1 << 0,
113 VALID_ETHERADDR = 1 << 1,
117 VALID_CARRIER = 1 << 5,
118 VALID_POLICING = 1 << 6,
119 VALID_HAVE_VPORT_STATS = 1 << 7
127 /* Traffic control. */
129 /* An instance of a traffic control class. Always associated with a particular
132 * Each TC implementation subclasses this with whatever additional data it
135 const struct tc_ops *ops;
136 struct hmap queues; /* Contains "struct tc_queue"s.
137 * Read by generic TC layer.
138 * Written only by TC implementation. */
141 /* One traffic control queue.
143 * Each TC implementation subclasses this with whatever additional data it
146 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
147 unsigned int queue_id; /* OpenFlow queue ID. */
150 /* A particular kind of traffic control. Each implementation generally maps to
151 * one particular Linux qdisc class.
153 * The functions below return 0 if successful or a positive errno value on
154 * failure, except where otherwise noted. All of them must be provided, except
155 * where otherwise noted. */
157 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
158 * This is null for tc_ops_default and tc_ops_other, for which there are no
159 * appropriate values. */
160 const char *linux_name;
162 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
163 const char *ovs_name;
165 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
166 * queues. The queues are numbered 0 through n_queues - 1. */
167 unsigned int n_queues;
169 /* Called to install this TC class on 'netdev'. The implementation should
170 * make the Netlink calls required to set up 'netdev' with the right qdisc
171 * and configure it according to 'details'. The implementation may assume
172 * that the current qdisc is the default; that is, there is no need for it
173 * to delete the current qdisc before installing itself.
175 * The contents of 'details' should be documented as valid for 'ovs_name'
176 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
177 * (which is built as ovs-vswitchd.conf.db(8)).
179 * This function must return 0 if and only if it sets 'netdev->tc' to an
180 * initialized 'struct tc'.
182 * (This function is null for tc_ops_other, which cannot be installed. For
183 * other TC classes it should always be nonnull.) */
184 int (*tc_install)(struct netdev *netdev, const struct shash *details);
186 /* Called when the netdev code determines (through a Netlink query) that
187 * this TC class's qdisc is installed on 'netdev', but we didn't install
188 * it ourselves and so don't know any of the details.
190 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
191 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
192 * implementation should parse the other attributes of 'nlmsg' as
193 * necessary to determine its configuration. If necessary it should also
194 * use Netlink queries to determine the configuration of queues on
197 * This function must return 0 if and only if it sets 'netdev->tc' to an
198 * initialized 'struct tc'. */
199 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
201 /* Destroys the data structures allocated by the implementation as part of
202 * 'tc'. (This includes destroying 'tc->queues' by calling
205 * The implementation should not need to perform any Netlink calls. If
206 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
207 * (But it may not be desirable.)
209 * This function may be null if 'tc' is trivial. */
210 void (*tc_destroy)(struct tc *tc);
212 /* Retrieves details of 'netdev->tc' configuration into 'details'.
214 * The implementation should not need to perform any Netlink calls, because
215 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
216 * cached the configuration.
218 * The contents of 'details' should be documented as valid for 'ovs_name'
219 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
220 * (which is built as ovs-vswitchd.conf.db(8)).
222 * This function may be null if 'tc' is not configurable.
224 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
226 /* Reconfigures 'netdev->tc' according to 'details', performing any
227 * required Netlink calls to complete the reconfiguration.
229 * The contents of 'details' should be documented as valid for 'ovs_name'
230 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
231 * (which is built as ovs-vswitchd.conf.db(8)).
233 * This function may be null if 'tc' is not configurable.
235 int (*qdisc_set)(struct netdev *, const struct shash *details);
237 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
238 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
240 * The contents of 'details' should be documented as valid for 'ovs_name'
241 * in the "other_config" column in the "Queue" table in
242 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
244 * The implementation should not need to perform any Netlink calls, because
245 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
246 * cached the queue configuration.
248 * This function may be null if 'tc' does not have queues ('n_queues' is
250 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
251 struct shash *details);
253 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
254 * 'details', performing any required Netlink calls to complete the
255 * reconfiguration. The caller ensures that 'queue_id' is less than
258 * The contents of 'details' should be documented as valid for 'ovs_name'
259 * in the "other_config" column in the "Queue" table in
260 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
262 * This function may be null if 'tc' does not have queues or its queues are
263 * not configurable. */
264 int (*class_set)(struct netdev *, unsigned int queue_id,
265 const struct shash *details);
267 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
268 * tc_queue's within 'netdev->tc->queues'.
270 * This function may be null if 'tc' does not have queues or its queues
271 * cannot be deleted. */
272 int (*class_delete)(struct netdev *, struct tc_queue *queue);
274 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
275 * 'struct tc_queue's within 'netdev->tc->queues'.
277 * On success, initializes '*stats'.
279 * This function may be null if 'tc' does not have queues or if it cannot
280 * report queue statistics. */
281 int (*class_get_stats)(const struct netdev *netdev,
282 const struct tc_queue *queue,
283 struct netdev_queue_stats *stats);
285 /* Extracts queue stats from 'nlmsg', which is a response to a
286 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
288 * This function may be null if 'tc' does not have queues or if it cannot
289 * report queue statistics. */
290 int (*class_dump_stats)(const struct netdev *netdev,
291 const struct ofpbuf *nlmsg,
292 netdev_dump_queue_stats_cb *cb, void *aux);
/* Prepares the generic part of 'tc' by initializing its 'queues' hmap.
 * NOTE(review): 'ops' is not referenced on the lines visible in this
 * listing; confirm how it is stored against the full file. */
296 tc_init(struct tc *tc, const struct tc_ops *ops)
299 hmap_init(&tc->queues);
/* Releases the generic part of 'tc' by destroying its 'queues' hmap. */
303 tc_destroy(struct tc *tc)
305 hmap_destroy(&tc->queues);
308 static const struct tc_ops tc_ops_htb;
309 static const struct tc_ops tc_ops_hfsc;
310 static const struct tc_ops tc_ops_default;
311 static const struct tc_ops tc_ops_other;
313 static const struct tc_ops *tcs[] = {
314 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
315 &tc_ops_hfsc, /* Hierarchical fair service curve. */
316 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
317 &tc_ops_other, /* Some other qdisc. */
321 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
322 static unsigned int tc_get_major(unsigned int handle);
323 static unsigned int tc_get_minor(unsigned int handle);
325 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
326 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
327 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
329 static struct tcmsg *tc_make_request(const struct netdev *, int type,
330 unsigned int flags, struct ofpbuf *);
331 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
333 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
334 struct nlattr **options);
335 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
336 struct nlattr **options,
337 struct netdev_queue_stats *);
338 static int tc_query_class(const struct netdev *,
339 unsigned int handle, unsigned int parent,
340 struct ofpbuf **replyp);
341 static int tc_delete_class(const struct netdev *, unsigned int handle);
343 static int tc_del_qdisc(struct netdev *netdev);
344 static int tc_query_qdisc(const struct netdev *netdev);
346 static int tc_calc_cell_log(unsigned int mtu);
347 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
348 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
349 const struct tc_ratespec *rate);
350 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
/* Per-device state shared by every open handle on one Linux network device. */
352 struct netdev_dev_linux {
353 struct netdev_dev netdev_dev; /* Embedded base; netdev_dev_linux_cast() maps it back to this struct. */
355 struct shash_node *shash_node;
356 unsigned int cache_valid; /* Bitmap of VALID_* bits; netdev_dev_linux_changed() resets it to 0. */
357 unsigned int change_seq; /* Set to 1 when the device is created by netdev_linux_create(). */
359 bool miimon; /* Link status of last poll. */
360 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
361 struct timer miimon_timer; /* Expiry gates the next poll in netdev_linux_miimon_run(). */
363 /* The following are figured out "on demand" only. They are only valid
364 * when the corresponding VALID_* bit in 'cache_valid' is set. */
366 uint8_t etheraddr[ETH_ADDR_LEN];
367 struct in_addr address, netmask;
371 uint32_t kbits_rate; /* Policing data. */
372 uint32_t kbits_burst;
373 bool have_vport_stats; /* Result of the last vport stats attempt; see get_stats_via_vport(). */
377 struct tap_state tap;
381 struct netdev_linux {
382 struct netdev netdev;
386 /* Sockets used for ioctl operations. */
387 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
389 /* A Netlink routing socket that is not subscribed to any multicast groups. */
390 static struct nl_sock *rtnl_sock;
392 /* This is set pretty low because we probably won't learn anything from the
393 * additional log messages. */
394 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
396 static int netdev_linux_init(void);
398 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
399 int cmd, const char *cmd_name);
400 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
401 const char *cmd_name);
402 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
403 int cmd, const char *cmd_name);
404 static int get_flags(const struct netdev *, int *flagsp);
405 static int set_flags(struct netdev *, int flags);
406 static int do_get_ifindex(const char *netdev_name);
407 static int get_ifindex(const struct netdev *, int *ifindexp);
408 static int do_set_addr(struct netdev *netdev,
409 int ioctl_nr, const char *ioctl_name,
410 struct in_addr addr);
411 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
412 static int set_etheraddr(const char *netdev_name, int hwaddr_family,
413 const uint8_t[ETH_ADDR_LEN]);
414 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
415 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
416 static int af_packet_sock(void);
417 static void netdev_linux_miimon_run(void);
418 static void netdev_linux_miimon_wait(void);
/* Tells whether 'netdev_class' uses netdev_linux_init() as its 'init'
 * callback.  The cast helpers in this file assert on this test before
 * converting generic objects to their Linux-specific containers. */
421 is_netdev_linux_class(const struct netdev_class *netdev_class)
423 return netdev_class->init == netdev_linux_init;
/* Converts generic 'netdev_dev' into the struct netdev_dev_linux that embeds
 * it.  Asserts that the device's class passes is_netdev_linux_class(). */
426 static struct netdev_dev_linux *
427 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
429 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
430 assert(is_netdev_linux_class(netdev_class));
432 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
/* Converts generic 'netdev' into the struct netdev_linux that embeds it.
 * Asserts that the class of the underlying device passes
 * is_netdev_linux_class(). */
435 static struct netdev_linux *
436 netdev_linux_cast(const struct netdev *netdev)
438 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
439 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
440 assert(is_netdev_linux_class(netdev_class));
442 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
/* Creates the file-level sockets: 'af_inet_sock' (AF_INET datagram socket
 * used for ioctls) and 'rtnl_sock' (NETLINK_ROUTE socket).  Each failure is
 * logged with the corresponding error text. */
446 netdev_linux_init(void)
448 static int status = -1; /* Static, so the outcome persists across calls. */
450 /* Create AF_INET socket. */
451 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
452 status = af_inet_sock >= 0 ? 0 : errno; /* 0 on success, else errno from socket(). */
454 VLOG_ERR("failed to create inet socket: %s", strerror(status));
457 /* Create rtnetlink socket. */
459 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
461 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
470 netdev_linux_run(void)
472 rtnetlink_link_run();
473 netdev_linux_miimon_run();
477 netdev_linux_wait(void)
479 rtnetlink_link_wait();
480 netdev_linux_miimon_wait();
/* Records that 'dev' has changed.  Clearing 'cache_valid' drops every
 * VALID_* bit, so all cached attributes are re-read the next time they are
 * requested.
 * NOTE(review): the statements that update 'change_seq' around the zero
 * check are not visible in this listing. */
484 netdev_dev_linux_changed(struct netdev_dev_linux *dev)
487 if (!dev->change_seq) {
490 dev->cache_valid = 0;
/* rtnetlink link-change callback; netdev_linux_create() registers it through
 * rtnetlink_link_notifier_create().
 *
 * Two paths are visible: one looks up the single device named by
 * 'change->ifname' and marks it changed if it is handled by this file; the
 * other collects every netdev_linux_class device and marks each one changed.
 * NOTE(review): the condition that selects between the two paths is not
 * visible in this listing. */
494 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
495 void *aux OVS_UNUSED)
497 struct netdev_dev_linux *dev;
499 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
501 const struct netdev_class *netdev_class =
502 netdev_dev_get_class(base_dev);
/* Devices of other classes share the namespace; only touch our own. */
504 if (is_netdev_linux_class(netdev_class)) {
505 dev = netdev_dev_linux_cast(base_dev);
506 netdev_dev_linux_changed(dev);
510 struct shash device_shash;
511 struct shash_node *node;
513 shash_init(&device_shash);
514 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
515 SHASH_FOR_EACH (node, &device_shash) {
517 netdev_dev_linux_changed(dev);
519 shash_destroy(&device_shash);
523 /* Creates system and internal devices. */
525 netdev_linux_create(const struct netdev_class *class, const char *name,
526 struct netdev_dev **netdev_devp)
528 struct netdev_dev_linux *netdev_dev;
/* The link-change notifier is shared: it is created when the reference count
 * is still zero, and every device created here takes one reference. */
530 if (!cache_notifier_refcount) {
531 assert(!netdev_linux_cache_notifier);
533 netdev_linux_cache_notifier =
534 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
536 if (!netdev_linux_cache_notifier) {
540 cache_notifier_refcount++;
/* Zero-filled allocation, so 'cache_valid' starts with no VALID_* bit set. */
542 netdev_dev = xzalloc(sizeof *netdev_dev);
543 netdev_dev->change_seq = 1;
544 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
/* Hand the generic base object back to the caller. */
546 *netdev_devp = &netdev_dev->netdev_dev;
550 /* For most types of netdevs we open the device for each call of
551 * netdev_open(). However, this is not the case with tap devices,
552 * since it is only possible to open the device once. In this
553 * situation we share a single file descriptor, and consequently
554 * buffers, across all readers. Therefore once data is read it will
555 * be unavailable to other reads for tap devices. */
557 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
558 const char *name, struct netdev_dev **netdev_devp)
560 struct netdev_dev_linux *netdev_dev;
561 struct tap_state *state;
562 static const char tap_dev[] = "/dev/net/tun";
566 netdev_dev = xzalloc(sizeof *netdev_dev);
567 state = &netdev_dev->state.tap; /* Tap-specific state lives inside the device. */
569 /* Open tap device. */
570 state->fd = open(tap_dev, O_RDWR);
573 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
577 /* Create tap device. */
578 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
579 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
580 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
581 VLOG_WARN("%s: creating tap device failed: %s", name,
587 /* Make non-blocking. */
588 error = set_nonblocking(state->fd);
/* 'class' is unused: the device is always registered as netdev_tap_class. */
593 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
594 *netdev_devp = &netdev_dev->netdev_dev;
603 destroy_tap(struct netdev_dev_linux *netdev_dev)
605 struct tap_state *state = &netdev_dev->state.tap;
607 if (state->fd >= 0) {
612 /* Destroys the netdev device 'netdev_dev_'. */
614 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
616 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
617 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
/* 'tc_destroy' is optional in struct tc_ops, so test it before calling. */
619 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
620 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
/* System and internal devices each hold one reference on the shared
 * link-change notifier (taken in netdev_linux_create()); the last one to go
 * away tears the notifier down.  Tap devices release their tap state
 * instead. */
623 if (class == &netdev_linux_class || class == &netdev_internal_class) {
624 cache_notifier_refcount--;
626 if (!cache_notifier_refcount) {
627 assert(netdev_linux_cache_notifier);
628 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
629 netdev_linux_cache_notifier = NULL;
631 } else if (class == &netdev_tap_class) {
632 destroy_tap(netdev_dev);
/* Opens a new handle on 'netdev_dev_' and stores it in '*netdevp'.  The first
 * handle opened on a tap device receives the device's tap file descriptor.
 * NOTE(review): the return statements and the error path that leads to
 * netdev_uninit() are not visible in this listing. */
641 netdev_linux_open(struct netdev_dev *netdev_dev_, struct netdev **netdevp)
643 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
644 struct netdev_linux *netdev;
645 enum netdev_flags flags;
648 /* Allocate network device. */
649 netdev = xzalloc(sizeof *netdev);
651 netdev_init(&netdev->netdev, netdev_dev_);
653 /* Verify that the device really exists, by attempting to read its flags.
654 * (The flags might be cached, in which case this won't actually do an
657 * Don't do this for "internal" netdevs, though, because those have to be
658 * created as netdev objects before they exist in the kernel, because
659 * creating them in the kernel happens by passing a netdev object to
660 * dpif_port_add(). */
661 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
662 error = netdev_get_flags(&netdev->netdev, &flags);
663 if (error == ENODEV) {
/* 'opened' records whether the tap FD has already been handed out. */
668 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
669 !netdev_dev->state.tap.opened) {
671 /* We assume that the first user of the tap device is the primary user
672 * and give them the tap FD. Subsequent users probably just expect
673 * this to be a system device so open it normally to avoid send/receive
674 * directions appearing to be reversed. */
675 netdev->fd = netdev_dev->state.tap.fd;
676 netdev_dev->state.tap.opened = true;
679 *netdevp = &netdev->netdev;
683 netdev_uninit(&netdev->netdev, true);
687 /* Closes and destroys 'netdev'. */
689 netdev_linux_close(struct netdev *netdev_)
691 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* Everywhere else in this file a negative 'fd' means "no descriptor", so 0
 * is a valid descriptor and must be included here.  Handles of type "tap"
 * are excluded because their descriptor is the shared one owned by the
 * device (see netdev_linux_open()). */
693 if (netdev->fd >= 0 && strcmp(netdev_get_type(netdev_), "tap")) {
699 /* Adds to 'sset' the names of all network devices reported by
 * if_nameindex().  Logs a warning if the list cannot be obtained. */
701 netdev_linux_enumerate(struct sset *sset)
703 struct if_nameindex *names;
705 names = if_nameindex();
/* The loop stops at the first entry whose 'if_name' is NULL. */
709 for (i = 0; names[i].if_name != NULL; i++) {
710 sset_add(sset, names[i].if_name);
712 if_freenameindex(names);
715 VLOG_WARN("could not obtain list of network device names: %s",
/* Prepares 'netdev_' for receiving packets by creating a non-blocking
 * PF_PACKET raw socket and binding it to the device's ifindex. */
722 netdev_linux_listen(struct netdev *netdev_)
724 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
725 struct sockaddr_ll sll;
/* A non-negative 'fd' means a descriptor is already in place. */
730 if (netdev->fd >= 0) {
734 /* Create file descriptor. */
735 fd = socket(PF_PACKET, SOCK_RAW, 0);
738 VLOG_ERR("failed to create raw socket (%s)", strerror(error));
742 /* Set non-blocking mode. */
743 error = set_nonblocking(fd);
748 /* Get ethernet device index. */
749 error = get_ifindex(&netdev->netdev, &ifindex);
754 /* Bind to specific ethernet device. */
755 memset(&sll, 0, sizeof sll);
756 sll.sll_family = AF_PACKET;
757 sll.sll_ifindex = ifindex;
758 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
759 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
761 VLOG_ERR("%s: failed to bind raw socket (%s)",
762 netdev_get_name(netdev_), strerror(error));
/* Reads from 'netdev_''s file descriptor into the 'size' bytes at 'data'.
 * A negative descriptor means the device is not listening.  read() errors
 * other than EINTR and EAGAIN are logged, rate-limited.
 * NOTE(review): the return statements are not visible in this listing. */
777 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
779 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
781 if (netdev->fd < 0) {
782 /* Device is not listening. */
787 ssize_t retval = read(netdev->fd, data, size);
790 } else if (errno != EINTR) {
791 if (errno != EAGAIN) {
/* Argument order must match the format string: device name first, then the
 * error text (same order as the warning in netdev_linux_send()). */
792 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
793 netdev_get_name(netdev_), strerror(errno));
800 /* Registers with the poll loop to wake up from the next call to poll_block()
801 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
803 netdev_linux_recv_wait(struct netdev *netdev_)
805 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
806 if (netdev->fd >= 0) {
807 poll_fd_wait(netdev->fd, POLLIN);
811 /* Discards all packets waiting to be received from 'netdev'. */
813 netdev_linux_drain(struct netdev *netdev_)
815 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
816 if (netdev->fd < 0) {
/* Tap handles: query the device's transmit queue length and pass it to
 * drain_fd() together with the descriptor.  All other handles drain the
 * socket's receive buffer. */
818 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
820 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
821 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
825 drain_fd(netdev->fd, ifr.ifr_qlen);
828 return drain_rcvbuf(netdev->fd);
832 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
833 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
834 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
835 * the packet is too big or too small to transmit on the device.
837 * The caller retains ownership of 'buffer' in all cases.
839 * The kernel maintains a packet transmission queue, so the caller is not
840 * expected to do additional queuing of packets. */
842 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
844 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* Handles without a descriptor of their own send through the socket
 * returned by af_packet_sock(), addressed by ifindex. */
848 if (netdev->fd < 0) {
849 /* Use our AF_PACKET socket to send to this device. */
850 struct sockaddr_ll sll;
857 sock = af_packet_sock();
862 error = get_ifindex(netdev_, &ifindex);
867 /* We don't bother setting most fields in sockaddr_ll because the
868 * kernel ignores them for SOCK_RAW. */
869 memset(&sll, 0, sizeof sll);
870 sll.sll_family = AF_PACKET;
871 sll.sll_ifindex = ifindex;
/* The cast only drops 'const' to fit iov_base's type. */
873 iov.iov_base = (void *) data;
877 msg.msg_namelen = sizeof sll;
880 msg.msg_control = NULL;
881 msg.msg_controllen = 0;
884 retval = sendmsg(sock, &msg, 0);
886 /* Use the netdev's own fd to send to this device. This is
887 * essential for tap devices, because packets sent to a tap device
888 * with an AF_PACKET socket will loop back to be *received* again
889 * on the tap device. */
890 retval = write(netdev->fd, data, size);
894 /* The Linux AF_PACKET implementation never blocks waiting for room
895 * for packets, instead returning ENOBUFS. Translate this into
896 * EAGAIN for the caller. */
897 if (errno == ENOBUFS) {
899 } else if (errno == EINTR) {
/* EAGAIN is expected under load, so it is not logged. */
901 } else if (errno != EAGAIN) {
902 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
903 netdev_get_name(netdev_), strerror(errno));
/* A short write/sendmsg is reported as a partial packet. */
906 } else if (retval != size) {
907 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
908 "%zu) on %s", retval, size, netdev_get_name(netdev_));
916 /* Registers with the poll loop to wake up from the next call to poll_block()
917 * when the packet transmission queue has sufficient room to transmit a packet
918 * with netdev_send().
920 * The kernel maintains a packet transmission queue, so the client is not
921 * expected to do additional queuing of packets. Thus, this function is
922 * unlikely to ever be used. It is included for completeness. */
924 netdev_linux_send_wait(struct netdev *netdev_)
926 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
927 if (netdev->fd < 0) {
/* strcmp() is nonzero for every type other than "tap": those wait for the
 * descriptor to become writable. */
929 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
930 poll_fd_wait(netdev->fd, POLLOUT);
932 /* TAP device always accepts packets.*/
933 poll_immediate_wake();
937 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
938 * otherwise a positive errno value. */
940 netdev_linux_set_etheraddr(struct netdev *netdev_,
941 const uint8_t mac[ETH_ADDR_LEN])
943 struct netdev_dev_linux *netdev_dev =
944 netdev_dev_linux_cast(netdev_get_dev(netdev_));
/* Skip the kernel call when the cached address is valid and already equal
 * to 'mac'. */
947 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
948 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
949 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
/* Remember the new address so that later reads can use the cache. */
951 netdev_dev->cache_valid |= VALID_ETHERADDR;
952 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
960 /* Copies 'netdev''s MAC address into 'mac'. The address is fetched with
961 * get_etheraddr() only when the cached copy is not marked valid. */
963 netdev_linux_get_etheraddr(const struct netdev *netdev_,
964 uint8_t mac[ETH_ADDR_LEN])
966 struct netdev_dev_linux *netdev_dev =
967 netdev_dev_linux_cast(netdev_get_dev(netdev_));
968 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
969 int error = get_etheraddr(netdev_get_name(netdev_),
970 netdev_dev->etheraddr);
974 netdev_dev->cache_valid |= VALID_ETHERADDR;
/* Serve the caller from the (now valid) cached copy. */
976 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
980 /* Stores in '*mtup' the maximum size of transmitted (and received) packets
981 * on 'netdev', in bytes, not including the hardware header; thus, this is
982 * typically 1500 bytes for Ethernet devices. */
984 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
986 struct netdev_dev_linux *netdev_dev =
987 netdev_dev_linux_cast(netdev_get_dev(netdev_));
/* Only query the kernel when no valid MTU is cached. */
988 if (!(netdev_dev->cache_valid & VALID_MTU)) {
992 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
993 SIOCGIFMTU, "SIOCGIFMTU");
997 netdev_dev->mtu = ifr.ifr_mtu;
998 netdev_dev->cache_valid |= VALID_MTU;
1000 *mtup = netdev_dev->mtu;
1004 /* Sets the maximum transmission unit (MTU) of the given device using the
1005 * Linux networking ioctl interface, then caches the new value.
1008 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1010 struct netdev_dev_linux *netdev_dev =
1011 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1016 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1017 SIOCSIFMTU, "SIOCSIFMTU");
1022 netdev_dev->mtu = ifr.ifr_mtu;
1023 netdev_dev->cache_valid |= VALID_MTU;
1027 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1028 * On failure, returns a negative errno value. */
1030 netdev_linux_get_ifindex(const struct netdev *netdev)
1034 error = get_ifindex(netdev, &ifindex);
/* get_ifindex() reports a positive errno; negate it so that failures cannot
 * be confused with a valid (positive) ifindex. */
1035 return error ? -error : ifindex;
/* Stores 'netdev_''s carrier status in '*carrier'.  When MII monitoring is
 * enabled (positive 'miimon_interval') the last polled MII link status is
 * used; otherwise the value comes from /sys/class/net/<name>/carrier and is
 * cached under VALID_CARRIER. */
1039 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1041 struct netdev_dev_linux *netdev_dev =
1042 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1047 if (netdev_dev->miimon_interval > 0) {
1048 *carrier = netdev_dev->miimon;
1052 if (!(netdev_dev->cache_valid & VALID_CARRIER)) {
1056 fn = xasprintf("/sys/class/net/%s/carrier",
1057 netdev_get_name(netdev_));
1058 fd = open(fn, O_RDONLY);
1061 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
1065 retval = read(fd, line, sizeof line);
1068 if (error == EINVAL) {
1069 /* This is the normal return value when we try to check carrier
1070 * if the network device is not up. */
1072 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
1075 } else if (retval == 0) {
1077 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
/* Only the first character of the file is interpreted. */
1081 if (line[0] != '0' && line[0] != '1') {
1083 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)",
1087 netdev_dev->carrier = line[0] != '0';
1088 netdev_dev->cache_valid |= VALID_CARRIER;
1090 *carrier = netdev_dev->carrier;
/* Issues MII ioctl 'cmd' (named 'cmd_name' for logging) on device 'name'.
 * '*data' is both the request and, after the call, the reply. */
1102 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1103 struct mii_ioctl_data *data)
1108 memset(&ifr, 0, sizeof ifr);
/* The MII data is copied into the storage of the 'ifr_data' member itself
 * (note the '&'), not into memory that 'ifr_data' points to, and is copied
 * back out the same way once the ioctl has run. */
1109 memcpy(&ifr.ifr_data, data, sizeof *data);
1110 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1111 memcpy(data, &ifr.ifr_data, sizeof *data);
/* Determines the link status of device 'name' and stores it in '*miimon'.
 * The MII status register is tried first (SIOCGMIIPHY followed by
 * SIOCGMIIREG for MII_BMSR); the log messages show that ethtool's
 * ETHTOOL_GLINK serves as the fallback when the MII query fails. */
1117 netdev_linux_get_miimon(const char *name, bool *miimon)
1119 struct mii_ioctl_data data;
1124 memset(&data, 0, sizeof data);
1125 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1127 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1128 data.reg_num = MII_BMSR;
1129 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
/* Link is up when the BMSR link-status bit is set in the register value. */
1133 *miimon = !!(data.val_out & BMSR_LSTATUS);
1135 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1138 struct ethtool_cmd ecmd;
1140 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1143 memset(&ecmd, 0, sizeof ecmd);
1144 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1147 struct ethtool_value eval;
/* The reply was written into 'ecmd'; reinterpret its leading bytes as a
 * struct ethtool_value to read the link flag. */
1149 memcpy(&eval, &ecmd, sizeof eval);
1150 *miimon = !!eval.data;
1152 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
/* Sets the MII polling interval of 'netdev_' to 'interval'.  Positive values
 * are raised to at least 100; zero and negative values become 0, which
 * disables polling. */
1160 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1161 long long int interval)
1163 struct netdev_dev_linux *netdev_dev;
1165 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
1167 interval = interval > 0 ? MAX(interval, 100) : 0;
/* On a change, expire the timer so that the new setting is picked up by the
 * next netdev_linux_miimon_run() instead of after the old interval. */
1168 if (netdev_dev->miimon_interval != interval) {
1169 netdev_dev->miimon_interval = interval;
1170 timer_set_expired(&netdev_dev->miimon_timer);
/* Polls MII link status for every netdev_linux_class device whose polling is
 * enabled and whose timer has expired.  A device whose status differs from
 * the previous poll is marked changed.  Called from netdev_linux_run(). */
1177 netdev_linux_miimon_run(void)
1179 struct shash device_shash;
1180 struct shash_node *node;
1182 shash_init(&device_shash);
1183 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1184 SHASH_FOR_EACH (node, &device_shash) {
1185 struct netdev_dev_linux *dev = node->data;
/* Polling is disabled for this device, or it is not yet due. */
1188 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1192 netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
1193 if (miimon != dev->miimon) {
1194 dev->miimon = miimon;
1195 netdev_dev_linux_changed(dev);
/* Schedule the next poll one interval from now. */
1198 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1201 shash_destroy(&device_shash);
/* Registers a wake-up on the MII polling timer of every netdev_linux_class
 * device that has polling enabled.  Called from netdev_linux_wait(). */
1205 netdev_linux_miimon_wait(void)
1207 struct shash device_shash;
1208 struct shash_node *node;
1210 shash_init(&device_shash);
1211 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1212 SHASH_FOR_EACH (node, &device_shash) {
1213 struct netdev_dev_linux *dev = node->data;
1215 if (dev->miimon_interval > 0) {
1216 timer_wait(&dev->miimon_timer);
1219 shash_destroy(&device_shash);
1222 /* Checks whether we can use RTM_GETLINK to get network device statistics.
1223 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1226 check_for_working_netlink_stats(void)
1228 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1229 * preferable, so if that works, we'll use it. */
1230 int ifindex = do_get_ifindex("lo"); /* The probe is made against "lo". */
1232 VLOG_WARN("failed to get ifindex for lo, "
1233 "obtaining netdev stats from proc");
1236 struct netdev_stats stats;
1237 int error = get_stats_via_netlink(ifindex, &stats);
1239 VLOG_DBG("obtaining netdev stats via rtnetlink");
1242 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1243 "via proc (you are probably running a pre-2.6.19 "
1244 "kernel)", strerror(error));
/* Helper taking two uint64_t pointers.  Its body is not visible in this
 * excerpt; judging by the name and by its use on rx/tx counter pairs in
 * netdev_pseudo_get_stats(), it presumably exchanges '*a' and '*b' --
 * TODO confirm. */
1251 swap_uint64(uint64_t *a, uint64_t *b)
/* Tries to fill 'stats' for 'netdev_' from the vport layer.
 *
 * The query is made when vport stats are already known to work, or when that
 * has not been determined yet (VALID_HAVE_VPORT_STATS not set in
 * 'cache_valid').  The outcome is cached: 'have_vport_stats' becomes true
 * exactly when netdev_vport_get_stats() returned 0, and
 * VALID_HAVE_VPORT_STATS is set. */
1259 get_stats_via_vport(const struct netdev *netdev_,
1260 struct netdev_stats *stats)
1262 struct netdev_dev_linux *netdev_dev =
1263 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1265 if (netdev_dev->have_vport_stats ||
1266 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1269 error = netdev_vport_get_stats(netdev_, stats);
1271 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed %d",
1272 netdev_get_name(netdev_), error);
/* Remember whether the vport path works for this device. */
1274 netdev_dev->have_vport_stats = !error;
1275 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
/* Retrieves kernel statistics for 'netdev_' into 'stats', using rtnetlink
 * (get_stats_via_netlink()) when it works and get_stats_via_proc() otherwise.
 *
 * Which path to use is probed once, on first call, with
 * check_for_working_netlink_stats() and remembered in a function-local
 * static.  A failure is logged with the error code; the return statement is
 * not visible in this excerpt. */
1280 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1281 struct netdev_stats *stats)
/* -1 means "not probed yet". */
1283 static int use_netlink_stats = -1;
1286 if (use_netlink_stats < 0) {
1287 use_netlink_stats = check_for_working_netlink_stats();
1290 if (use_netlink_stats) {
1293 error = get_ifindex(netdev_, &ifindex);
1295 error = get_stats_via_netlink(ifindex, stats);
1298 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1302 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1303 netdev_get_name(netdev_), error);
1309 /* Retrieves current device stats for 'netdev-linux'.
 *
 * Queries both the vport layer (into 'stats') and the kernel (into a local
 * 'dev_stats').  'have_vport_stats' selects between two branches; the error
 * and drop counters listed below are added from the kernel stats into
 * 'stats'.  Several lines of the branching, including the error handling,
 * are not visible in this excerpt. */
1311 netdev_linux_get_stats(const struct netdev *netdev_,
1312 struct netdev_stats *stats)
1314 struct netdev_dev_linux *netdev_dev =
1315 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1316 struct netdev_stats dev_stats;
1319 get_stats_via_vport(netdev_, stats);
1321 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1324 if (!netdev_dev->have_vport_stats) {
1331 if (!netdev_dev->have_vport_stats) {
1332 /* stats not available from OVS then use ioctl stats. */
/* Kernel-only counters accumulated on top of what 'stats' already holds. */
1335 stats->rx_errors += dev_stats.rx_errors;
1336 stats->tx_errors += dev_stats.tx_errors;
1337 stats->rx_dropped += dev_stats.rx_dropped;
1338 stats->tx_dropped += dev_stats.tx_dropped;
1339 stats->multicast += dev_stats.multicast;
1340 stats->collisions += dev_stats.collisions;
1341 stats->rx_length_errors += dev_stats.rx_length_errors;
1342 stats->rx_over_errors += dev_stats.rx_over_errors;
1343 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1344 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1345 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1346 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1347 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1348 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1349 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1350 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1351 stats->tx_window_errors += dev_stats.tx_window_errors;
1356 /* Retrieves current device stats for 'netdev-tap' netdev or
1357 * netdev-internal.
 *
 * Like netdev_linux_get_stats(), this queries both the vport layer (into
 * 'stats') and the kernel (into 'dev_stats').  When the kernel stats are the
 * source, rx and tx counters are exchanged and the detailed error counters
 * are zeroed; the remaining statements add kernel tx counters to rx and vice
 * versa.  The lines separating the two cases are not visible in this
 * excerpt. */
1359 netdev_pseudo_get_stats(const struct netdev *netdev_,
1360 struct netdev_stats *stats)
1362 struct netdev_dev_linux *netdev_dev =
1363 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1364 struct netdev_stats dev_stats;
1367 get_stats_via_vport(netdev_, stats);
1369 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1371 if (!netdev_dev->have_vport_stats) {
1378 /* If this port is an internal port then the transmit and receive stats
1379 * will appear to be swapped relative to the other ports since we are the
1380 * one sending the data, not a remote computer. For consistency, we swap
1381 * them back here. This does not apply if we are getting stats from the
1382 * vport layer because it always tracks stats from the perspective of the
 * switch. */
1384 if (!netdev_dev->have_vport_stats) {
1386 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1387 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1388 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1389 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
/* Direction-specific error details are cleared rather than swapped. */
1390 stats->rx_length_errors = 0;
1391 stats->rx_over_errors = 0;
1392 stats->rx_crc_errors = 0;
1393 stats->rx_frame_errors = 0;
1394 stats->rx_fifo_errors = 0;
1395 stats->rx_missed_errors = 0;
1396 stats->tx_aborted_errors = 0;
1397 stats->tx_carrier_errors = 0;
1398 stats->tx_fifo_errors = 0;
1399 stats->tx_heartbeat_errors = 0;
1400 stats->tx_window_errors = 0;
/* Kernel counters are folded in crosswise: kernel tx feeds rx and vice
 * versa. */
1402 stats->rx_dropped += dev_stats.tx_dropped;
1403 stats->tx_dropped += dev_stats.rx_dropped;
1405 stats->rx_errors += dev_stats.tx_errors;
1406 stats->tx_errors += dev_stats.rx_errors;
1408 stats->multicast += dev_stats.multicast;
1409 stats->collisions += dev_stats.collisions;
1414 /* Stores the features supported by 'netdev' into each of '*current',
1415 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1416 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
1417 * successful, otherwise a positive errno value.
 *
 * The data comes from a single ETHTOOL_GSET query; the SUPPORTED_*,
 * ADVERTISED_*, speed/duplex and port fields of the reply are translated to
 * OFPPF_* bits.  '*peer' is always set to 0.
 *
 * NOTE(review): no NULL checks on the output pointers are visible in this
 * excerpt, although the text above says "that are non-null" -- confirm that
 * callers never pass NULL. */
1419 netdev_linux_get_features(const struct netdev *netdev,
1420 uint32_t *current, uint32_t *advertised,
1421 uint32_t *supported, uint32_t *peer)
1423 struct ethtool_cmd ecmd;
1426 memset(&ecmd, 0, sizeof ecmd);
1427 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1428 ETHTOOL_GSET, "ETHTOOL_GSET");
1433 /* Supported features. */
1435 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1436 *supported |= OFPPF_10MB_HD;
1438 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1439 *supported |= OFPPF_10MB_FD;
1441 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1442 *supported |= OFPPF_100MB_HD;
1444 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1445 *supported |= OFPPF_100MB_FD;
1447 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1448 *supported |= OFPPF_1GB_HD;
1450 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1451 *supported |= OFPPF_1GB_FD;
1453 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1454 *supported |= OFPPF_10GB_FD;
1456 if (ecmd.supported & SUPPORTED_TP) {
1457 *supported |= OFPPF_COPPER;
1459 if (ecmd.supported & SUPPORTED_FIBRE) {
1460 *supported |= OFPPF_FIBER;
1462 if (ecmd.supported & SUPPORTED_Autoneg) {
1463 *supported |= OFPPF_AUTONEG;
1465 if (ecmd.supported & SUPPORTED_Pause) {
1466 *supported |= OFPPF_PAUSE;
1468 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1469 *supported |= OFPPF_PAUSE_ASYM;
1472 /* Advertised features. */
1474 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1475 *advertised |= OFPPF_10MB_HD;
1477 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1478 *advertised |= OFPPF_10MB_FD;
1480 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1481 *advertised |= OFPPF_100MB_HD;
1483 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1484 *advertised |= OFPPF_100MB_FD;
1486 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1487 *advertised |= OFPPF_1GB_HD;
1489 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1490 *advertised |= OFPPF_1GB_FD;
1492 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1493 *advertised |= OFPPF_10GB_FD;
1495 if (ecmd.advertising & ADVERTISED_TP) {
1496 *advertised |= OFPPF_COPPER;
1498 if (ecmd.advertising & ADVERTISED_FIBRE) {
1499 *advertised |= OFPPF_FIBER;
1501 if (ecmd.advertising & ADVERTISED_Autoneg) {
1502 *advertised |= OFPPF_AUTONEG;
1504 if (ecmd.advertising & ADVERTISED_Pause) {
1505 *advertised |= OFPPF_PAUSE;
1507 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1508 *advertised |= OFPPF_PAUSE_ASYM;
1511 /* Current settings. */
/* Speed selects the bit; duplex picks the full- or half-duplex variant
 * (10G is always reported as full duplex). */
1512 if (ecmd.speed == SPEED_10) {
1513 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1514 } else if (ecmd.speed == SPEED_100) {
1515 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1516 } else if (ecmd.speed == SPEED_1000) {
1517 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1518 } else if (ecmd.speed == SPEED_10000) {
1519 *current = OFPPF_10GB_FD;
1524 if (ecmd.port == PORT_TP) {
1525 *current |= OFPPF_COPPER;
1526 } else if (ecmd.port == PORT_FIBRE) {
1527 *current |= OFPPF_FIBER;
1531 *current |= OFPPF_AUTONEG;
1534 /* Peer advertisements. */
1535 *peer = 0; /* XXX */
1540 /* Set the features advertised by 'netdev' to 'advertise'.
 *
 * 'advertise' is a bitmap of OFPPF_* bits.  The current settings are read
 * with ETHTOOL_GSET, the 'advertising' field is rebuilt from scratch out of
 * the corresponding ADVERTISED_* bits, and the result is written back with
 * ETHTOOL_SSET, whose status is returned. */
1542 netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1544 struct ethtool_cmd ecmd;
1547 memset(&ecmd, 0, sizeof ecmd);
1548 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1549 ETHTOOL_GSET, "ETHTOOL_GSET");
/* Start from an empty mask so that bits absent from 'advertise' are
 * cleared. */
1554 ecmd.advertising = 0;
1555 if (advertise & OFPPF_10MB_HD) {
1556 ecmd.advertising |= ADVERTISED_10baseT_Half;
1558 if (advertise & OFPPF_10MB_FD) {
1559 ecmd.advertising |= ADVERTISED_10baseT_Full;
1561 if (advertise & OFPPF_100MB_HD) {
1562 ecmd.advertising |= ADVERTISED_100baseT_Half;
1564 if (advertise & OFPPF_100MB_FD) {
1565 ecmd.advertising |= ADVERTISED_100baseT_Full;
1567 if (advertise & OFPPF_1GB_HD) {
1568 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1570 if (advertise & OFPPF_1GB_FD) {
1571 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1573 if (advertise & OFPPF_10GB_FD) {
1574 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1576 if (advertise & OFPPF_COPPER) {
1577 ecmd.advertising |= ADVERTISED_TP;
1579 if (advertise & OFPPF_FIBER) {
1580 ecmd.advertising |= ADVERTISED_FIBRE;
1582 if (advertise & OFPPF_AUTONEG) {
1583 ecmd.advertising |= ADVERTISED_Autoneg;
1585 if (advertise & OFPPF_PAUSE) {
1586 ecmd.advertising |= ADVERTISED_Pause;
1588 if (advertise & OFPPF_PAUSE_ASYM) {
1589 ecmd.advertising |= ADVERTISED_Asym_Pause;
1591 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1592 ETHTOOL_SSET, "ETHTOOL_SSET");
1595 /* If 'netdev_name' is the name of a VLAN network device (e.g. one created with
1596 * vconfig(8)), sets '*vlan_vid' to the VLAN VID associated with that device
1597 * and returns 0. Otherwise returns a errno value (specifically ENOENT if
1598 * 'netdev_name' is the name of a network device that is not a VLAN device) and
1599 * sets '*vlan_vid' to -1. */
1601 netdev_linux_get_vlan_vid(const struct netdev *netdev, int *vlan_vid)
1603 const char *netdev_name = netdev_get_name(netdev);
1604 struct ds line = DS_EMPTY_INITIALIZER;
1605 FILE *stream = NULL;
1609 COVERAGE_INC(netdev_get_vlan_vid);
1610 fn = xasprintf("/proc/net/vlan/%s", netdev_name);
1611 stream = fopen(fn, "r");
1617 if (ds_get_line(&line, stream)) {
1618 if (ferror(stream)) {
1620 VLOG_ERR_RL(&rl, "error reading \"%s\": %s", fn, strerror(errno));
1623 VLOG_ERR_RL(&rl, "unexpected end of file reading \"%s\"", fn);
1628 if (!sscanf(ds_cstr(&line), "%*s VID: %d", vlan_vid)) {
1630 VLOG_ERR_RL(&rl, "parse error reading \"%s\" line 1: \"%s\"",
1631 fn, ds_cstr(&line));
/* Command-line templates for installing ingress policing with tc(8).
 *
 * POLICE_ADD_CMD takes the device name (%s).
 *
 * POLICE_CONFIG_CMD takes the device name (%s) followed by the rate in kbit/s
 * and the burst size, both passed as uint32_t.  They are formatted with %u:
 * with %d, values above INT_MAX would be printed as negative numbers.  (This
 * file is Linux-only, where uint32_t is unsigned int.) */
#define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress"
#define POLICE_CONFIG_CMD                                                   \
    "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 "          \
    "u32 match ip src 0.0.0.0/0 police rate %ukbit burst %uk "              \
    "mtu 65535 drop flowid :1"
1652 /* Remove ingress policing from 'netdev'. Returns 0 if successful, otherwise a
1653 * positive errno value.
1655 * This function is equivalent to running
1656 * /sbin/tc qdisc del dev %s handle ffff: ingress
1657 * but it is much, much faster.
 *
 * The request is an RTM_DELQDISC for the "ingress" qdisc (handle ffff:0,
 * parent TC_H_INGRESS) sent with tc_transact().  ENOENT and EINVAL replies
 * are not logged.  The cached policing settings are reset to 0 and marked
 * valid. */
1660 netdev_linux_remove_policing(struct netdev *netdev)
1662 struct netdev_dev_linux *netdev_dev =
1663 netdev_dev_linux_cast(netdev_get_dev(netdev));
1664 const char *netdev_name = netdev_get_name(netdev);
1666 struct ofpbuf request;
1667 struct tcmsg *tcmsg;
1670 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
1674 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
1675 tcmsg->tcm_parent = TC_H_INGRESS;
1676 nl_msg_put_string(&request, TCA_KIND, "ingress");
1677 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
1679 error = tc_transact(&request, NULL);
/* ENOENT/EINVAL are not warned about -- presumably they mean that no
 * ingress qdisc was installed (TODO confirm). */
1680 if (error && error != ENOENT && error != EINVAL) {
1681 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1682 netdev_name, strerror(error));
/* Record "no policing" as the known state. */
1686 netdev_dev->kbits_rate = 0;
1687 netdev_dev->kbits_burst = 0;
1688 netdev_dev->cache_valid |= VALID_POLICING;
1692 /* Attempts to set input rate limiting (policing) policy.
 *
 * 'kbits_rate' and 'kbits_burst' are the requested rate and burst.  A zero
 * rate forces the burst to 0; a zero burst with a nonzero rate defaults to
 * 1000.  If the cached settings already match, nothing is reconfigured.
 * Otherwise existing policing is removed and the POLICE_ADD_CMD and
 * POLICE_CONFIG_CMD command lines are run; on success the new values are
 * cached.
 *
 * NOTE(review): the commands are built by interpolating the device name and
 * run through system(), i.e. interpreted by a shell.  A device name with
 * shell metacharacters would be executed as shell syntax; consider issuing
 * the request over netlink as netdev_linux_remove_policing() does. */
1694 netdev_linux_set_policing(struct netdev *netdev,
1695 uint32_t kbits_rate, uint32_t kbits_burst)
1697 struct netdev_dev_linux *netdev_dev =
1698 netdev_dev_linux_cast(netdev_get_dev(netdev));
1699 const char *netdev_name = netdev_get_name(netdev);
1702 COVERAGE_INC(netdev_set_policing);
1704 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1705 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1706 : kbits_burst); /* Stick with user-specified value. */
1708 if (netdev_dev->cache_valid & VALID_POLICING
1709 && netdev_dev->kbits_rate == kbits_rate
1710 && netdev_dev->kbits_burst == kbits_burst) {
1711 /* Assume that settings haven't changed since we last set them. */
/* Always start from a clean slate before installing new policing. */
1715 netdev_linux_remove_policing(netdev);
1717 snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
1718 if (system(command) != 0) {
1719 VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
1723 snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
1724 kbits_rate, kbits_burst);
1725 if (system(command) != 0) {
1726 VLOG_WARN_RL(&rl, "%s: problem configuring policing",
/* Cache what was just installed so an identical request is a no-op. */
1731 netdev_dev->kbits_rate = kbits_rate;
1732 netdev_dev->kbits_burst = kbits_burst;
1733 netdev_dev->cache_valid |= VALID_POLICING;
/* Adds to 'types' the OVS name of every entry in 'tcs' that has a
 * 'tc_install' callback and a non-empty 'ovs_name'.  'netdev' is unused. */
1740 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1743 const struct tc_ops **opsp;
/* 'tcs' is a NULL-terminated array of tc_ops pointers. */
1745 for (opsp = tcs; *opsp != NULL; opsp++) {
1746 const struct tc_ops *ops = *opsp;
1747 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1748 sset_add(types, ops->ovs_name);
/* Searches the NULL-terminated 'tcs' array for the entry whose 'ovs_name'
 * equals 'name'.  The return statements are not visible in this excerpt;
 * callers treat a null result as "not found". */
1754 static const struct tc_ops *
1755 tc_lookup_ovs_name(const char *name)
1757 const struct tc_ops **opsp;
1759 for (opsp = tcs; *opsp != NULL; opsp++) {
1760 const struct tc_ops *ops = *opsp;
1761 if (!strcmp(name, ops->ovs_name)) {
/* Searches the NULL-terminated 'tcs' array for the entry whose 'linux_name'
 * equals 'name'.  Entries with a null 'linux_name' are skipped.  The return
 * statements are not visible in this excerpt. */
1768 static const struct tc_ops *
1769 tc_lookup_linux_name(const char *name)
1771 const struct tc_ops **opsp;
1773 for (opsp = tcs; *opsp != NULL; opsp++) {
1774 const struct tc_ops *ops = *opsp;
/* 'linux_name' may be null, so test it before comparing. */
1775 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Looks up the queue with id 'queue_id' in the 'queues' hmap of 'netdev''s
 * traffic-control state, scanning only the bucket selected by 'hash'.  The
 * caller supplies 'hash'; tc_find_queue() computes it from 'queue_id'.  The
 * return statements are not visible in this excerpt. */
1782 static struct tc_queue *
1783 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1786 struct netdev_dev_linux *netdev_dev =
1787 netdev_dev_linux_cast(netdev_get_dev(netdev));
1788 struct tc_queue *queue;
1790 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1791 if (queue->queue_id == queue_id) {
/* Convenience wrapper around tc_find_queue__() that derives the hash from
 * 'queue_id' with hash_int(queue_id, 0). */
1798 static struct tc_queue *
1799 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1801 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Reports the capabilities of QoS type 'type' in '*caps': 'n_queues' is
 * copied from the tc_ops entry found by tc_lookup_ovs_name().  'netdev' is
 * unused.  Handling of an unknown 'type' is not visible in this excerpt. */
1805 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1807 struct netdev_qos_capabilities *caps)
1809 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1813 caps->n_queues = ops->n_queues;
/* Queries the qdisc currently configured on 'netdev'.  After
 * tc_query_qdisc(), '*typep' is set to the OVS name of the active tc_ops and,
 * if that tc_ops provides 'qdisc_get', the callback is invoked to fill
 * 'details'.  Error handling after tc_query_qdisc() and the fallback value
 * of the conditional are not visible in this excerpt. */
1818 netdev_linux_get_qos(const struct netdev *netdev,
1819 const char **typep, struct shash *details)
1821 struct netdev_dev_linux *netdev_dev =
1822 netdev_dev_linux_cast(netdev_get_dev(netdev));
1825 error = tc_query_qdisc(netdev);
1830 *typep = netdev_dev->tc->ops->ovs_name;
1831 return (netdev_dev->tc->ops->qdisc_get
1832 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
/* Configures QoS type 'type' with parameters 'details' on 'netdev'.
 *
 * 'type' must name a tc_ops entry that has a 'tc_install' callback.  If the
 * requested type is already the active one, only its 'qdisc_set' callback
 * (when present) is invoked.  Otherwise the existing qdisc is deleted and
 * the new one is installed with 'tc_install'.  The error returns between
 * these steps are not visible in this excerpt. */
1837 netdev_linux_set_qos(struct netdev *netdev,
1838 const char *type, const struct shash *details)
1840 struct netdev_dev_linux *netdev_dev =
1841 netdev_dev_linux_cast(netdev_get_dev(netdev));
1842 const struct tc_ops *new_ops;
1845 new_ops = tc_lookup_ovs_name(type);
1846 if (!new_ops || !new_ops->tc_install) {
1850 error = tc_query_qdisc(netdev);
/* Same qdisc type: reconfigure in place rather than replacing it. */
1855 if (new_ops == netdev_dev->tc->ops) {
1856 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1858 /* Delete existing qdisc. */
1859 error = tc_del_qdisc(netdev);
/* A successful delete must leave no tc state attached to the device. */
1863 assert(netdev_dev->tc == NULL);
1865 /* Install new qdisc. */
1866 error = new_ops->tc_install(netdev, details);
/* Invariant: tc state is attached exactly when the install succeeded. */
1867 assert((error == 0) == (netdev_dev->tc != NULL));
/* Retrieves the configuration of queue 'queue_id' on 'netdev' into 'details'
 * by locating the queue with tc_find_queue() and calling the active tc_ops'
 * 'class_get' callback.  The condition guarding that call and the error
 * paths are not visible in this excerpt. */
1874 netdev_linux_get_queue(const struct netdev *netdev,
1875 unsigned int queue_id, struct shash *details)
1877 struct netdev_dev_linux *netdev_dev =
1878 netdev_dev_linux_cast(netdev_get_dev(netdev));
1881 error = tc_query_qdisc(netdev);
1885 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1887 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
/* Configures queue 'queue_id' on 'netdev' from 'details' through the active
 * tc_ops' 'class_set' callback.  A 'queue_id' at or beyond the tc_ops'
 * 'n_queues', or a tc_ops without 'class_set', takes a separate branch whose
 * body (presumably an error return) is not visible in this excerpt. */
1893 netdev_linux_set_queue(struct netdev *netdev,
1894 unsigned int queue_id, const struct shash *details)
1896 struct netdev_dev_linux *netdev_dev =
1897 netdev_dev_linux_cast(netdev_get_dev(netdev));
1900 error = tc_query_qdisc(netdev);
1903 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1904 || !netdev_dev->tc->ops->class_set) {
1908 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
/* Deletes queue 'queue_id' from 'netdev' through the active tc_ops'
 * 'class_delete' callback, after locating the queue with tc_find_queue().
 * A tc_ops without 'class_delete' takes a separate branch; its body and the
 * condition guarding the callback are not visible in this excerpt. */
1912 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1914 struct netdev_dev_linux *netdev_dev =
1915 netdev_dev_linux_cast(netdev_get_dev(netdev));
1918 error = tc_query_qdisc(netdev);
1921 } else if (!netdev_dev->tc->ops->class_delete) {
1924 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1926 ? netdev_dev->tc->ops->class_delete(netdev, queue)
/* Retrieves statistics for queue 'queue_id' on 'netdev' into 'stats' through
 * the active tc_ops' 'class_get_stats' callback, after locating the queue
 * with tc_find_queue().  A tc_ops without that callback takes a separate
 * branch; its body and the condition guarding the callback are not visible
 * in this excerpt. */
1932 netdev_linux_get_queue_stats(const struct netdev *netdev,
1933 unsigned int queue_id,
1934 struct netdev_queue_stats *stats)
1936 struct netdev_dev_linux *netdev_dev =
1937 netdev_dev_linux_cast(netdev_get_dev(netdev));
1940 error = tc_query_qdisc(netdev);
1943 } else if (!netdev_dev->tc->ops->class_get_stats) {
1946 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1948 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
/* Starts a netlink dump of the traffic classes of 'netdev' on 'rtnl_sock',
 * storing the dump state in '*dump'.  The request is an RTM_GETTCLASS with
 * 'tcm_parent' 0.  The return value is not visible in this excerpt; the
 * caller treats a false result as failure. */
1954 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1956 struct ofpbuf request;
1957 struct tcmsg *tcmsg;
1959 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1963 tcmsg->tcm_parent = 0;
1964 nl_dump_start(dump, rtnl_sock, &request);
/* The request buffer is released here, right after the dump is started. */
1965 ofpbuf_uninit(&request);
/* Iterates over the queues in 'netdev''s tc state.  For each queue, its
 * configuration is fetched into a scratch shash with the active tc_ops'
 * 'class_get' callback and passed to 'cb' as (queue id, details, 'aux').
 * A tc_ops without 'class_get' takes a separate branch.  How per-queue
 * errors affect the result is not visible in this excerpt. */
1970 netdev_linux_dump_queues(const struct netdev *netdev,
1971 netdev_dump_queues_cb *cb, void *aux)
1973 struct netdev_dev_linux *netdev_dev =
1974 netdev_dev_linux_cast(netdev_get_dev(netdev));
1975 struct tc_queue *queue;
1976 struct shash details;
1980 error = tc_query_qdisc(netdev);
1983 } else if (!netdev_dev->tc->ops->class_get) {
1988 shash_init(&details);
1989 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
/* Reuse one shash across iterations; empty it before each queue. */
1990 shash_clear(&details);
1992 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
1994 (*cb)(queue->queue_id, &details, aux);
1999 shash_destroy(&details);
/* Dumps per-queue statistics for 'netdev'.  A netlink class dump is started
 * with start_queue_dump() and every reply is handed to the active tc_ops'
 * 'class_dump_stats' callback together with 'cb' and 'aux'.
 *
 * Returns the status of nl_dump_done() if that is nonzero, otherwise
 * 'last_error' (its assignments are not visible in this excerpt; presumably
 * the most recent callback error). */
2005 netdev_linux_dump_queue_stats(const struct netdev *netdev,
2006 netdev_dump_queue_stats_cb *cb, void *aux)
2008 struct netdev_dev_linux *netdev_dev =
2009 netdev_dev_linux_cast(netdev_get_dev(netdev));
2010 struct nl_dump dump;
2015 error = tc_query_qdisc(netdev);
2018 } else if (!netdev_dev->tc->ops->class_dump_stats) {
2023 if (!start_queue_dump(netdev, &dump)) {
2026 while (nl_dump_next(&dump, &msg)) {
2027 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
/* A failure of the dump itself takes precedence over callback errors. */
2033 error = nl_dump_done(&dump);
2034 return error ? error : last_error;
/* Stores the IPv4 address and netmask of 'netdev_' in '*address' and
 * '*netmask'.
 *
 * The values are cached in the device: when VALID_IN4 is not set they are
 * fetched with the SIOCGIFADDR and SIOCGIFNETMASK ioctls and the flag is then
 * set.  Returns EADDRNOTAVAIL if the address is INADDR_ANY, otherwise 0 (the
 * error returns after the ioctls are not visible in this excerpt). */
2038 netdev_linux_get_in4(const struct netdev *netdev_,
2039 struct in_addr *address, struct in_addr *netmask)
2041 struct netdev_dev_linux *netdev_dev =
2042 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2044 if (!(netdev_dev->cache_valid & VALID_IN4)) {
2047 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
2048 SIOCGIFADDR, "SIOCGIFADDR");
2053 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
2054 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2059 netdev_dev->cache_valid |= VALID_IN4;
2061 *address = netdev_dev->address;
2062 *netmask = netdev_dev->netmask;
/* An all-zero address is reported as "no address assigned". */
2063 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
/* Assigns IPv4 'address' and 'netmask' to 'netdev_'.
 *
 * The address is set with SIOCSIFADDR.  The cache is then updated with both
 * values and marked valid, and, unless 'address' is INADDR_ANY, the netmask
 * is set with SIOCSIFNETMASK.
 *
 * NOTE(review): in the visible code 'netmask' is cached before
 * SIOCSIFNETMASK is attempted, so a failure of that ioctl appears to leave a
 * netmask in the cache that was never applied -- confirm whether that is
 * intended. */
2067 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2068 struct in_addr netmask)
2070 struct netdev_dev_linux *netdev_dev =
2071 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2074 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2076 netdev_dev->cache_valid |= VALID_IN4;
2077 netdev_dev->address = address;
2078 netdev_dev->netmask = netmask;
/* No netmask is set when the address is being cleared. */
2079 if (address.s_addr != INADDR_ANY) {
2080 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2081 "SIOCSIFNETMASK", netmask);
/* Parses one line of text, 'line', into an IPv6 address and an interface
 * name.  The scan format reads 16 two-digit hexadecimal bytes into
 * 'in6->s6_addr', skips four hexadecimal fields, and reads a name of at most
 * 16 characters ('ifname' must hold 16 + 1 bytes).  The scanning call and
 * the return statement are not visible in this excerpt; the caller treats a
 * nonzero result as a successful parse. */
2088 parse_if_inet6_line(const char *line,
2089 struct in6_addr *in6, char ifname[16 + 1])
2091 uint8_t *s6 = in6->s6_addr;
/* One address byte: exactly two hex digits. */
2092 #define X8 "%2"SCNx8
2094 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2095 "%*x %*x %*x %*x %16s\n",
2096 &s6[0], &s6[1], &s6[2], &s6[3],
2097 &s6[4], &s6[5], &s6[6], &s6[7],
2098 &s6[8], &s6[9], &s6[10], &s6[11],
2099 &s6[12], &s6[13], &s6[14], &s6[15],
2103 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2104 * 'in6' is non-null) and returns true. Otherwise, returns false.
 *
 * The address is cached in the device.  When VALID_IN6 is not set, the cache
 * is reset to in6addr_any and /proc/net/if_inet6 is scanned line by line for
 * an entry whose interface name matches 'netdev_'; VALID_IN6 is then set.
 *
 * NOTE(review): the return statement and any NULL check on 'in6' are not
 * visible in this excerpt -- confirm them against the wording above. */
2106 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2108 struct netdev_dev_linux *netdev_dev =
2109 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2110 if (!(netdev_dev->cache_valid & VALID_IN6)) {
/* Default to "no address" in case no matching line is found. */
2114 netdev_dev->in6 = in6addr_any;
2116 file = fopen("/proc/net/if_inet6", "r");
2118 const char *name = netdev_get_name(netdev_);
2119 while (fgets(line, sizeof line, file)) {
2120 struct in6_addr in6_tmp;
2121 char ifname[16 + 1];
2122 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2123 && !strcmp(name, ifname))
2125 netdev_dev->in6 = in6_tmp;
2131 netdev_dev->cache_valid |= VALID_IN6;
2133 *in6 = netdev_dev->in6;
/* Fills the generic socket address '*sa' with an AF_INET address for 'addr'.
 * A zeroed struct sockaddr_in is built first; '*sa' is then zeroed and the
 * sockaddr_in is copied over it. */
2138 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2140 struct sockaddr_in sin;
2141 memset(&sin, 0, sizeof sin);
2142 sin.sin_family = AF_INET;
2143 sin.sin_addr = addr;
/* Zero the destination first so no stale bytes remain in '*sa'. */
2146 memset(sa, 0, sizeof *sa);
2147 memcpy(sa, &sin, sizeof sin);
/* Issues address-setting ioctl 'ioctl_nr' on 'netdev' with IPv4 address
 * 'addr'.  The request's 'ifr_name' is set to the device name and 'ifr_addr'
 * to 'addr'; it is submitted through netdev_linux_do_ioctl(), whose result
 * is returned.  'ioctl_name' is the ioctl's name as a string. */
2151 do_set_addr(struct netdev *netdev,
2152 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2155 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2156 make_in4_sockaddr(&ifr.ifr_addr, addr);
2158 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2162 /* Adds 'router' as a default IP gateway.
 *
 * Builds a route entry with destination and genmask INADDR_ANY, gateway
 * 'router', and flags RTF_UP | RTF_GATEWAY, and submits it with the
 * SIOCADDRT ioctl on 'af_inet_sock'.  A failure is logged.  'netdev' is
 * unused. */
2164 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2166 struct in_addr any = { INADDR_ANY };
2170 memset(&rt, 0, sizeof rt);
2171 make_in4_sockaddr(&rt.rt_dst, any);
2172 make_in4_sockaddr(&rt.rt_gateway, router);
2173 make_in4_sockaddr(&rt.rt_genmask, any);
2174 rt.rt_flags = RTF_UP | RTF_GATEWAY;
/* ioctl() reports failure through errno; convert to a positive errno. */
2175 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2177 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
/* Looks up how to reach 'host' by scanning /proc/net/route.
 *
 * '*netdev_name' starts out NULL.  For a route that is up and whose masked
 * destination matches 'host', '*next_hop' is set to 0 (directly reachable)
 * or to the route's gateway, and '*netdev_name' is set to a newly allocated
 * copy of the interface name.  Lines that cannot be parsed are logged.  The
 * return statements are not visible in this excerpt. */
2183 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2186 static const char fn[] = "/proc/net/route";
2191 *netdev_name = NULL;
2192 stream = fopen(fn, "r");
2193 if (stream == NULL) {
2194 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2199 while (fgets(line, sizeof line, stream)) {
2202 ovs_be32 dest, gateway, mask;
2203 int refcnt, metric, mtu;
2204 unsigned int flags, use, window, irtt;
/* A well-formed route line yields all 11 fields. */
2207 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2209 iface, &dest, &gateway, &flags, &refcnt,
2210 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2212 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2216 if (!(flags & RTF_UP)) {
2217 /* Skip routes that aren't up. */
2221 /* The output of 'dest', 'mask', and 'gateway' were given in
2222 * network byte order, so we don't need any endian
2223 * conversions here. */
2224 if ((dest & mask) == (host->s_addr & mask)) {
2226 /* The host is directly reachable. */
2227 next_hop->s_addr = 0;
2229 /* To reach the host, we must go through a gateway. */
2230 next_hop->s_addr = gateway;
/* The copy is heap-allocated with xstrdup(). */
2232 *netdev_name = xstrdup(iface);
/* Adds driver information about 'netdev' to 'sh'.  The data is obtained with
 * an ethtool ETHTOOL_GDRVINFO query; the "driver_name", "driver_version" and
 * "firmware_version" keys are added with newly allocated copies of the
 * corresponding strings.  The condition guarding the additions and the
 * return statement are not visible in this excerpt. */
2244 netdev_linux_get_status(const struct netdev *netdev, struct shash *sh)
2246 struct ethtool_drvinfo drvinfo;
2249 memset(&drvinfo, 0, sizeof drvinfo);
/* netdev_linux_do_ethtool() takes a struct ethtool_cmd pointer, hence the
 * cast of the drvinfo buffer. */
2250 error = netdev_linux_do_ethtool(netdev_get_name(netdev),
2251 (struct ethtool_cmd *)&drvinfo,
2253 "ETHTOOL_GDRVINFO");
2255 shash_add(sh, "driver_name", xstrdup(drvinfo.driver));
2256 shash_add(sh, "driver_version", xstrdup(drvinfo.version));
2257 shash_add(sh, "firmware_version", xstrdup(drvinfo.fw_version));
2263 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2264 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2265 * returns 0. Otherwise, it returns a positive errno value; in particular,
2266 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'.
 *
 * The lookup is a SIOCGARP ioctl on 'af_inet_sock'.  Errors other than ENXIO
 * are logged. */
2268 netdev_linux_arp_lookup(const struct netdev *netdev,
2269 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2272 struct sockaddr_in sin;
2275 memset(&r, 0, sizeof r);
2276 memset(&sin, 0, sizeof sin);
2277 sin.sin_family = AF_INET;
2278 sin.sin_addr.s_addr = ip;
/* Protocol address to resolve, and the hardware address type wanted. */
2280 memcpy(&r.arp_pa, &sin, sizeof sin);
2281 r.arp_ha.sa_family = ARPHRD_ETHER;
2283 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2284 COVERAGE_INC(netdev_arp_lookup);
2285 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2287 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
/* ENXIO simply means "no entry", so it is not worth a warning. */
2288 } else if (retval != ENXIO) {
2289 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2290 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
/* Converts 'nd', a set of "enum netdev_flags" bits, for use as interface
 * flags (the result is combined with get_flags()/set_flags() values in
 * netdev_linux_update_flags()).  NETDEV_UP and NETDEV_PROMISC are examined;
 * the bits they map to are not visible in this excerpt (presumably IFF_UP
 * and IFF_PROMISC, mirroring iff_to_nd_flags() -- TODO confirm). */
2296 nd_to_iff_flags(enum netdev_flags nd)
2299 if (nd & NETDEV_UP) {
2302 if (nd & NETDEV_PROMISC) {
/* Converts interface flags 'iff' to "enum netdev_flags" bits.  IFF_PROMISC
 * maps to NETDEV_PROMISC; other mappings are not visible in this excerpt. */
2309 iff_to_nd_flags(int iff)
2311 enum netdev_flags nd = 0;
2315 if (iff & IFF_PROMISC) {
2316 nd |= NETDEV_PROMISC;
/* Clears the flags in 'off' and sets the flags in 'on' on 'netdev', storing
 * the previous flags in '*old_flagsp'.
 *
 * The current interface flags are read with get_flags().  The new value is
 * computed by clearing the bits for 'off' and then setting the bits for
 * 'on', so a flag present in both ends up set.  set_flags() is only called
 * when the value actually changes. */
2322 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2323 enum netdev_flags on, enum netdev_flags *old_flagsp)
2325 int old_flags, new_flags;
2328 error = get_flags(netdev, &old_flags);
2330 *old_flagsp = iff_to_nd_flags(old_flags);
2331 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
/* Skip the write when nothing would change. */
2332 if (new_flags != old_flags) {
2333 error = set_flags(netdev, new_flags);
/* Returns the 'change_seq' member of the device underlying 'netdev'. */
2340 netdev_linux_change_seq(const struct netdev *netdev)
2342 return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
/* Expands to the initializer shared by the "struct netdev_class" instances
 * defined below.  The instances differ only in the macro arguments: NAME,
 * and the CREATE, ENUMERATE, GET_STATS and SET_STATS callbacks.  Every other
 * slot is filled with the common netdev_linux_* implementation listed here
 * (or NULL).  The entries are positional, so their order must match the
 * member order of struct netdev_class. */
2345 #define NETDEV_LINUX_CLASS(NAME, CREATE, ENUMERATE, GET_STATS, SET_STATS) \
2349 netdev_linux_init, \
2351 netdev_linux_wait, \
2354 netdev_linux_destroy, \
2355 NULL, /* get_config */ \
2356 NULL, /* set_config */ \
2358 netdev_linux_open, \
2359 netdev_linux_close, \
2363 netdev_linux_listen, \
2364 netdev_linux_recv, \
2365 netdev_linux_recv_wait, \
2366 netdev_linux_drain, \
2368 netdev_linux_send, \
2369 netdev_linux_send_wait, \
2371 netdev_linux_set_etheraddr, \
2372 netdev_linux_get_etheraddr, \
2373 netdev_linux_get_mtu, \
2374 netdev_linux_set_mtu, \
2375 netdev_linux_get_ifindex, \
2376 netdev_linux_get_carrier, \
2377 netdev_linux_set_miimon_interval, \
2381 netdev_linux_get_features, \
2382 netdev_linux_set_advertisements, \
2383 netdev_linux_get_vlan_vid, \
2385 netdev_linux_set_policing, \
2386 netdev_linux_get_qos_types, \
2387 netdev_linux_get_qos_capabilities, \
2388 netdev_linux_get_qos, \
2389 netdev_linux_set_qos, \
2390 netdev_linux_get_queue, \
2391 netdev_linux_set_queue, \
2392 netdev_linux_delete_queue, \
2393 netdev_linux_get_queue_stats, \
2394 netdev_linux_dump_queues, \
2395 netdev_linux_dump_queue_stats, \
2397 netdev_linux_get_in4, \
2398 netdev_linux_set_in4, \
2399 netdev_linux_get_in6, \
2400 netdev_linux_add_router, \
2401 netdev_linux_get_next_hop, \
2402 netdev_linux_get_status, \
2403 netdev_linux_arp_lookup, \
2405 netdev_linux_update_flags, \
2407 netdev_linux_change_seq \
/* Class whose devices are created by netdev_linux_create(), can be
 * enumerated with netdev_linux_enumerate(), and report statistics through
 * netdev_linux_get_stats().  Setting statistics is not supported.  (The
 * class name argument is not visible in this excerpt.) */
2410 const struct netdev_class netdev_linux_class =
2413 netdev_linux_create,
2414 netdev_linux_enumerate,
2415 netdev_linux_get_stats,
2416 NULL); /* set_stats */
/* Class for tap devices: created by netdev_linux_create_tap(), not
 * enumerable, with statistics reported through netdev_pseudo_get_stats().
 * Setting statistics is not supported.  (The class name argument is not
 * visible in this excerpt.) */
2418 const struct netdev_class netdev_tap_class =
2421 netdev_linux_create_tap,
2422 NULL, /* enumerate */
2423 netdev_pseudo_get_stats,
2424 NULL); /* set_stats */
/* Class for internal devices: created by netdev_linux_create(), not
 * enumerable, with statistics reported through netdev_pseudo_get_stats() and
 * settable through netdev_vport_set_stats().  (The class name argument is
 * not visible in this excerpt.) */
2426 const struct netdev_class netdev_internal_class =
2429 netdev_linux_create,
2430 NULL, /* enumerate */
2431 netdev_pseudo_get_stats,
2432 netdev_vport_set_stats);
2434 /* HTB traffic control class. */
/* Upper bound on HTB queue numbering: htb_parse_tcmsg__() accepts class
 * minor numbers 1..HTB_N_QUEUES and maps them to queue ids
 * 0..HTB_N_QUEUES-1. */
2436 #define HTB_N_QUEUES 0xf000
/* Qdisc-level HTB state (the enclosing struct declaration lines are not
 * visible in this excerpt). */
2440 unsigned int max_rate; /* In bytes/s. */
/* Per-class (per-queue) HTB parameters, as filled in by the htb_parse_*__()
 * helpers and consumed by htb_setup_class__(). */
2444 struct tc_queue tc_queue;
2445 unsigned int min_rate; /* In bytes/s. */
2446 unsigned int max_rate; /* In bytes/s. */
2447 unsigned int burst; /* In bytes. */
2448 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the "struct htb" that embeds the tc state attached to 'netdev'.
 * The conversion uses CONTAINER_OF() on member 'tc', so this is only
 * meaningful while the device's 'tc' pointer actually belongs to a struct
 * htb. */
2452 htb_get__(const struct netdev *netdev)
2454 struct netdev_dev_linux *netdev_dev =
2455 netdev_dev_linux_cast(netdev_get_dev(netdev));
2456 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
/* Allocates a new struct htb, initializes its embedded tc state with
 * 'tc_ops_htb', records 'max_rate', and attaches it to 'netdev' as the
 * device's tc state.
 *
 * NOTE(review): 'max_rate' is uint64_t here, but the 'max_rate' members
 * visible in this file are unsigned int -- if the destination is one of
 * those, large values are truncated; confirm. */
2460 htb_install__(struct netdev *netdev, uint64_t max_rate)
2462 struct netdev_dev_linux *netdev_dev =
2463 netdev_dev_linux_cast(netdev_get_dev(netdev));
2466 htb = xmalloc(sizeof *htb);
2467 tc_init(&htb->tc, &tc_ops_htb);
2468 htb->max_rate = max_rate;
2470 netdev_dev->tc = &htb->tc;
2473 /* Create an HTB qdisc.
2475 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1".
 *
 * Any existing qdisc is deleted first.  The request is an RTM_NEWQDISC with
 * NLM_F_EXCL | NLM_F_CREATE, handle 1:0 and parent TC_H_ROOT, carrying a
 * zeroed struct tc_htb_glob (with 'rate2quantum' set to 10) as TCA_HTB_INIT
 * inside TCA_OPTIONS.  Returns the status of tc_transact(). */
2477 htb_setup_qdisc__(struct netdev *netdev)
2480 struct tc_htb_glob opt;
2481 struct ofpbuf request;
2482 struct tcmsg *tcmsg;
/* The result of the delete is not checked here. */
2484 tc_del_qdisc(netdev);
2486 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2487 NLM_F_EXCL | NLM_F_CREATE, &request);
2491 tcmsg->tcm_handle = tc_make_handle(1, 0);
2492 tcmsg->tcm_parent = TC_H_ROOT;
2494 nl_msg_put_string(&request, TCA_KIND, "htb");
2496 memset(&opt, 0, sizeof opt);
2497 opt.rate2quantum = 10;
2501 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2502 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2503 nl_msg_end_nested(&request, opt_offset);
2505 return tc_transact(&request, NULL);
2508 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2509 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>".
 *
 * The rate, ceiling, buffer sizes and priority are taken from 'class' and
 * computed against the device's MTU, which must therefore be obtainable.
 * The request is an RTM_NEWTCLASS with NLM_F_CREATE carrying TCA_HTB_PARMS
 * and the rate/ceil tables.  A failed transaction is logged with all
 * parameters. */
2511 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2512 unsigned int parent, struct htb_class *class)
2515 struct tc_htb_opt opt;
2516 struct ofpbuf request;
2517 struct tcmsg *tcmsg;
2521 error = netdev_get_mtu(netdev, &mtu);
2523 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2524 netdev_get_name(netdev));
2528 memset(&opt, 0, sizeof opt);
2529 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2530 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
/* Both buffers are derived from the same 'burst', one per rate. */
2531 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2532 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2533 opt.prio = class->priority;
2535 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2539 tcmsg->tcm_handle = handle;
2540 tcmsg->tcm_parent = parent;
2542 nl_msg_put_string(&request, TCA_KIND, "htb");
2543 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2544 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2545 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2546 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2547 nl_msg_end_nested(&request, opt_offset);
2549 error = tc_transact(&request, NULL);
2551 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2552 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2553 netdev_get_name(netdev),
2554 tc_get_major(handle), tc_get_minor(handle),
2555 tc_get_major(parent), tc_get_minor(parent),
2556 class->min_rate, class->max_rate,
2557 class->burst, class->priority, strerror(error));
2562 /* Parses Netlink attributes in 'nl_options' for HTB parameters and stores a
2563 * description of them into 'class'.  The description complies with the
2564 * specification given in the vswitch database documentation for linux-htb
 * queue details.
 *
 * TCA_HTB_PARMS is mandatory and must be at least as large as struct
 * tc_htb_opt.  From it, 'min_rate' is taken from the rate, 'max_rate' from
 * the ceiling, 'burst' is converted from the buffer ticks at the given rate,
 * and 'priority' from 'prio'.  A parse failure is logged; the return
 * statements are not visible in this excerpt. */
2567 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2569 static const struct nl_policy tca_htb_policy[] = {
2570 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2571 .min_len = sizeof(struct tc_htb_opt) },
2574 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2575 const struct tc_htb_opt *htb;
2577 if (!nl_parse_nested(nl_options, tca_htb_policy,
2578 attrs, ARRAY_SIZE(tca_htb_policy))) {
2579 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2583 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2584 class->min_rate = htb->rate.rate;
2585 class->max_rate = htb->ceil.rate;
2586 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2587 class->priority = htb->prio;
2592 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2593 struct htb_class *options,
2594 struct netdev_queue_stats *stats)
2596 struct nlattr *nl_options;
2597 unsigned int handle;
2600 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2601 if (!error && queue_id) {
2602 unsigned int major = tc_get_major(handle);
2603 unsigned int minor = tc_get_minor(handle);
2604 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2605 *queue_id = minor - 1;
2610 if (!error && options) {
2611 error = htb_parse_tca_options__(nl_options, options);
2617 htb_parse_qdisc_details__(struct netdev *netdev,
2618 const struct shash *details, struct htb_class *hc)
2620 const char *max_rate_s;
2622 max_rate_s = shash_find_data(details, "max-rate");
2623 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2624 if (!hc->max_rate) {
2627 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2628 hc->max_rate = netdev_features_to_bps(current) / 8;
2630 hc->min_rate = hc->max_rate;
2636 htb_parse_class_details__(struct netdev *netdev,
2637 const struct shash *details, struct htb_class *hc)
2639 const struct htb *htb = htb_get__(netdev);
2640 const char *min_rate_s = shash_find_data(details, "min-rate");
2641 const char *max_rate_s = shash_find_data(details, "max-rate");
2642 const char *burst_s = shash_find_data(details, "burst");
2643 const char *priority_s = shash_find_data(details, "priority");
2646 error = netdev_get_mtu(netdev, &mtu);
2648 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2649 netdev_get_name(netdev));
2653 /* HTB requires at least an mtu sized min-rate to send any traffic even
2654 * on uncongested links. */
2655 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2656 hc->min_rate = MAX(hc->min_rate, mtu);
2657 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2660 hc->max_rate = (max_rate_s
2661 ? strtoull(max_rate_s, NULL, 10) / 8
2663 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2664 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2668 * According to hints in the documentation that I've read, it is important
2669 * that 'burst' be at least as big as the largest frame that might be
2670 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2671 * but having it a bit too small is a problem. Since netdev_get_mtu()
2672 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2673 * the MTU. We actually add 64, instead of 14, as a guard against
2674 * additional headers get tacked on somewhere that we're not aware of. */
2675 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2676 hc->burst = MAX(hc->burst, mtu + 64);
2679 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Asks the kernel about the class 'handle' with parent 'parent' on 'netdev'
 * and parses the reply as an HTB class.  'options' and 'stats' are handed to
 * htb_parse_tcmsg__() and may each be null.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
htb_query_class__(const struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error = tc_query_class(netdev, handle, parent, &reply);

    if (error) {
        return error;
    }

    error = htb_parse_tcmsg__(reply, NULL, options, stats);
    ofpbuf_delete(reply);
    return error;
}
2701 htb_tc_install(struct netdev *netdev, const struct shash *details)
2705 error = htb_setup_qdisc__(netdev);
2707 struct htb_class hc;
2709 htb_parse_qdisc_details__(netdev, details, &hc);
2710 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2711 tc_make_handle(1, 0), &hc);
2713 htb_install__(netdev, hc.max_rate);
2719 static struct htb_class *
2720 htb_class_cast__(const struct tc_queue *queue)
2722 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2726 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2727 const struct htb_class *hc)
2729 struct htb *htb = htb_get__(netdev);
2730 size_t hash = hash_int(queue_id, 0);
2731 struct tc_queue *queue;
2732 struct htb_class *hcp;
2734 queue = tc_find_queue__(netdev, queue_id, hash);
2736 hcp = htb_class_cast__(queue);
2738 hcp = xmalloc(sizeof *hcp);
2739 queue = &hcp->tc_queue;
2740 queue->queue_id = queue_id;
2741 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2744 hcp->min_rate = hc->min_rate;
2745 hcp->max_rate = hc->max_rate;
2746 hcp->burst = hc->burst;
2747 hcp->priority = hc->priority;
2751 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2754 struct nl_dump dump;
2755 struct htb_class hc;
2757 /* Get qdisc options. */
2759 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2760 htb_install__(netdev, hc.max_rate);
2763 if (!start_queue_dump(netdev, &dump)) {
2766 while (nl_dump_next(&dump, &msg)) {
2767 unsigned int queue_id;
2769 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2770 htb_update_queue__(netdev, queue_id, &hc);
2773 nl_dump_done(&dump);
2779 htb_tc_destroy(struct tc *tc)
2781 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2782 struct htb_class *hc, *next;
2784 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2785 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2793 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2795 const struct htb *htb = htb_get__(netdev);
2796 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
2801 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2803 struct htb_class hc;
2806 htb_parse_qdisc_details__(netdev, details, &hc);
2807 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2808 tc_make_handle(1, 0), &hc);
2810 htb_get__(netdev)->max_rate = hc.max_rate;
2816 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2817 const struct tc_queue *queue, struct shash *details)
2819 const struct htb_class *hc = htb_class_cast__(queue);
2821 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2822 if (hc->min_rate != hc->max_rate) {
2823 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2825 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2827 shash_add(details, "priority", xasprintf("%u", hc->priority));
2833 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2834 const struct shash *details)
2836 struct htb_class hc;
2839 error = htb_parse_class_details__(netdev, details, &hc);
2844 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2845 tc_make_handle(1, 0xfffe), &hc);
2850 htb_update_queue__(netdev, queue_id, &hc);
2855 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2857 struct htb_class *hc = htb_class_cast__(queue);
2858 struct htb *htb = htb_get__(netdev);
2861 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2863 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2870 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2871 struct netdev_queue_stats *stats)
2873 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2874 tc_make_handle(1, 0xfffe), NULL, stats);
2878 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2879 const struct ofpbuf *nlmsg,
2880 netdev_dump_queue_stats_cb *cb, void *aux)
2882 struct netdev_queue_stats stats;
2883 unsigned int handle, major, minor;
2886 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2891 major = tc_get_major(handle);
2892 minor = tc_get_minor(handle);
2893 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2894 (*cb)(minor - 1, &stats, aux);
2899 static const struct tc_ops tc_ops_htb = {
2900 "htb", /* linux_name */
2901 "linux-htb", /* ovs_name */
2902 HTB_N_QUEUES, /* n_queues */
2911 htb_class_get_stats,
2912 htb_class_dump_stats
2915 /* "linux-hfsc" traffic control class. */
2917 #define HFSC_N_QUEUES 0xf000
2925 struct tc_queue tc_queue;
2930 static struct hfsc *
2931 hfsc_get__(const struct netdev *netdev)
2933 struct netdev_dev_linux *netdev_dev;
2934 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2935 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
2938 static struct hfsc_class *
2939 hfsc_class_cast__(const struct tc_queue *queue)
2941 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
2945 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2947 struct netdev_dev_linux * netdev_dev;
2950 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2951 hfsc = xmalloc(sizeof *hfsc);
2952 tc_init(&hfsc->tc, &tc_ops_hfsc);
2953 hfsc->max_rate = max_rate;
2954 netdev_dev->tc = &hfsc->tc;
2958 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2959 const struct hfsc_class *hc)
2963 struct hfsc_class *hcp;
2964 struct tc_queue *queue;
2966 hfsc = hfsc_get__(netdev);
2967 hash = hash_int(queue_id, 0);
2969 queue = tc_find_queue__(netdev, queue_id, hash);
2971 hcp = hfsc_class_cast__(queue);
2973 hcp = xmalloc(sizeof *hcp);
2974 queue = &hcp->tc_queue;
2975 queue->queue_id = queue_id;
2976 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
2979 hcp->min_rate = hc->min_rate;
2980 hcp->max_rate = hc->max_rate;
2984 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2986 const struct tc_service_curve *rsc, *fsc, *usc;
2987 static const struct nl_policy tca_hfsc_policy[] = {
2989 .type = NL_A_UNSPEC,
2991 .min_len = sizeof(struct tc_service_curve),
2994 .type = NL_A_UNSPEC,
2996 .min_len = sizeof(struct tc_service_curve),
2999 .type = NL_A_UNSPEC,
3001 .min_len = sizeof(struct tc_service_curve),
3004 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3006 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3007 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3008 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3012 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3013 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3014 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3016 if (rsc->m1 != 0 || rsc->d != 0 ||
3017 fsc->m1 != 0 || fsc->d != 0 ||
3018 usc->m1 != 0 || usc->d != 0) {
3019 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3020 "Non-linear service curves are not supported.");
3024 if (rsc->m2 != fsc->m2) {
3025 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3026 "Real-time service curves are not supported ");
3030 if (rsc->m2 > usc->m2) {
3031 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3032 "Min-rate service curve is greater than "
3033 "the max-rate service curve.");
3037 class->min_rate = fsc->m2;
3038 class->max_rate = usc->m2;
3043 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3044 struct hfsc_class *options,
3045 struct netdev_queue_stats *stats)
3048 unsigned int handle;
3049 struct nlattr *nl_options;
3051 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3057 unsigned int major, minor;
3059 major = tc_get_major(handle);
3060 minor = tc_get_minor(handle);
3061 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3062 *queue_id = minor - 1;
3069 error = hfsc_parse_tca_options__(nl_options, options);
/* Asks the kernel about the class 'handle' with parent 'parent' on 'netdev'
 * and parses the reply as an HFSC class.  'options' and 'stats' are handed to
 * hfsc_parse_tcmsg__() and may each be null.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error = tc_query_class(netdev, handle, parent, &reply);

    if (!error) {
        error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
        ofpbuf_delete(reply);
    }
    return error;
}
3094 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
3095 struct hfsc_class *class)
3098 const char *max_rate_s;
3100 max_rate_s = shash_find_data(details, "max-rate");
3101 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3106 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
3107 max_rate = netdev_features_to_bps(current) / 8;
3110 class->min_rate = max_rate;
3111 class->max_rate = max_rate;
3115 hfsc_parse_class_details__(struct netdev *netdev,
3116 const struct shash *details,
3117 struct hfsc_class * class)
3119 const struct hfsc *hfsc;
3120 uint32_t min_rate, max_rate;
3121 const char *min_rate_s, *max_rate_s;
3123 hfsc = hfsc_get__(netdev);
3124 min_rate_s = shash_find_data(details, "min-rate");
3125 max_rate_s = shash_find_data(details, "max-rate");
3127 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3128 min_rate = MAX(min_rate, 1);
3129 min_rate = MIN(min_rate, hfsc->max_rate);
3131 max_rate = (max_rate_s
3132 ? strtoull(max_rate_s, NULL, 10) / 8
3134 max_rate = MAX(max_rate, min_rate);
3135 max_rate = MIN(max_rate, hfsc->max_rate);
3137 class->min_rate = min_rate;
3138 class->max_rate = max_rate;
3143 /* Create an HFSC qdisc.
3145 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3147 hfsc_setup_qdisc__(struct netdev * netdev)
3149 struct tcmsg *tcmsg;
3150 struct ofpbuf request;
3151 struct tc_hfsc_qopt opt;
3153 tc_del_qdisc(netdev);
3155 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3156 NLM_F_EXCL | NLM_F_CREATE, &request);
3162 tcmsg->tcm_handle = tc_make_handle(1, 0);
3163 tcmsg->tcm_parent = TC_H_ROOT;
3165 memset(&opt, 0, sizeof opt);
3168 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3169 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3171 return tc_transact(&request, NULL);
3174 /* Create an HFSC class.
3176 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3177 * sc rate <min_rate> ul rate <max_rate>" */
3179 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3180 unsigned int parent, struct hfsc_class *class)
3184 struct tcmsg *tcmsg;
3185 struct ofpbuf request;
3186 struct tc_service_curve min, max;
3188 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3194 tcmsg->tcm_handle = handle;
3195 tcmsg->tcm_parent = parent;
3199 min.m2 = class->min_rate;
3203 max.m2 = class->max_rate;
3205 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3206 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3207 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3208 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3209 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3210 nl_msg_end_nested(&request, opt_offset);
3212 error = tc_transact(&request, NULL);
3214 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3215 "min-rate %ubps, max-rate %ubps (%s)",
3216 netdev_get_name(netdev),
3217 tc_get_major(handle), tc_get_minor(handle),
3218 tc_get_major(parent), tc_get_minor(parent),
3219 class->min_rate, class->max_rate, strerror(error));
3226 hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3229 struct hfsc_class class;
3231 error = hfsc_setup_qdisc__(netdev);
3237 hfsc_parse_qdisc_details__(netdev, details, &class);
3238 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3239 tc_make_handle(1, 0), &class);
3245 hfsc_install__(netdev, class.max_rate);
3250 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3253 struct nl_dump dump;
3254 struct hfsc_class hc;
3257 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3258 hfsc_install__(netdev, hc.max_rate);
3260 if (!start_queue_dump(netdev, &dump)) {
3264 while (nl_dump_next(&dump, &msg)) {
3265 unsigned int queue_id;
3267 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3268 hfsc_update_queue__(netdev, queue_id, &hc);
3272 nl_dump_done(&dump);
3277 hfsc_tc_destroy(struct tc *tc)
3280 struct hfsc_class *hc, *next;
3282 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3284 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3285 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3294 hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3296 const struct hfsc *hfsc;
3297 hfsc = hfsc_get__(netdev);
3298 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
3303 hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3306 struct hfsc_class class;
3308 hfsc_parse_qdisc_details__(netdev, details, &class);
3309 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3310 tc_make_handle(1, 0), &class);
3313 hfsc_get__(netdev)->max_rate = class.max_rate;
3320 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3321 const struct tc_queue *queue, struct shash *details)
3323 const struct hfsc_class *hc;
3325 hc = hfsc_class_cast__(queue);
3326 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3327 if (hc->min_rate != hc->max_rate) {
3328 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
3334 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3335 const struct shash *details)
3338 struct hfsc_class class;
3340 error = hfsc_parse_class_details__(netdev, details, &class);
3345 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3346 tc_make_handle(1, 0xfffe), &class);
3351 hfsc_update_queue__(netdev, queue_id, &class);
3356 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3360 struct hfsc_class *hc;
3362 hc = hfsc_class_cast__(queue);
3363 hfsc = hfsc_get__(netdev);
3365 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3367 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3374 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3375 struct netdev_queue_stats *stats)
3377 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3378 tc_make_handle(1, 0xfffe), NULL, stats);
3382 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3383 const struct ofpbuf *nlmsg,
3384 netdev_dump_queue_stats_cb *cb, void *aux)
3386 struct netdev_queue_stats stats;
3387 unsigned int handle, major, minor;
3390 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3395 major = tc_get_major(handle);
3396 minor = tc_get_minor(handle);
3397 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3398 (*cb)(minor - 1, &stats, aux);
3403 static const struct tc_ops tc_ops_hfsc = {
3404 "hfsc", /* linux_name */
3405 "linux-hfsc", /* ovs_name */
3406 HFSC_N_QUEUES, /* n_queues */
3407 hfsc_tc_install, /* tc_install */
3408 hfsc_tc_load, /* tc_load */
3409 hfsc_tc_destroy, /* tc_destroy */
3410 hfsc_qdisc_get, /* qdisc_get */
3411 hfsc_qdisc_set, /* qdisc_set */
3412 hfsc_class_get, /* class_get */
3413 hfsc_class_set, /* class_set */
3414 hfsc_class_delete, /* class_delete */
3415 hfsc_class_get_stats, /* class_get_stats */
3416 hfsc_class_dump_stats /* class_dump_stats */
3419 /* "linux-default" traffic control class.
3421 * This class represents the default, unnamed Linux qdisc. It corresponds to
3422 * the "" (empty string) QoS type in the OVS database. */
3425 default_install__(struct netdev *netdev)
3427 struct netdev_dev_linux *netdev_dev =
3428 netdev_dev_linux_cast(netdev_get_dev(netdev));
3429 static struct tc *tc;
3432 tc = xmalloc(sizeof *tc);
3433 tc_init(tc, &tc_ops_default);
3435 netdev_dev->tc = tc;
3439 default_tc_install(struct netdev *netdev,
3440 const struct shash *details OVS_UNUSED)
3442 default_install__(netdev);
3447 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3449 default_install__(netdev);
3453 static const struct tc_ops tc_ops_default = {
3454 NULL, /* linux_name */
3459 NULL, /* tc_destroy */
3460 NULL, /* qdisc_get */
3461 NULL, /* qdisc_set */
3462 NULL, /* class_get */
3463 NULL, /* class_set */
3464 NULL, /* class_delete */
3465 NULL, /* class_get_stats */
3466 NULL /* class_dump_stats */
3469 /* "linux-other" traffic control class.
3474 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3476 struct netdev_dev_linux *netdev_dev =
3477 netdev_dev_linux_cast(netdev_get_dev(netdev));
3478 static struct tc *tc;
3481 tc = xmalloc(sizeof *tc);
3482 tc_init(tc, &tc_ops_other);
3484 netdev_dev->tc = tc;
3488 static const struct tc_ops tc_ops_other = {
3489 NULL, /* linux_name */
3490 "linux-other", /* ovs_name */
3492 NULL, /* tc_install */
3494 NULL, /* tc_destroy */
3495 NULL, /* qdisc_get */
3496 NULL, /* qdisc_set */
3497 NULL, /* class_get */
3498 NULL, /* class_set */
3499 NULL, /* class_delete */
3500 NULL, /* class_get_stats */
3501 NULL /* class_dump_stats */
/* Traffic control. */

/* Number of kernel "tc" ticks per second. */
static double ticks_per_s;

/* Number of kernel "jiffies" per second.  This is used for the purpose of
 * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
 * one jiffy's worth of data.
 *
 * There are two possibilities here:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
 *      approximate range of 100 to 1024.  That means that we really need to
 *      make sure that the qdisc can buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
 *      has finely granular timers and there's no need to fudge additional room
 *      for buffers.  (There's no extra effort needed to implement that: the
 *      large 'buffer_hz' is used as a divisor, so practically any number will
 *      come out as 0 in the division.  Small integer results in the case of
 *      really high dividends won't have any real effect anyhow.)
 *
 * A value of 0 is what the accessors below test to decide whether
 * read_psched() still needs to run. */
static unsigned int buffer_hz;
/* Returns the tc handle 'major':'minor', with 'major' placed in the upper 16
 * bits. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int major_bits = major << 16;

    return TC_H_MAKE(major_bits, minor);
}
/* Returns the major number of tc handle 'handle', shifted down from the
 * upper 16 bits. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
/* Returns the minor number of tc handle 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor = TC_H_MIN(handle);

    return minor;
}
3549 static struct tcmsg *
3550 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3551 struct ofpbuf *request)
3553 struct tcmsg *tcmsg;
3557 error = get_ifindex(netdev, &ifindex);
3562 ofpbuf_init(request, 512);
3563 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3564 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3565 tcmsg->tcm_family = AF_UNSPEC;
3566 tcmsg->tcm_ifindex = ifindex;
3567 /* Caller should fill in tcmsg->tcm_handle. */
3568 /* Caller should fill in tcmsg->tcm_parent. */
3574 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3576 int error = nl_sock_transact(rtnl_sock, request, replyp);
3577 ofpbuf_uninit(request);
3584 /* The values in psched are not individually very meaningful, but they are
3585 * important. The tables below show some values seen in the wild.
3589 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3590 * (Before that, there are hints that it was 1000000000.)
3592 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3596 * -----------------------------------
3597 * [1] 000c8000 000f4240 000f4240 00000064
3598 * [2] 000003e8 00000400 000f4240 3b9aca00
3599 * [3] 000003e8 00000400 000f4240 3b9aca00
3600 * [4] 000003e8 00000400 000f4240 00000064
3601 * [5] 000003e8 00000040 000f4240 3b9aca00
3602 * [6] 000003e8 00000040 000f4240 000000f9
3604 * a b c d ticks_per_s buffer_hz
3605 * ------- --------- ---------- ------------- ----------- -------------
3606 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3607 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3608 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3609 * [4] 1,000 1,024 1,000,000 100 976,562 100
3610 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3611 * [6] 1,000 64 1,000,000 249 15,625,000 249
3613 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3614 * [2] 2.6.26-1-686-bigmem from Debian lenny
3615 * [3] 2.6.26-2-sparc64 from Debian lenny
3616 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3617 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3618 * [6] 2.6.34 from kernel.org on KVM
3620 static const char fn[] = "/proc/net/psched";
3621 unsigned int a, b, c, d;
3627 stream = fopen(fn, "r");
3629 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3633 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3634 VLOG_WARN("%s: read failed", fn);
3638 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3642 VLOG_WARN("%s: invalid scheduler parameters", fn);
3646 ticks_per_s = (double) a * c / b;
3650 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3653 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3656 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3657 * rate of 'rate' bytes per second. */
3659 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3664 return (rate * ticks) / ticks_per_s;
3667 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3668 * rate of 'rate' bytes per second. */
3670 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3675 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3678 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3679 * a transmission rate of 'rate' bytes per second. */
3681 tc_buffer_per_jiffy(unsigned int rate)
3686 return rate / buffer_hz;
3689 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3690 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3691 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3692 * stores NULL into it if it is absent.
3694 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3697 * Returns 0 if successful, otherwise a positive errno value. */
3699 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3700 struct nlattr **options)
3702 static const struct nl_policy tca_policy[] = {
3703 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3704 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3706 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3708 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3709 tca_policy, ta, ARRAY_SIZE(ta))) {
3710 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3715 *kind = nl_attr_get_string(ta[TCA_KIND]);
3719 *options = ta[TCA_OPTIONS];
3734 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3735 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3736 * into '*options', and its queue statistics into '*stats'. Any of the output
3737 * arguments may be null.
3739 * Returns 0 if successful, otherwise a positive errno value. */
3741 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3742 struct nlattr **options, struct netdev_queue_stats *stats)
3744 static const struct nl_policy tca_policy[] = {
3745 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3746 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3748 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3750 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3751 tca_policy, ta, ARRAY_SIZE(ta))) {
3752 VLOG_WARN_RL(&rl, "failed to parse class message");
3757 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3758 *handlep = tc->tcm_handle;
3762 *options = ta[TCA_OPTIONS];
3766 const struct gnet_stats_queue *gsq;
3767 struct gnet_stats_basic gsb;
3769 static const struct nl_policy stats_policy[] = {
3770 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3771 .min_len = sizeof gsb },
3772 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3773 .min_len = sizeof *gsq },
3775 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3777 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3778 sa, ARRAY_SIZE(sa))) {
3779 VLOG_WARN_RL(&rl, "failed to parse class stats");
3783 /* Alignment issues screw up the length of struct gnet_stats_basic on
3784 * some arch/bitsize combinations. Newer versions of Linux have a
3785 * struct gnet_stats_basic_packed, but we can't depend on that. The
3786 * easiest thing to do is just to make a copy. */
3787 memset(&gsb, 0, sizeof gsb);
3788 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3789 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3790 stats->tx_bytes = gsb.bytes;
3791 stats->tx_packets = gsb.packets;
3793 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3794 stats->tx_errors = gsq->drops;
3804 memset(stats, 0, sizeof *stats);
3809 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3812 tc_query_class(const struct netdev *netdev,
3813 unsigned int handle, unsigned int parent,
3814 struct ofpbuf **replyp)
3816 struct ofpbuf request;
3817 struct tcmsg *tcmsg;
3820 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3824 tcmsg->tcm_handle = handle;
3825 tcmsg->tcm_parent = parent;
3827 error = tc_transact(&request, replyp);
3829 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3830 netdev_get_name(netdev),
3831 tc_get_major(handle), tc_get_minor(handle),
3832 tc_get_major(parent), tc_get_minor(parent),
3838 /* Equivalent to "tc class del dev <name> handle <handle>". */
3840 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3842 struct ofpbuf request;
3843 struct tcmsg *tcmsg;
3846 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3850 tcmsg->tcm_handle = handle;
3851 tcmsg->tcm_parent = 0;
3853 error = tc_transact(&request, NULL);
3855 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3856 netdev_get_name(netdev),
3857 tc_get_major(handle), tc_get_minor(handle),
3863 /* Equivalent to "tc qdisc del dev <name> root". */
3865 tc_del_qdisc(struct netdev *netdev)
3867 struct netdev_dev_linux *netdev_dev =
3868 netdev_dev_linux_cast(netdev_get_dev(netdev));
3869 struct ofpbuf request;
3870 struct tcmsg *tcmsg;
3873 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
3877 tcmsg->tcm_handle = tc_make_handle(1, 0);
3878 tcmsg->tcm_parent = TC_H_ROOT;
3880 error = tc_transact(&request, NULL);
3881 if (error == EINVAL) {
3882 /* EINVAL probably means that the default qdisc was in use, in which
3883 * case we've accomplished our purpose. */
3886 if (!error && netdev_dev->tc) {
3887 if (netdev_dev->tc->ops->tc_destroy) {
3888 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3890 netdev_dev->tc = NULL;
3895 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3896 * kernel to determine what they are. Returns 0 if successful, otherwise a
3897 * positive errno value. */
3899 tc_query_qdisc(const struct netdev *netdev)
3901 struct netdev_dev_linux *netdev_dev =
3902 netdev_dev_linux_cast(netdev_get_dev(netdev));
3903 struct ofpbuf request, *qdisc;
3904 const struct tc_ops *ops;
3905 struct tcmsg *tcmsg;
/* A non-null 'netdev_dev->tc' is taken to mean the qdisc is already known.
 * NOTE(review): the body of this branch is not visible in this view;
 * presumably it returns early without querying the kernel. */
3909 if (netdev_dev->tc) {
3913 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3914 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3915 * 2.6.35 without that fix backported to it.
3917 * To avoid the OOPS, we must not make a request that would attempt to dump
3918 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3919 * few others. There are a few ways that I can see to do this, but most of
3920 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3921 * technique chosen here is to assume that any non-default qdisc that we
3922 * create will have a class with handle 1:0. The built-in qdiscs only have
3923 * a class with handle 0:0.
3925 * We could check for Linux 2.6.35+ and use a more straightforward method
3927 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
3931 tcmsg->tcm_handle = tc_make_handle(1, 0);
3932 tcmsg->tcm_parent = 0;
3934 /* Figure out what tc class to instantiate. */
3935 error = tc_transact(&request, &qdisc);
/* The reply is parsed only for its qdisc kind; NULL is passed for the
 * third argument of tc_parse_qdisc(). */
3939 error = tc_parse_qdisc(qdisc, &kind, NULL);
3941 ops = &tc_ops_other;
/* Map the kernel's qdisc name to one of our tc implementations. */
3943 ops = tc_lookup_linux_name(kind);
3945 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3946 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
/* Unrecognized kind: fall back to the generic "other" implementation. */
3948 ops = &tc_ops_other;
3951 } else if (error == ENOENT) {
3952 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3953 * other entity that doesn't have a handle 1:0. We will assume
3954 * that it's the system default qdisc. */
3955 ops = &tc_ops_default;
3958 /* Who knows? Maybe the device got deleted. */
3959 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3960 netdev_get_name(netdev), strerror(error));
3961 ops = &tc_ops_other;
3964 /* Instantiate it. */
3965 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
/* tc_load() must leave 'netdev_dev->tc' non-null exactly when it reports
 * success. */
3966 assert((load_error == 0) == (netdev_dev->tc != NULL));
3967 ofpbuf_delete(qdisc);
/* An error from the query takes precedence over an error from the load. */
3969 return error ? error : load_error;
3972 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3973 approximate the time to transmit packets of various lengths. For an MTU of
3974 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3975 represents two possible packet lengths; for a MTU of 513 through 1024, four
3976 possible lengths; and so on.
3978 Returns, for the specified 'mtu', the number of bits that packet lengths
3979 need to be shifted right to fit within such a 256-entry table. */
3981 tc_calc_cell_log(unsigned int mtu)
/* Substitute the maximum Ethernet payload for 'mtu'.
 * NOTE(review): the condition guarding this assignment is not visible in
 * this view; presumably it applies only when the caller passed no MTU. */
3986 mtu = ETH_PAYLOAD_MAX;
/* Add the Ethernet and VLAN header lengths on top of the payload size. */
3988 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
/* Count one shift per iteration while 'mtu' is still 256 or more.
 * NOTE(review): the loop body is not visible here; presumably it halves
 * 'mtu' each time around. */
3990 for (cell_log = 0; mtu >= 256; cell_log++) {
3997 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU of 'mtu'. */
4000 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
/* Start from an all-zeros ratespec so that every field not assigned
 * explicitly is 0. */
4002 memset(rate, 0, sizeof *rate);
/* Shift count that makes packet lengths for this MTU fit a 256-entry rtab. */
4003 rate->cell_log = tc_calc_cell_log(mtu);
4004 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4005 /* rate->cell_align = 0; */ /* distro headers. */
/* 'mpu' is used by tc_put_rtab() as a lower bound on the packet size of each
 * table entry. */
4006 rate->mpu = ETH_TOTAL_MIN;
4010 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4011 * attribute of the specified "type".
4013 * See tc_calc_cell_log() above for a description of "rtab"s. */
4015 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
/* Reserve TC_RTAB_SIZE bytes of attribute payload in 'msg'; the loop below
 * writes every entry of it. */
4020 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4021 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
/* Entry 'i' is computed for a packet of (i + 1) << cell_log bytes. */
4022 unsigned packet_size = (i + 1) << rate->cell_log;
/* Never use a size below the ratespec's 'mpu'. */
4023 if (packet_size < rate->mpu) {
4024 packet_size = rate->mpu;
/* Store the result of converting 'packet_size' bytes at 'rate->rate' into
 * ticks. */
4026 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
4030 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
4031 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
4032 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of 0 is fine.) */
4035 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
/* Smallest burst that will be used: tc_buffer_per_jiffy(Bps) plus one MTU. */
4037 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
/* Use the requested burst unless it is smaller than 'min_burst', then
 * convert the chosen byte count to ticks at rate 'Bps'. */
4038 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
4041 /* Copies 'src' into 'dst', performing format conversion in the process. */
4043 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4044 const struct rtnl_link_stats *src)
/* Every field is copied one-to-one by name.
 * NOTE(review): both struct definitions are outside this view; presumably
 * the "format conversion" is the implicit conversion between their field
 * types. */
/* Packet, byte, error and drop totals. */
4046 dst->rx_packets = src->rx_packets;
4047 dst->tx_packets = src->tx_packets;
4048 dst->rx_bytes = src->rx_bytes;
4049 dst->tx_bytes = src->tx_bytes;
4050 dst->rx_errors = src->rx_errors;
4051 dst->tx_errors = src->tx_errors;
4052 dst->rx_dropped = src->rx_dropped;
4053 dst->tx_dropped = src->tx_dropped;
4054 dst->multicast = src->multicast;
4055 dst->collisions = src->collisions;
/* Detailed receive error counters. */
4056 dst->rx_length_errors = src->rx_length_errors;
4057 dst->rx_over_errors = src->rx_over_errors;
4058 dst->rx_crc_errors = src->rx_crc_errors;
4059 dst->rx_frame_errors = src->rx_frame_errors;
4060 dst->rx_fifo_errors = src->rx_fifo_errors;
4061 dst->rx_missed_errors = src->rx_missed_errors;
/* Detailed transmit error counters. */
4062 dst->tx_aborted_errors = src->tx_aborted_errors;
4063 dst->tx_carrier_errors = src->tx_carrier_errors;
4064 dst->tx_fifo_errors = src->tx_fifo_errors;
4065 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4066 dst->tx_window_errors = src->tx_window_errors;
4070 /* Utility functions. */
/* Requests the link information for the interface with index 'ifindex' via
 * an RTM_GETLINK Netlink transaction and converts the IFLA_STATS attribute
 * of the reply into '*stats'.
 * NOTE(review): the return statements are not visible in this view;
 * presumably the result is 0 or a positive errno value like the other
 * helpers in this file. */
4073 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4075 /* Policy for RTNLGRP_LINK messages.
4077 * There are *many* more fields in these messages, but currently we only
4078 * care about these fields. */
4079 static const struct nl_policy rtnlgrp_link_policy[] = {
4080 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4081 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4082 .min_len = sizeof(struct rtnl_link_stats) },
4085 struct ofpbuf request;
4086 struct ofpbuf *reply;
4087 struct ifinfomsg *ifi;
4088 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
/* Build an RTM_GETLINK request whose ifinfomsg is all zeros except for the
 * address family and the interface index. */
4091 ofpbuf_init(&request, 0);
4092 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4093 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4094 ifi->ifi_family = PF_UNSPEC;
4095 ifi->ifi_index = ifindex;
/* Run the transaction; the request buffer is released right afterwards,
 * whatever the outcome. */
4096 error = nl_sock_transact(rtnl_sock, &request, &reply);
4097 ofpbuf_uninit(&request);
/* Parse the attributes that follow the Netlink header and the ifinfomsg.
 * On a parse failure the reply is freed. */
4102 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4103 rtnlgrp_link_policy,
4104 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4105 ofpbuf_delete(reply);
/* IFLA_STATS is optional in the policy above, so a successful parse does not
 * guarantee that it is present. */
4109 if (!attrs[IFLA_STATS]) {
4110 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4111 ofpbuf_delete(reply);
/* Convert the attribute payload into '*stats' before the reply that holds it
 * is freed. */
4115 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4117 ofpbuf_delete(reply);
/* Reads "/proc/net/dev" line by line, looking for the entry whose device
 * name equals 'netdev_name', and fills '*stats' from it.
 * NOTE(review): several lines of this function (declarations, part of the
 * sscanf() call, the return statements) are not visible in this view. */
4123 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4125 static const char fn[] = "/proc/net/dev";
4130 stream = fopen(fn, "r");
/* Logged when the file could not be opened. */
4132 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4137 while (fgets(line, sizeof line, stream)) {
4140 #define X64 "%"SCNu64
4143 X64 X64 X64 X64 X64 X64 X64 "%*u"
4144 X64 X64 X64 X64 X64 X64 X64 "%*u",
4150 &stats->rx_fifo_errors,
4151 &stats->rx_frame_errors,
4157 &stats->tx_fifo_errors,
4159 &stats->tx_carrier_errors) != 15) {
/* The scan did not produce all 15 expected conversions for this line. */
4160 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4161 } else if (!strcmp(devname, netdev_name)) {
/* This line is for the requested device.  The counters below are set to
 * UINT64_MAX rather than scanned; presumably the file has no column for
 * them and UINT64_MAX marks them as unavailable -- TODO confirm. */
4162 stats->rx_length_errors = UINT64_MAX;
4163 stats->rx_over_errors = UINT64_MAX;
4164 stats->rx_crc_errors = UINT64_MAX;
4165 stats->rx_missed_errors = UINT64_MAX;
4166 stats->tx_aborted_errors = UINT64_MAX;
4167 stats->tx_heartbeat_errors = UINT64_MAX;
4168 stats->tx_window_errors = UINT64_MAX;
/* Logged when no entry for 'netdev_name' was found (presumably after the
 * loop has consumed the whole file). */
4174 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Queries the interface flags of 'netdev' with a SIOCGIFFLAGS ioctl issued
 * through netdev_linux_do_ioctl() and stores them in '*flags'.
 * NOTE(review): the return statement is not visible in this view;
 * presumably 'error' is returned. */
4180 get_flags(const struct netdev *netdev, int *flags)
4185 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
/* Hand the flags from the ifreq back to the caller. */
4187 *flags = ifr.ifr_flags;
/* Sets the interface flags of 'netdev' to 'flags' with a SIOCSIFFLAGS ioctl
 * and returns whatever netdev_linux_do_ioctl() returns. */
4192 set_flags(struct netdev *netdev, int flags)
4196 ifr.ifr_flags = flags;
4197 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
/* Asks the kernel, via a SIOCGIFINDEX ioctl on 'af_inet_sock', for the
 * interface index of the device named 'netdev_name' and returns it.
 * A failed ioctl is logged with a rate-limited warning.
 * NOTE(review): the value returned on failure is not visible in this view. */
4202 do_get_ifindex(const char *netdev_name)
4206 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4207 COVERAGE_INC(netdev_get_ifindex);
4208 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4209 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4210 netdev_name, strerror(errno));
4213 return ifr.ifr_ifindex;
/* Stores the interface index of 'netdev_' in '*ifindexp'.  The index is
 * cached in the netdev_dev_linux, so the kernel is queried only while the
 * VALID_IFINDEX bit of 'cache_valid' is clear.
 * NOTE(review): error handling and the return statements are not visible in
 * this view. */
4217 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4219 struct netdev_dev_linux *netdev_dev =
4220 netdev_dev_linux_cast(netdev_get_dev(netdev_));
/* No valid cached index yet: look it up by device name. */
4222 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4223 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
/* Record the result and mark the cached value as valid. */
4227 netdev_dev->cache_valid |= VALID_IFINDEX;
4228 netdev_dev->ifindex = ifindex;
4230 *ifindexp = netdev_dev->ifindex;
/* Retrieves the hardware address of the device named 'netdev_name' with a
 * SIOCGIFHWADDR ioctl on 'af_inet_sock' and copies its first ETH_ADDR_LEN
 * bytes into 'ea'.
 * NOTE(review): the return statements are not visible in this view. */
4235 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4240 memset(&ifr, 0, sizeof ifr);
4241 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4242 COVERAGE_INC(netdev_get_hwaddr);
4243 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4244 /* ENODEV probably means that a vif disappeared asynchronously and
4245 * hasn't been removed from the database yet, so reduce the log level
4246 * to INFO for that case. */
4247 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4248 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4249 netdev_name, strerror(errno));
/* Only AF_UNSPEC and ARPHRD_ETHER are accepted without comment; any other
 * family is reported with a warning. */
4252 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4253 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4254 VLOG_WARN("%s device has unknown hardware address family %d",
4255 netdev_name, hwaddr_family);
/* As far as is visible here, the address bytes are copied out even after
 * the warning above. */
4257 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware address of the device named 'netdev_name' to the
 * ETH_ADDR_LEN bytes at 'mac', tagged with address family 'hwaddr_family',
 * using a SIOCSIFHWADDR ioctl on 'af_inet_sock'.  A failed ioctl is logged
 * as an error.
 * NOTE(review): the return statements are not visible in this view. */
4262 set_etheraddr(const char *netdev_name, int hwaddr_family,
4263 const uint8_t mac[ETH_ADDR_LEN])
/* Zero the whole request first so that no field is left uninitialized. */
4267 memset(&ifr, 0, sizeof ifr);
4268 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4269 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4270 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4271 COVERAGE_INC(netdev_set_hwaddr);
4272 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4273 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4274 netdev_name, strerror(errno));
/* Issues a SIOCETHTOOL ioctl on 'af_inet_sock' for the device named 'name',
 * passing 'ecmd' as the ioctl's data pointer.  'cmd_name' is used only in
 * the log message.
 * NOTE(review): the line that consumes 'cmd' and the return statements are
 * not visible in this view; presumably 'cmd' is stored into 'ecmd' before
 * the ioctl. */
4281 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4282 int cmd, const char *cmd_name)
4286 memset(&ifr, 0, sizeof ifr);
4287 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
/* The ethtool request structure travels through the ifreq's data pointer. */
4288 ifr.ifr_data = (caddr_t) ecmd;
4291 COVERAGE_INC(netdev_ethtool);
4292 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
/* Failures other than EOPNOTSUPP are logged with a rate-limited warning. */
4295 if (errno != EOPNOTSUPP) {
4296 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4297 "failed: %s", cmd_name, name, strerror(errno));
4299 /* The device doesn't support this operation. That's pretty
4300 * common, so there's no point in logging anything. */
4306 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4307 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4309 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4310 const char *flag_name, bool enable)
4312 const char *netdev_name = netdev_get_name(netdev);
4313 struct ethtool_value evalue;
/* Step 1: read the current flags with ETHTOOL_GFLAGS. */
4317 memset(&evalue, 0, sizeof evalue);
4318 error = netdev_linux_do_ethtool(netdev_name,
4319 (struct ethtool_cmd *)&evalue,
4320 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* Step 2: clear 'flag', set it again if 'enable' is true, and write the
 * result back with ETHTOOL_SFLAGS.  'new_flags' keeps the value that was
 * requested. */
4325 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4326 error = netdev_linux_do_ethtool(netdev_name,
4327 (struct ethtool_cmd *)&evalue,
4328 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
/* Step 3: read the flags once more to see what the device now reports. */
4333 memset(&evalue, 0, sizeof evalue);
4334 error = netdev_linux_do_ethtool(netdev_name,
4335 (struct ethtool_cmd *)&evalue,
4336 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* If the flags read back differ from the requested value, the change did not
 * take effect. */
4341 if (new_flags != evalue.data) {
4342 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4343 "device %s failed", enable ? "enable" : "disable",
4344 flag_name, netdev_name);
/* Copies 'name' into 'ifr->ifr_name' and issues ioctl 'cmd' with 'ifr' on
 * 'af_inet_sock'.  A failure is logged at debug level, using 'cmd_name' as
 * the human-readable name of the ioctl.
 * NOTE(review): the return statements are not visible in this view. */
4352 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4353 const char *cmd_name)
4355 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4356 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4357 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
/* Runs ioctl 'cmd' (named 'cmd_name' for logging) on 'netdev' through
 * netdev_linux_do_ioctl(), with the request's address family set to AF_INET,
 * and stores the IPv4 address found in the result in '*ip'.
 * NOTE(review): the condition guarding the copy and the return statement are
 * not visible in this view. */
4365 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4366 int cmd, const char *cmd_name)
4371 ifr.ifr_addr.sa_family = AF_INET;
4372 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
/* Reinterpret the generic socket address in the ifreq as an IPv4 one. */
4374 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4375 *ip = sin->sin_addr;
4380 /* Returns an AF_PACKET raw socket or a negative errno value. */
4382 af_packet_sock(void)
4384 static int sock = INT_MIN;
4386 if (sock == INT_MIN) {
4387 sock = socket(AF_PACKET, SOCK_RAW, 0);
4389 set_nonblocking(sock);
4392 VLOG_ERR("failed to create packet socket: %s", strerror(errno));