2 * Copyright (c) 2009, 2010, 2011 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
24 #include <arpa/inet.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_sched.h>
33 #include <linux/rtnetlink.h>
34 #include <linux/sockios.h>
35 #include <linux/version.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <netpacket/packet.h>
41 #include <net/if_arp.h>
42 #include <net/if_packet.h>
43 #include <net/route.h>
44 #include <netinet/in.h>
51 #include "dpif-linux.h"
52 #include "dynamic-string.h"
53 #include "fatal-signal.h"
56 #include "netdev-provider.h"
57 #include "netdev-vport.h"
59 #include "netlink-notifier.h"
60 #include "netlink-socket.h"
62 #include "openflow/openflow.h"
64 #include "poll-loop.h"
65 #include "rtnetlink-link.h"
66 #include "socket-util.h"
72 VLOG_DEFINE_THIS_MODULE(netdev_linux);
74 COVERAGE_DEFINE(netdev_get_vlan_vid);
75 COVERAGE_DEFINE(netdev_set_policing);
76 COVERAGE_DEFINE(netdev_arp_lookup);
77 COVERAGE_DEFINE(netdev_get_ifindex);
78 COVERAGE_DEFINE(netdev_get_hwaddr);
79 COVERAGE_DEFINE(netdev_set_hwaddr);
80 COVERAGE_DEFINE(netdev_ethtool);
82 /* These were introduced in Linux 2.6.14, so they might be missing if we have
84 #ifndef ADVERTISED_Pause
85 #define ADVERTISED_Pause (1 << 13)
87 #ifndef ADVERTISED_Asym_Pause
88 #define ADVERTISED_Asym_Pause (1 << 14)
91 /* These were introduced in Linux 2.6.24, so they might be missing if we
92 * have old headers. */
93 #ifndef ETHTOOL_GFLAGS
94 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
96 #ifndef ETHTOOL_SFLAGS
97 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
100 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
103 #define TC_RTAB_SIZE 1024
106 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
107 static int cache_notifier_refcount;
110 VALID_IFINDEX = 1 << 0,
111 VALID_ETHERADDR = 1 << 1,
115 VALID_CARRIER = 1 << 5,
116 VALID_POLICING = 1 << 6,
117 VALID_HAVE_VPORT_STATS = 1 << 7
125 /* Traffic control. */
127 /* An instance of a traffic control class. Always associated with a particular
130 * Each TC implementation subclasses this with whatever additional data it
133 const struct tc_ops *ops;
134 struct hmap queues; /* Contains "struct tc_queue"s.
135 * Read by generic TC layer.
136 * Written only by TC implementation. */
139 /* One traffic control queue.
141 * Each TC implementation subclasses this with whatever additional data it
144 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
145 unsigned int queue_id; /* OpenFlow queue ID. */
148 /* A particular kind of traffic control. Each implementation generally maps to
149 * one particular Linux qdisc class.
151 * The functions below return 0 if successful or a positive errno value on
152 * failure, except where otherwise noted. All of them must be provided, except
153 * where otherwise noted. */
155 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
156 * This is null for tc_ops_default and tc_ops_other, for which there are no
157 * appropriate values. */
158 const char *linux_name;
160 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
161 const char *ovs_name;
163 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
164 * queues. The queues are numbered 0 through n_queues - 1. */
165 unsigned int n_queues;
167 /* Called to install this TC class on 'netdev'. The implementation should
168 * make the Netlink calls required to set up 'netdev' with the right qdisc
169 * and configure it according to 'details'. The implementation may assume
170 * that the current qdisc is the default; that is, there is no need for it
171 * to delete the current qdisc before installing itself.
173 * The contents of 'details' should be documented as valid for 'ovs_name'
174 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
175 * (which is built as ovs-vswitchd.conf.db(8)).
177 * This function must return 0 if and only if it sets 'netdev->tc' to an
178 * initialized 'struct tc'.
180 * (This function is null for tc_ops_other, which cannot be installed. For
181 * other TC classes it should always be nonnull.) */
182 int (*tc_install)(struct netdev *netdev, const struct shash *details);
184 /* Called when the netdev code determines (through a Netlink query) that
185 * this TC class's qdisc is installed on 'netdev', but we didn't install
186 * it ourselves and so don't know any of the details.
188 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
189 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
190 * implementation should parse the other attributes of 'nlmsg' as
191 * necessary to determine its configuration. If necessary it should also
192 * use Netlink queries to determine the configuration of queues on
195 * This function must return 0 if and only if it sets 'netdev->tc' to an
196 * initialized 'struct tc'. */
197 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
199 /* Destroys the data structures allocated by the implementation as part of
200 * 'tc'. (This includes destroying 'tc->queues' by calling
203 * The implementation should not need to perform any Netlink calls. If
204 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
205 * (But it may not be desirable.)
207 * This function may be null if 'tc' is trivial. */
208 void (*tc_destroy)(struct tc *tc);
210 /* Retrieves details of 'netdev->tc' configuration into 'details'.
212 * The implementation should not need to perform any Netlink calls, because
213 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
214 * cached the configuration.
216 * The contents of 'details' should be documented as valid for 'ovs_name'
217 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
218 * (which is built as ovs-vswitchd.conf.db(8)).
220 * This function may be null if 'tc' is not configurable.
222 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
224 /* Reconfigures 'netdev->tc' according to 'details', performing any
225 * required Netlink calls to complete the reconfiguration.
227 * The contents of 'details' should be documented as valid for 'ovs_name'
228 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
229 * (which is built as ovs-vswitchd.conf.db(8)).
231 * This function may be null if 'tc' is not configurable.
233 int (*qdisc_set)(struct netdev *, const struct shash *details);
235 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
236 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
238 * The contents of 'details' should be documented as valid for 'ovs_name'
239 * in the "other_config" column in the "Queue" table in
240 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
242 * The implementation should not need to perform any Netlink calls, because
243 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
244 * cached the queue configuration.
246 * This function may be null if 'tc' does not have queues ('n_queues' is
248 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
249 struct shash *details);
251 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
252 * 'details', performing any required Netlink calls to complete the
253 * reconfiguration. The caller ensures that 'queue_id' is less than
256 * The contents of 'details' should be documented as valid for 'ovs_name'
257 * in the "other_config" column in the "Queue" table in
258 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
260 * This function may be null if 'tc' does not have queues or its queues are
261 * not configurable. */
262 int (*class_set)(struct netdev *, unsigned int queue_id,
263 const struct shash *details);
265 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
266 * tc_queue's within 'netdev->tc->queues'.
268 * This function may be null if 'tc' does not have queues or its queues
269 * cannot be deleted. */
270 int (*class_delete)(struct netdev *, struct tc_queue *queue);
272 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
273 * 'struct tc_queue's within 'netdev->tc->queues'.
275 * On success, initializes '*stats'.
277 * This function may be null if 'tc' does not have queues or if it cannot
278 * report queue statistics. */
279 int (*class_get_stats)(const struct netdev *netdev,
280 const struct tc_queue *queue,
281 struct netdev_queue_stats *stats);
283 /* Extracts queue stats from 'nlmsg', which is a response to a
284 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
286 * This function may be null if 'tc' does not have queues or if it cannot
287 * report queue statistics. */
288 int (*class_dump_stats)(const struct netdev *netdev,
289 const struct ofpbuf *nlmsg,
290 netdev_dump_queue_stats_cb *cb, void *aux);
294 tc_init(struct tc *tc, const struct tc_ops *ops)
297 hmap_init(&tc->queues);
301 tc_destroy(struct tc *tc)
303 hmap_destroy(&tc->queues);
306 static const struct tc_ops tc_ops_htb;
307 static const struct tc_ops tc_ops_hfsc;
308 static const struct tc_ops tc_ops_default;
309 static const struct tc_ops tc_ops_other;
311 static const struct tc_ops *tcs[] = {
312 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
313 &tc_ops_hfsc, /* Hierarchical fair service curve. */
314 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
315 &tc_ops_other, /* Some other qdisc. */
319 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
320 static unsigned int tc_get_major(unsigned int handle);
321 static unsigned int tc_get_minor(unsigned int handle);
323 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
324 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
325 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
327 static struct tcmsg *tc_make_request(const struct netdev *, int type,
328 unsigned int flags, struct ofpbuf *);
329 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
331 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
332 struct nlattr **options);
333 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
334 struct nlattr **options,
335 struct netdev_queue_stats *);
336 static int tc_query_class(const struct netdev *,
337 unsigned int handle, unsigned int parent,
338 struct ofpbuf **replyp);
339 static int tc_delete_class(const struct netdev *, unsigned int handle);
341 static int tc_del_qdisc(struct netdev *netdev);
342 static int tc_query_qdisc(const struct netdev *netdev);
344 static int tc_calc_cell_log(unsigned int mtu);
345 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
346 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
347 const struct tc_ratespec *rate);
348 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
350 struct netdev_dev_linux {
351 struct netdev_dev netdev_dev;
353 struct shash_node *shash_node;
354 unsigned int cache_valid;
355 unsigned int change_seq;
357 bool miimon; /* Link status of last poll. */
358 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
359 struct timer miimon_timer;
361 /* The following are figured out "on demand" only. They are only valid
362 * when the corresponding VALID_* bit in 'cache_valid' is set. */
364 uint8_t etheraddr[ETH_ADDR_LEN];
365 struct in_addr address, netmask;
369 uint32_t kbits_rate; /* Policing data. */
370 uint32_t kbits_burst;
371 bool have_vport_stats;
375 struct tap_state tap;
379 struct netdev_linux {
380 struct netdev netdev;
384 /* Sockets used for ioctl operations. */
385 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
387 /* A Netlink routing socket that is not subscribed to any multicast groups. */
388 static struct nl_sock *rtnl_sock;
390 /* This is set pretty low because we probably won't learn anything from the
391 * additional log messages. */
392 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
394 static int netdev_linux_init(void);
396 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
397 int cmd, const char *cmd_name);
398 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
399 const char *cmd_name);
400 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
401 int cmd, const char *cmd_name);
402 static int get_flags(const struct netdev *, int *flagsp);
403 static int set_flags(struct netdev *, int flags);
404 static int do_get_ifindex(const char *netdev_name);
405 static int get_ifindex(const struct netdev *, int *ifindexp);
406 static int do_set_addr(struct netdev *netdev,
407 int ioctl_nr, const char *ioctl_name,
408 struct in_addr addr);
409 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
410 static int set_etheraddr(const char *netdev_name, int hwaddr_family,
411 const uint8_t[ETH_ADDR_LEN]);
412 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
413 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
414 static int af_packet_sock(void);
415 static void netdev_linux_miimon_run(void);
416 static void netdev_linux_miimon_wait(void);
/* Reports whether 'netdev_class' has netdev_linux_init() as its 'init' member.
 * The cast helpers in this file assert on this before downcasting. */
419 is_netdev_linux_class(const struct netdev_class *netdev_class)
421 return netdev_class->init == netdev_linux_init;
/* Converts generic 'netdev_dev' into the struct netdev_dev_linux that embeds
 * it.  Asserts that the device's class passes is_netdev_linux_class(). */
424 static struct netdev_dev_linux *
425 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
427 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
428 assert(is_netdev_linux_class(netdev_class));
/* 'netdev_dev' is the member named 'netdev_dev' inside the container. */
430 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
/* Converts generic 'netdev' into the struct netdev_linux that embeds it.
 * Asserts that the class of the underlying device passes
 * is_netdev_linux_class(). */
433 static struct netdev_linux *
434 netdev_linux_cast(const struct netdev *netdev)
436 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
437 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
438 assert(is_netdev_linux_class(netdev_class));
/* 'netdev' is the member named 'netdev' inside the container. */
440 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
444 netdev_linux_init(void)
446 static int status = -1;
448 /* Create AF_INET socket. */
449 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
450 status = af_inet_sock >= 0 ? 0 : errno;
452 VLOG_ERR("failed to create inet socket: %s", strerror(status));
455 /* Create rtnetlink socket. */
457 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
459 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
/* Calls rtnetlink_link_run() and then netdev_linux_miimon_run(). */
468 netdev_linux_run(void)
470 rtnetlink_link_run();
471 netdev_linux_miimon_run();
/* Calls rtnetlink_link_wait() and then netdev_linux_miimon_wait(); the
 * counterpart of netdev_linux_run(). */
475 netdev_linux_wait(void)
477 rtnetlink_link_wait();
478 netdev_linux_miimon_wait();
482 netdev_dev_linux_changed(struct netdev_dev_linux *dev)
485 if (!dev->change_seq) {
488 dev->cache_valid = 0;
492 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
493 void *aux OVS_UNUSED)
495 struct netdev_dev_linux *dev;
497 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
499 const struct netdev_class *netdev_class =
500 netdev_dev_get_class(base_dev);
502 if (is_netdev_linux_class(netdev_class)) {
503 dev = netdev_dev_linux_cast(base_dev);
504 netdev_dev_linux_changed(dev);
508 struct shash device_shash;
509 struct shash_node *node;
511 shash_init(&device_shash);
512 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
513 SHASH_FOR_EACH (node, &device_shash) {
515 netdev_dev_linux_changed(dev);
517 shash_destroy(&device_shash);
521 /* Creates system and internal devices. */
523 netdev_linux_create(const struct netdev_class *class, const char *name,
524 struct netdev_dev **netdev_devp)
526 struct netdev_dev_linux *netdev_dev;
528 if (!cache_notifier_refcount) {
529 assert(!netdev_linux_cache_notifier);
531 netdev_linux_cache_notifier =
532 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
534 if (!netdev_linux_cache_notifier) {
538 cache_notifier_refcount++;
540 netdev_dev = xzalloc(sizeof *netdev_dev);
541 netdev_dev->change_seq = 1;
542 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
544 *netdev_devp = &netdev_dev->netdev_dev;
548 /* For most types of netdevs we open the device for each call of
549 * netdev_open(). However, this is not the case with tap devices,
550 * since it is only possible to open the device once. In this
551 * situation we share a single file descriptor, and consequently
552 * buffers, across all readers. Therefore once data is read it will
553 * be unavailable to other reads for tap devices. */
555 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
556 const char *name, struct netdev_dev **netdev_devp)
558 struct netdev_dev_linux *netdev_dev;
559 struct tap_state *state;
560 static const char tap_dev[] = "/dev/net/tun";
564 netdev_dev = xzalloc(sizeof *netdev_dev);
565 state = &netdev_dev->state.tap;
567 /* Open tap device. */
568 state->fd = open(tap_dev, O_RDWR);
571 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
575 /* Create tap device. */
576 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
577 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
578 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
579 VLOG_WARN("%s: creating tap device failed: %s", name,
585 /* Make non-blocking. */
586 error = set_nonblocking(state->fd);
591 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
592 *netdev_devp = &netdev_dev->netdev_dev;
601 destroy_tap(struct netdev_dev_linux *netdev_dev)
603 struct tap_state *state = &netdev_dev->state.tap;
605 if (state->fd >= 0) {
610 /* Destroys the netdev device 'netdev_dev_'. */
612 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
614 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
615 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
617 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
618 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
621 if (class == &netdev_linux_class || class == &netdev_internal_class) {
622 cache_notifier_refcount--;
624 if (!cache_notifier_refcount) {
625 assert(netdev_linux_cache_notifier);
626 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
627 netdev_linux_cache_notifier = NULL;
629 } else if (class == &netdev_tap_class) {
630 destroy_tap(netdev_dev);
639 netdev_linux_open(struct netdev_dev *netdev_dev_, struct netdev **netdevp)
641 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
642 struct netdev_linux *netdev;
643 enum netdev_flags flags;
646 /* Allocate network device. */
647 netdev = xzalloc(sizeof *netdev);
649 netdev_init(&netdev->netdev, netdev_dev_);
651 /* Verify that the device really exists, by attempting to read its flags.
652 * (The flags might be cached, in which case this won't actually do an
655 * Don't do this for "internal" netdevs, though, because those have to be
656 * created as netdev objects before they exist in the kernel, because
657 * creating them in the kernel happens by passing a netdev object to
658 * dpif_port_add(). */
659 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
660 error = netdev_get_flags(&netdev->netdev, &flags);
661 if (error == ENODEV) {
666 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
667 !netdev_dev->state.tap.opened) {
669 /* We assume that the first user of the tap device is the primary user
670 * and give them the tap FD. Subsequent users probably just expect
671 * this to be a system device so open it normally to avoid send/receive
672 * directions appearing to be reversed. */
673 netdev->fd = netdev_dev->state.tap.fd;
674 netdev_dev->state.tap.opened = true;
677 *netdevp = &netdev->netdev;
681 netdev_uninit(&netdev->netdev, true);
685 /* Closes and destroys 'netdev'. */
687 netdev_linux_close(struct netdev *netdev_)
689 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* Descriptor 0 is a valid fd, so test for ">= 0" exactly as the listen, recv
 * and wait functions in this file do.  The strcmp() keeps this branch from
 * running for devices whose type is "tap". */
691 if (netdev->fd >= 0 && strcmp(netdev_get_type(netdev_), "tap")) {
697 /* Initializes 'sset' with a list of the names of all known network devices. */
699 netdev_linux_enumerate(struct sset *sset)
701 struct if_nameindex *names;
703 names = if_nameindex();
707 for (i = 0; names[i].if_name != NULL; i++) {
708 sset_add(sset, names[i].if_name);
710 if_freenameindex(names);
713 VLOG_WARN("could not obtain list of network device names: %s",
720 netdev_linux_listen(struct netdev *netdev_)
722 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
723 struct sockaddr_ll sll;
728 if (netdev->fd >= 0) {
732 /* Create file descriptor. */
733 fd = socket(PF_PACKET, SOCK_RAW, 0);
736 VLOG_ERR("failed to create raw socket (%s)", strerror(error));
740 /* Set non-blocking mode. */
741 error = set_nonblocking(fd);
746 /* Get ethernet device index. */
747 error = get_ifindex(&netdev->netdev, &ifindex);
752 /* Bind to specific ethernet device. */
753 memset(&sll, 0, sizeof sll);
754 sll.sll_family = AF_PACKET;
755 sll.sll_ifindex = ifindex;
756 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
757 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
759 VLOG_ERR("%s: failed to bind raw socket (%s)",
760 netdev_get_name(netdev_), strerror(error));
/* Reads a packet from 'netdev_''s fd into the 'size'-byte buffer at 'data'.
 * A negative fd means the device is not listening.  A read() failure other
 * than EINTR or EAGAIN is logged, rate-limited, as "<device>: <error>". */
775 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
777 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
779 if (netdev->fd < 0) {
780 /* Device is not listening. */
785 ssize_t retval = read(netdev->fd, data, size);
788 } else if (errno != EINTR) {
789 if (errno != EAGAIN) {
/* Argument order must follow the format string: device name first, then
 * the error description (same order as the send path uses). */
790 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
791 netdev_get_name(netdev_), strerror(errno));
798 /* Registers with the poll loop to wake up from the next call to poll_block()
799 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
801 netdev_linux_recv_wait(struct netdev *netdev_)
803 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
804 if (netdev->fd >= 0) {
805 poll_fd_wait(netdev->fd, POLLIN);
809 /* Discards all packets waiting to be received from 'netdev'. */
811 netdev_linux_drain(struct netdev *netdev_)
813 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
814 if (netdev->fd < 0) {
816 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
818 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
819 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
823 drain_fd(netdev->fd, ifr.ifr_qlen);
826 return drain_rcvbuf(netdev->fd);
830 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
831 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
832 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
833 * the packet is too big or too small to transmit on the device.
835 * The caller retains ownership of 'buffer' in all cases.
837 * The kernel maintains a packet transmission queue, so the caller is not
838 * expected to do additional queuing of packets. */
840 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
842 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
846 if (netdev->fd < 0) {
847 /* Use our AF_PACKET socket to send to this device. */
848 struct sockaddr_ll sll;
855 sock = af_packet_sock();
860 error = get_ifindex(netdev_, &ifindex);
865 /* We don't bother setting most fields in sockaddr_ll because the
866 * kernel ignores them for SOCK_RAW. */
867 memset(&sll, 0, sizeof sll);
868 sll.sll_family = AF_PACKET;
869 sll.sll_ifindex = ifindex;
871 iov.iov_base = (void *) data;
875 msg.msg_namelen = sizeof sll;
878 msg.msg_control = NULL;
879 msg.msg_controllen = 0;
882 retval = sendmsg(sock, &msg, 0);
884 /* Use the netdev's own fd to send to this device. This is
885 * essential for tap devices, because packets sent to a tap device
886 * with an AF_PACKET socket will loop back to be *received* again
887 * on the tap device. */
888 retval = write(netdev->fd, data, size);
892 /* The Linux AF_PACKET implementation never blocks waiting for room
893 * for packets, instead returning ENOBUFS. Translate this into
894 * EAGAIN for the caller. */
895 if (errno == ENOBUFS) {
897 } else if (errno == EINTR) {
899 } else if (errno != EAGAIN) {
900 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
901 netdev_get_name(netdev_), strerror(errno));
904 } else if (retval != size) {
905 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
906 "%zu) on %s", retval, size, netdev_get_name(netdev_));
914 /* Registers with the poll loop to wake up from the next call to poll_block()
915 * when the packet transmission queue has sufficient room to transmit a packet
916 * with netdev_send().
918 * The kernel maintains a packet transmission queue, so the client is not
919 * expected to do additional queuing of packets. Thus, this function is
920 * unlikely to ever be used. It is included for completeness. */
922 netdev_linux_send_wait(struct netdev *netdev_)
924 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
925 if (netdev->fd < 0) {
927 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
928 poll_fd_wait(netdev->fd, POLLOUT);
930 /* TAP device always accepts packets.*/
931 poll_immediate_wake();
935 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
936 * otherwise a positive errno value. */
938 netdev_linux_set_etheraddr(struct netdev *netdev_,
939 const uint8_t mac[ETH_ADDR_LEN])
941 struct netdev_dev_linux *netdev_dev =
942 netdev_dev_linux_cast(netdev_get_dev(netdev_));
945 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
946 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
947 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
949 netdev_dev->cache_valid |= VALID_ETHERADDR;
950 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
958 /* Copies 'netdev''s MAC address into 'mac', reading it from the device
959 * first if it is not already cached. */
961 netdev_linux_get_etheraddr(const struct netdev *netdev_,
962 uint8_t mac[ETH_ADDR_LEN])
964 struct netdev_dev_linux *netdev_dev =
965 netdev_dev_linux_cast(netdev_get_dev(netdev_));
966 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
967 int error = get_etheraddr(netdev_get_name(netdev_),
968 netdev_dev->etheraddr);
972 netdev_dev->cache_valid |= VALID_ETHERADDR;
974 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
978 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
979 * in bytes, not including the hardware header; thus, this is typically 1500
980 * bytes for Ethernet devices. */
982 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
984 struct netdev_dev_linux *netdev_dev =
985 netdev_dev_linux_cast(netdev_get_dev(netdev_));
986 if (!(netdev_dev->cache_valid & VALID_MTU)) {
990 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
991 SIOCGIFMTU, "SIOCGIFMTU");
995 netdev_dev->mtu = ifr.ifr_mtu;
996 netdev_dev->cache_valid |= VALID_MTU;
998 *mtup = netdev_dev->mtu;
1002 /* Sets the maximum transmission unit (MTU) of the given device using the
1003 * Linux networking ioctl interface.
1006 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1008 struct netdev_dev_linux *netdev_dev =
1009 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1014 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1015 SIOCSIFMTU, "SIOCSIFMTU");
1020 netdev_dev->mtu = ifr.ifr_mtu;
1021 netdev_dev->cache_valid |= VALID_MTU;
1025 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1026 * On failure, returns a negative errno value. */
1028 netdev_linux_get_ifindex(const struct netdev *netdev)
1032 error = get_ifindex(netdev, &ifindex);
1033 return error ? -error : ifindex;
1037 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1039 struct netdev_dev_linux *netdev_dev =
1040 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1045 if (netdev_dev->miimon_interval > 0) {
1046 *carrier = netdev_dev->miimon;
1050 if (!(netdev_dev->cache_valid & VALID_CARRIER)) {
1054 fn = xasprintf("/sys/class/net/%s/carrier",
1055 netdev_get_name(netdev_));
1056 fd = open(fn, O_RDONLY);
1059 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
1063 retval = read(fd, line, sizeof line);
1066 if (error == EINVAL) {
1067 /* This is the normal return value when we try to check carrier
1068 * if the network device is not up. */
1070 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
1073 } else if (retval == 0) {
1075 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
1079 if (line[0] != '0' && line[0] != '1') {
1081 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)",
1085 netdev_dev->carrier = line[0] != '0';
1086 netdev_dev->cache_valid |= VALID_CARRIER;
1088 *carrier = netdev_dev->carrier;
1100 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1101 struct mii_ioctl_data *data)
1106 memset(&ifr, 0, sizeof ifr);
1107 memcpy(&ifr.ifr_data, data, sizeof *data);
1108 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1109 memcpy(data, &ifr.ifr_data, sizeof *data);
1115 netdev_linux_get_miimon(const char *name, bool *miimon)
1117 struct mii_ioctl_data data;
1122 memset(&data, 0, sizeof data);
1123 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1125 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1126 data.reg_num = MII_BMSR;
1127 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1131 *miimon = !!(data.val_out & BMSR_LSTATUS);
1133 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1136 struct ethtool_cmd ecmd;
1138 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1141 memset(&ecmd, 0, sizeof ecmd);
1142 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1145 struct ethtool_value eval;
1147 memcpy(&eval, &ecmd, sizeof eval);
1148 *miimon = !!eval.data;
1150 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
/* Sets the MII link-monitoring poll interval of 'netdev_''s device to
 * 'interval'.  A positive 'interval' is raised to at least 100; any other
 * value is stored as 0, which disables miimon for the device.  When the stored
 * value changes, the device's miimon timer is marked expired. */
1158 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1159 long long int interval)
1161 struct netdev_dev_linux *netdev_dev;
1163 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
/* Clamp: positive values have a floor of 100, everything else means "off". */
1165 interval = interval > 0 ? MAX(interval, 100) : 0;
1166 if (netdev_dev->miimon_interval != interval) {
1167 netdev_dev->miimon_interval = interval;
1168 timer_set_expired(&netdev_dev->miimon_timer);
/* Refreshes the MII link state of the netdev_linux_class devices returned by
 * netdev_dev_get_devices().  The link is queried with
 * netdev_linux_get_miimon(); when the result differs from 'dev->miimon', it
 * is stored and netdev_dev_linux_changed() is called.  The device's timer is
 * then set to run for 'miimon_interval' again. */
1175 netdev_linux_miimon_run(void)
1177 struct shash device_shash;
1178 struct shash_node *node;
1180 shash_init(&device_shash);
1181 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1182 SHASH_FOR_EACH (node, &device_shash) {
1183 struct netdev_dev_linux *dev = node->data;
/* True when miimon is disabled for 'dev' or its poll timer has not expired. */
1186 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
/* NOTE(review): the return value of netdev_linux_get_miimon() is ignored
 * here; confirm that it always stores a value in 'miimon' on failure. */
1190 netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
1191 if (miimon != dev->miimon) {
1192 dev->miimon = miimon;
1193 netdev_dev_linux_changed(dev);
1196 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1199 shash_destroy(&device_shash);
/* Calls timer_wait() on the miimon timer of every netdev_linux_class device
 * that has miimon enabled (a positive 'miimon_interval'). */
1203 netdev_linux_miimon_wait(void)
1205 struct shash device_shash;
1206 struct shash_node *node;
1208 shash_init(&device_shash);
1209 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1210 SHASH_FOR_EACH (node, &device_shash) {
1211 struct netdev_dev_linux *dev = node->data;
/* Devices with a nonpositive interval have miimon disabled. */
1213 if (dev->miimon_interval > 0) {
1214 timer_wait(&dev->miimon_timer);
1217 shash_destroy(&device_shash);
1220 /* Checks whether we can use RTM_GETLINK to get network device statistics.
1221 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1224 check_for_working_netlink_stats(void)
1226 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1227 * preferable, so if that works, we'll use it. */
1228 int ifindex = do_get_ifindex("lo");
1230 VLOG_WARN("failed to get ifindex for lo, "
1231 "obtaining netdev stats from proc");
1234 struct netdev_stats stats;
1235 int error = get_stats_via_netlink(ifindex, &stats);
1237 VLOG_DBG("obtaining netdev stats via rtnetlink");
1240 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1241 "via proc (you are probably running a pre-2.6.19 "
1242 "kernel)", strerror(error));
/* Exchanges the 64-bit values stored at 'a' and 'b'.  Safe when 'a' and 'b'
 * point to the same object (the value is left unchanged). */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    const uint64_t saved = *b;

    *b = *a;
    *a = saved;
}
/* Attempts to fill 'stats' for 'netdev_' from the vport layer.
 *
 * The query is made when vport stats are already known to work
 * ('have_vport_stats') or when that has not been determined yet
 * (VALID_HAVE_VPORT_STATS not set in 'cache_valid').  After a query, the
 * outcome is cached: 'have_vport_stats' becomes true only if
 * netdev_vport_get_stats() returned 0, and VALID_HAVE_VPORT_STATS is set.
 * Callers read 'netdev_dev->have_vport_stats' to learn whether 'stats' was
 * filled in. */
1257 get_stats_via_vport(const struct netdev *netdev_,
1258 struct netdev_stats *stats)
1260 struct netdev_dev_linux *netdev_dev =
1261 netdev_dev_linux_cast(netdev_get_dev(netdev_));
/* Query if vport stats worked before, or if we have never tried. */
1263 if (netdev_dev->have_vport_stats ||
1264 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1267 error = netdev_vport_get_stats(netdev_, stats);
1269 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed %d",
1270 netdev_get_name(netdev_), error);
/* Remember the outcome for later calls. */
1272 netdev_dev->have_vport_stats = !error;
1273 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
/* Retrieves the kernel's statistics for 'netdev_' into 'stats'.
 *
 * The first call decides, via check_for_working_netlink_stats(), which
 * mechanism to use and caches that decision in a function-local static for
 * every later call.  With netlink, the device's ifindex is looked up and
 * passed to get_stats_via_netlink(); otherwise the stats are read with
 * get_stats_via_proc() using the device name.  A failure is logged
 * (rate-limited) together with the error code. */
1278 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1279 struct netdev_stats *stats)
/* -1 means "not probed yet". */
1281 static int use_netlink_stats = -1;
1284 if (use_netlink_stats < 0) {
1285 use_netlink_stats = check_for_working_netlink_stats();
1288 if (use_netlink_stats) {
1291 error = get_ifindex(netdev_, &ifindex);
1293 error = get_stats_via_netlink(ifindex, stats);
1296 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1300 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1301 netdev_get_name(netdev_), error);
1307 /* Retrieves current device stats for 'netdev-linux'. */
/* Two sources are consulted: get_stats_via_vport() writes directly into
 * 'stats', and netdev_linux_sys_get_stats() fills the local 'dev_stats'.
 * When vport stats are available, the error, drop, multicast and collision
 * counters reported by the kernel device are added on top of them.  When
 * they are not, the kernel ("ioctl") stats are used instead, as the comment
 * in the body says.
 * NOTE(review): the handling of a failing netdev_linux_sys_get_stats() is on
 * lines not visible here -- confirm before relying on the return value. */
1309 netdev_linux_get_stats(const struct netdev *netdev_,
1310 struct netdev_stats *stats)
1312 struct netdev_dev_linux *netdev_dev =
1313 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1314 struct netdev_stats dev_stats;
1317 get_stats_via_vport(netdev_, stats);
1319 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1322 if (!netdev_dev->have_vport_stats) {
1329 if (!netdev_dev->have_vport_stats) {
1330 /* stats not available from OVS then use ioctl stats. */
/* The additions below merge the kernel device's counters into 'stats'. */
1333 stats->rx_errors += dev_stats.rx_errors;
1334 stats->tx_errors += dev_stats.tx_errors;
1335 stats->rx_dropped += dev_stats.rx_dropped;
1336 stats->tx_dropped += dev_stats.tx_dropped;
1337 stats->multicast += dev_stats.multicast;
1338 stats->collisions += dev_stats.collisions;
1339 stats->rx_length_errors += dev_stats.rx_length_errors;
1340 stats->rx_over_errors += dev_stats.rx_over_errors;
1341 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1342 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1343 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1344 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1345 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1346 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1347 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1348 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1349 stats->tx_window_errors += dev_stats.tx_window_errors;
1354 /* Retrieves current device stats for 'netdev-tap' netdev or
1355 * netdev-internal. */
1357 netdev_pseudo_get_stats(const struct netdev *netdev_,
1358 struct netdev_stats *stats)
1360 struct netdev_dev_linux *netdev_dev =
1361 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1362 struct netdev_stats dev_stats;
1365 get_stats_via_vport(netdev_, stats);
1367 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1369 if (!netdev_dev->have_vport_stats) {
1376 /* If this port is an internal port then the transmit and receive stats
1377 * will appear to be swapped relative to the other ports since we are the
1378 * one sending the data, not a remote computer. For consistency, we swap
1379 * them back here. This does not apply if we are getting stats from the
1380 * vport layer because it always tracks stats from the perspective of the
1382 if (!netdev_dev->have_vport_stats) {
1384 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1385 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1386 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1387 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1388 stats->rx_length_errors = 0;
1389 stats->rx_over_errors = 0;
1390 stats->rx_crc_errors = 0;
1391 stats->rx_frame_errors = 0;
1392 stats->rx_fifo_errors = 0;
1393 stats->rx_missed_errors = 0;
1394 stats->tx_aborted_errors = 0;
1395 stats->tx_carrier_errors = 0;
1396 stats->tx_fifo_errors = 0;
1397 stats->tx_heartbeat_errors = 0;
1398 stats->tx_window_errors = 0;
1400 stats->rx_dropped += dev_stats.tx_dropped;
1401 stats->tx_dropped += dev_stats.rx_dropped;
1403 stats->rx_errors += dev_stats.tx_errors;
1404 stats->tx_errors += dev_stats.rx_errors;
1406 stats->multicast += dev_stats.multicast;
1407 stats->collisions += dev_stats.collisions;
1412 /* Stores the features supported by 'netdev' into '*current',
1413 * '*advertised', '*supported', and '*peer'. This implementation writes
1414 * through the pointers (e.g. '*peer' and '*current') without null checks, so
1415 * callers must pass valid pointers for all four. Each value is a bitmap of
 * "enum ofp_port_features" bits, in host byte order. Returns 0 if
 * successful, otherwise a positive errno value. */
1417 netdev_linux_get_features(const struct netdev *netdev,
1418 uint32_t *current, uint32_t *advertised,
1419 uint32_t *supported, uint32_t *peer)
1421 struct ethtool_cmd ecmd;
1424 memset(&ecmd, 0, sizeof ecmd);
1425 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1426 ETHTOOL_GSET, "ETHTOOL_GSET");
1431 /* Supported features. */
1433 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1434 *supported |= OFPPF_10MB_HD;
1436 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1437 *supported |= OFPPF_10MB_FD;
1439 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1440 *supported |= OFPPF_100MB_HD;
1442 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1443 *supported |= OFPPF_100MB_FD;
1445 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1446 *supported |= OFPPF_1GB_HD;
1448 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1449 *supported |= OFPPF_1GB_FD;
1451 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1452 *supported |= OFPPF_10GB_FD;
1454 if (ecmd.supported & SUPPORTED_TP) {
1455 *supported |= OFPPF_COPPER;
1457 if (ecmd.supported & SUPPORTED_FIBRE) {
1458 *supported |= OFPPF_FIBER;
1460 if (ecmd.supported & SUPPORTED_Autoneg) {
1461 *supported |= OFPPF_AUTONEG;
1463 if (ecmd.supported & SUPPORTED_Pause) {
1464 *supported |= OFPPF_PAUSE;
1466 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1467 *supported |= OFPPF_PAUSE_ASYM;
1470 /* Advertised features. */
1472 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1473 *advertised |= OFPPF_10MB_HD;
1475 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1476 *advertised |= OFPPF_10MB_FD;
1478 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1479 *advertised |= OFPPF_100MB_HD;
1481 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1482 *advertised |= OFPPF_100MB_FD;
1484 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1485 *advertised |= OFPPF_1GB_HD;
1487 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1488 *advertised |= OFPPF_1GB_FD;
1490 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1491 *advertised |= OFPPF_10GB_FD;
1493 if (ecmd.advertising & ADVERTISED_TP) {
1494 *advertised |= OFPPF_COPPER;
1496 if (ecmd.advertising & ADVERTISED_FIBRE) {
1497 *advertised |= OFPPF_FIBER;
1499 if (ecmd.advertising & ADVERTISED_Autoneg) {
1500 *advertised |= OFPPF_AUTONEG;
1502 if (ecmd.advertising & ADVERTISED_Pause) {
1503 *advertised |= OFPPF_PAUSE;
1505 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1506 *advertised |= OFPPF_PAUSE_ASYM;
1509 /* Current settings. */
1510 if (ecmd.speed == SPEED_10) {
1511 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1512 } else if (ecmd.speed == SPEED_100) {
1513 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1514 } else if (ecmd.speed == SPEED_1000) {
1515 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1516 } else if (ecmd.speed == SPEED_10000) {
1517 *current = OFPPF_10GB_FD;
1522 if (ecmd.port == PORT_TP) {
1523 *current |= OFPPF_COPPER;
1524 } else if (ecmd.port == PORT_FIBRE) {
1525 *current |= OFPPF_FIBER;
1529 *current |= OFPPF_AUTONEG;
1532 /* Peer advertisements. */
1533 *peer = 0; /* XXX */
1538 /* Set the features advertised by 'netdev' to 'advertise'. */
1540 netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1542 struct ethtool_cmd ecmd;
1545 memset(&ecmd, 0, sizeof ecmd);
1546 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1547 ETHTOOL_GSET, "ETHTOOL_GSET");
1552 ecmd.advertising = 0;
1553 if (advertise & OFPPF_10MB_HD) {
1554 ecmd.advertising |= ADVERTISED_10baseT_Half;
1556 if (advertise & OFPPF_10MB_FD) {
1557 ecmd.advertising |= ADVERTISED_10baseT_Full;
1559 if (advertise & OFPPF_100MB_HD) {
1560 ecmd.advertising |= ADVERTISED_100baseT_Half;
1562 if (advertise & OFPPF_100MB_FD) {
1563 ecmd.advertising |= ADVERTISED_100baseT_Full;
1565 if (advertise & OFPPF_1GB_HD) {
1566 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1568 if (advertise & OFPPF_1GB_FD) {
1569 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1571 if (advertise & OFPPF_10GB_FD) {
1572 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1574 if (advertise & OFPPF_COPPER) {
1575 ecmd.advertising |= ADVERTISED_TP;
1577 if (advertise & OFPPF_FIBER) {
1578 ecmd.advertising |= ADVERTISED_FIBRE;
1580 if (advertise & OFPPF_AUTONEG) {
1581 ecmd.advertising |= ADVERTISED_Autoneg;
1583 if (advertise & OFPPF_PAUSE) {
1584 ecmd.advertising |= ADVERTISED_Pause;
1586 if (advertise & OFPPF_PAUSE_ASYM) {
1587 ecmd.advertising |= ADVERTISED_Asym_Pause;
1589 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1590 ETHTOOL_SSET, "ETHTOOL_SSET");
1593 /* If 'netdev_name' is the name of a VLAN network device (e.g. one created with
1594 * vconfig(8)), sets '*vlan_vid' to the VLAN VID associated with that device
1595 * and returns 0. Otherwise returns a errno value (specifically ENOENT if
1596 * 'netdev_name' is the name of a network device that is not a VLAN device) and
1597 * sets '*vlan_vid' to -1. */
1599 netdev_linux_get_vlan_vid(const struct netdev *netdev, int *vlan_vid)
1601 const char *netdev_name = netdev_get_name(netdev);
1602 struct ds line = DS_EMPTY_INITIALIZER;
1603 FILE *stream = NULL;
1607 COVERAGE_INC(netdev_get_vlan_vid);
1608 fn = xasprintf("/proc/net/vlan/%s", netdev_name);
1609 stream = fopen(fn, "r");
1615 if (ds_get_line(&line, stream)) {
1616 if (ferror(stream)) {
1618 VLOG_ERR_RL(&rl, "error reading \"%s\": %s", fn, strerror(errno));
1621 VLOG_ERR_RL(&rl, "unexpected end of file reading \"%s\"", fn);
1626 if (!sscanf(ds_cstr(&line), "%*s VID: %d", vlan_vid)) {
1628 VLOG_ERR_RL(&rl, "parse error reading \"%s\" line 1: \"%s\"",
1629 fn, ds_cstr(&line));
/* Shell command templates used to install ingress policing with tc(8).
 *
 * POLICE_ADD_CMD takes the device name (%s).  POLICE_CONFIG_CMD takes the
 * device name (%s) followed by the rate and the burst; both are passed as
 * uint32_t by netdev_linux_set_policing(), hence the unsigned "%u"
 * conversions ("%d" would print values above INT_MAX as negative). */
#define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress"
#define POLICE_CONFIG_CMD "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 u32 match ip src 0.0.0.0/0 police rate %ukbit burst %uk mtu 65535 drop flowid :1"
1650 /* Remove ingress policing from 'netdev'. Returns 0 if successful, otherwise a
1651 * positive errno value.
1653 * This function is equivalent to running
1654 * /sbin/tc qdisc del dev %s handle ffff: ingress
1655 * but it is much, much faster.
1658 netdev_linux_remove_policing(struct netdev *netdev)
1660 struct netdev_dev_linux *netdev_dev =
1661 netdev_dev_linux_cast(netdev_get_dev(netdev));
1662 const char *netdev_name = netdev_get_name(netdev);
1664 struct ofpbuf request;
1665 struct tcmsg *tcmsg;
1668 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
1672 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
1673 tcmsg->tcm_parent = TC_H_INGRESS;
1674 nl_msg_put_string(&request, TCA_KIND, "ingress");
1675 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
1677 error = tc_transact(&request, NULL);
1678 if (error && error != ENOENT && error != EINVAL) {
1679 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1680 netdev_name, strerror(error));
1684 netdev_dev->kbits_rate = 0;
1685 netdev_dev->kbits_burst = 0;
1686 netdev_dev->cache_valid |= VALID_POLICING;
1690 /* Attempts to set input rate limiting (policing) policy. */
1692 netdev_linux_set_policing(struct netdev *netdev,
1693 uint32_t kbits_rate, uint32_t kbits_burst)
1695 struct netdev_dev_linux *netdev_dev =
1696 netdev_dev_linux_cast(netdev_get_dev(netdev));
1697 const char *netdev_name = netdev_get_name(netdev);
1700 COVERAGE_INC(netdev_set_policing);
1702 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1703 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1704 : kbits_burst); /* Stick with user-specified value. */
1706 if (netdev_dev->cache_valid & VALID_POLICING
1707 && netdev_dev->kbits_rate == kbits_rate
1708 && netdev_dev->kbits_burst == kbits_burst) {
1709 /* Assume that settings haven't changed since we last set them. */
1713 netdev_linux_remove_policing(netdev);
1715 snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
1716 if (system(command) != 0) {
1717 VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
1721 snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
1722 kbits_rate, kbits_burst);
1723 if (system(command) != 0) {
1724 VLOG_WARN_RL(&rl, "%s: problem configuring policing",
1729 netdev_dev->kbits_rate = kbits_rate;
1730 netdev_dev->kbits_burst = kbits_burst;
1731 netdev_dev->cache_valid |= VALID_POLICING;
1738 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1741 const struct tc_ops **opsp;
1743 for (opsp = tcs; *opsp != NULL; opsp++) {
1744 const struct tc_ops *ops = *opsp;
1745 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1746 sset_add(types, ops->ovs_name);
1752 static const struct tc_ops *
1753 tc_lookup_ovs_name(const char *name)
1755 const struct tc_ops **opsp;
1757 for (opsp = tcs; *opsp != NULL; opsp++) {
1758 const struct tc_ops *ops = *opsp;
1759 if (!strcmp(name, ops->ovs_name)) {
1766 static const struct tc_ops *
1767 tc_lookup_linux_name(const char *name)
1769 const struct tc_ops **opsp;
1771 for (opsp = tcs; *opsp != NULL; opsp++) {
1772 const struct tc_ops *ops = *opsp;
1773 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1780 static struct tc_queue *
1781 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1784 struct netdev_dev_linux *netdev_dev =
1785 netdev_dev_linux_cast(netdev_get_dev(netdev));
1786 struct tc_queue *queue;
1788 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1789 if (queue->queue_id == queue_id) {
1796 static struct tc_queue *
1797 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1799 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
1803 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1805 struct netdev_qos_capabilities *caps)
1807 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1811 caps->n_queues = ops->n_queues;
1816 netdev_linux_get_qos(const struct netdev *netdev,
1817 const char **typep, struct shash *details)
1819 struct netdev_dev_linux *netdev_dev =
1820 netdev_dev_linux_cast(netdev_get_dev(netdev));
1823 error = tc_query_qdisc(netdev);
1828 *typep = netdev_dev->tc->ops->ovs_name;
1829 return (netdev_dev->tc->ops->qdisc_get
1830 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
/* Configures QoS of type 'type' with settings 'details' on 'netdev'.
 *
 * 'type' is resolved with tc_lookup_ovs_name(); a type that is unknown or has
 * no 'tc_install' callback takes the error branch below.  After refreshing
 * the cached qdisc state with tc_query_qdisc(), one of two things happens:
 *
 *   - If the requested type is the one already installed, only its settings
 *     are updated through 'qdisc_set' (0 is returned if the type has no such
 *     callback).
 *
 *   - Otherwise the existing qdisc is deleted and the new one installed with
 *     'tc_install'.
 *
 * NOTE(review): the error returns after the lookup, tc_query_qdisc() and
 * tc_del_qdisc() are on lines not visible here. */
1835 netdev_linux_set_qos(struct netdev *netdev,
1836 const char *type, const struct shash *details)
1838 struct netdev_dev_linux *netdev_dev =
1839 netdev_dev_linux_cast(netdev_get_dev(netdev));
1840 const struct tc_ops *new_ops;
1843 new_ops = tc_lookup_ovs_name(type);
1844 if (!new_ops || !new_ops->tc_install) {
1848 error = tc_query_qdisc(netdev);
/* Same qdisc type: reconfigure in place instead of reinstalling. */
1853 if (new_ops == netdev_dev->tc->ops) {
1854 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1856 /* Delete existing qdisc. */
1857 error = tc_del_qdisc(netdev);
/* Invariant: deleting the qdisc must have cleared the cached 'tc'. */
1861 assert(netdev_dev->tc == NULL);
1863 /* Install new qdisc. */
1864 error = new_ops->tc_install(netdev, details);
/* Invariant: 'tc' is set if and only if the installation succeeded. */
1865 assert((error == 0) == (netdev_dev->tc != NULL));
1872 netdev_linux_get_queue(const struct netdev *netdev,
1873 unsigned int queue_id, struct shash *details)
1875 struct netdev_dev_linux *netdev_dev =
1876 netdev_dev_linux_cast(netdev_get_dev(netdev));
1879 error = tc_query_qdisc(netdev);
1883 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1885 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
1891 netdev_linux_set_queue(struct netdev *netdev,
1892 unsigned int queue_id, const struct shash *details)
1894 struct netdev_dev_linux *netdev_dev =
1895 netdev_dev_linux_cast(netdev_get_dev(netdev));
1898 error = tc_query_qdisc(netdev);
1901 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1902 || !netdev_dev->tc->ops->class_set) {
1906 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
1910 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1912 struct netdev_dev_linux *netdev_dev =
1913 netdev_dev_linux_cast(netdev_get_dev(netdev));
1916 error = tc_query_qdisc(netdev);
1919 } else if (!netdev_dev->tc->ops->class_delete) {
1922 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1924 ? netdev_dev->tc->ops->class_delete(netdev, queue)
1930 netdev_linux_get_queue_stats(const struct netdev *netdev,
1931 unsigned int queue_id,
1932 struct netdev_queue_stats *stats)
1934 struct netdev_dev_linux *netdev_dev =
1935 netdev_dev_linux_cast(netdev_get_dev(netdev));
1938 error = tc_query_qdisc(netdev);
1941 } else if (!netdev_dev->tc->ops->class_get_stats) {
1944 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1946 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
1952 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1954 struct ofpbuf request;
1955 struct tcmsg *tcmsg;
1957 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1961 tcmsg->tcm_parent = 0;
1962 nl_dump_start(dump, rtnl_sock, &request);
1963 ofpbuf_uninit(&request);
1968 netdev_linux_dump_queues(const struct netdev *netdev,
1969 netdev_dump_queues_cb *cb, void *aux)
1971 struct netdev_dev_linux *netdev_dev =
1972 netdev_dev_linux_cast(netdev_get_dev(netdev));
1973 struct tc_queue *queue;
1974 struct shash details;
1978 error = tc_query_qdisc(netdev);
1981 } else if (!netdev_dev->tc->ops->class_get) {
1986 shash_init(&details);
1987 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
1988 shash_clear(&details);
1990 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
1992 (*cb)(queue->queue_id, &details, aux);
1997 shash_destroy(&details);
/* Reports statistics for the queues of 'netdev' through callback 'cb', which
 * is handed 'aux'.
 *
 * After tc_query_qdisc(), the function requires the current qdisc type to
 * provide 'class_dump_stats'.  It then starts a traffic-class dump with
 * start_queue_dump() and passes every dumped message to 'class_dump_stats'
 * together with 'cb' and 'aux'.  The result is the error from nl_dump_done()
 * if there is one, otherwise 'last_error'.
 * NOTE(review): how 'last_error' is updated inside the loop is on lines not
 * visible here; presumably it keeps the most recent per-class error. */
2003 netdev_linux_dump_queue_stats(const struct netdev *netdev,
2004 netdev_dump_queue_stats_cb *cb, void *aux)
2006 struct netdev_dev_linux *netdev_dev =
2007 netdev_dev_linux_cast(netdev_get_dev(netdev));
2008 struct nl_dump dump;
2013 error = tc_query_qdisc(netdev);
2016 } else if (!netdev_dev->tc->ops->class_dump_stats) {
2021 if (!start_queue_dump(netdev, &dump)) {
2024 while (nl_dump_next(&dump, &msg)) {
2025 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
/* A failure of the dump itself takes precedence over per-class errors. */
2031 error = nl_dump_done(&dump);
2032 return error ? error : last_error;
/* Stores the IPv4 address and netmask of 'netdev_' into '*address' and
 * '*netmask'.
 *
 * Both values are cached in the device.  While VALID_IN4 is not set in
 * 'cache_valid', they are fetched with the SIOCGIFADDR and SIOCGIFNETMASK
 * requests and the flag is then set.
 *
 * Returns EADDRNOTAVAIL if the resulting address is INADDR_ANY, otherwise 0.
 * NOTE(review): the handling of a failing netdev_linux_get_ipv4() call is on
 * lines not visible here. */
2036 netdev_linux_get_in4(const struct netdev *netdev_,
2037 struct in_addr *address, struct in_addr *netmask)
2039 struct netdev_dev_linux *netdev_dev =
2040 netdev_dev_linux_cast(netdev_get_dev(netdev_));
/* Refresh the cache only when it is not known to be valid. */
2042 if (!(netdev_dev->cache_valid & VALID_IN4)) {
2045 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
2046 SIOCGIFADDR, "SIOCGIFADDR");
2051 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
2052 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2057 netdev_dev->cache_valid |= VALID_IN4;
2059 *address = netdev_dev->address;
2060 *netmask = netdev_dev->netmask;
/* An all-zeros address is reported as "no address assigned". */
2061 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
/* Assigns IPv4 'address' and 'netmask' to 'netdev_'.
 *
 * The address is set first with SIOCSIFADDR.  The cached address and netmask
 * are then updated and VALID_IN4 is set.  The netmask is pushed to the kernel
 * with SIOCSIFNETMASK only when 'address' is not INADDR_ANY.
 * NOTE(review): whether the cache update is conditional on the first
 * do_set_addr() succeeding is on a line not visible here. */
2065 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2066 struct in_addr netmask)
2068 struct netdev_dev_linux *netdev_dev =
2069 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2072 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2074 netdev_dev->cache_valid |= VALID_IN4;
2075 netdev_dev->address = address;
2076 netdev_dev->netmask = netmask;
/* No netmask is configured for the all-zeros address. */
2077 if (address.s_addr != INADDR_ANY) {
2078 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2079 "SIOCSIFNETMASK", netmask);
/* Parses 'line', one line of /proc/net/if_inet6.
 *
 * The line must consist of 32 hexadecimal digits (the 16 address bytes), four
 * hexadecimal fields that are skipped, and an interface name of at most 16
 * characters.  On success the address bytes are stored in 'in6', the name in
 * 'ifname', and true is returned.  If any field is missing or malformed,
 * false is returned and 'in6' and 'ifname' may be partially written. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *octet = in6->s6_addr;
    int n_assigned;

    /* One two-digit hexadecimal byte. */
#define X8 "%2"SCNx8
    n_assigned = sscanf(line,
                        " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                        "%*x %*x %*x %*x %16s\n",
                        &octet[0], &octet[1], &octet[2], &octet[3],
                        &octet[4], &octet[5], &octet[6], &octet[7],
                        &octet[8], &octet[9], &octet[10], &octet[11],
                        &octet[12], &octet[13], &octet[14], &octet[15],
                        ifname);

    /* 16 address bytes plus the interface name. */
    return n_assigned == 17;
}
2101 /* Stores the IPv6 address cached for 'netdev_' into '*in6'. 'in6' is
2102 * dereferenced unconditionally, so it must not be null.
 *
 * While VALID_IN6 is not set in 'cache_valid', the cache is refreshed: it is
 * first reset to in6addr_any, then /proc/net/if_inet6 is scanned line by line
 * and the address from a line whose interface name equals the name of
 * 'netdev_' is stored.  If no line matches, or the file cannot be opened, the
 * cached address stays in6addr_any.
 *
 * NOTE(review): the return statement is not visible here; this comment
 * previously promised a true/false result -- confirm the actual return
 * convention. */
2104 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2106 struct netdev_dev_linux *netdev_dev =
2107 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2108 if (!(netdev_dev->cache_valid & VALID_IN6)) {
2112 netdev_dev->in6 = in6addr_any;
2114 file = fopen("/proc/net/if_inet6", "r");
2116 const char *name = netdev_get_name(netdev_);
2117 while (fgets(line, sizeof line, file)) {
2118 struct in6_addr in6_tmp;
2119 char ifname[16 + 1];
2120 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2121 && !strcmp(name, ifname))
2123 netdev_dev->in6 = in6_tmp;
2129 netdev_dev->cache_valid |= VALID_IN6;
2131 *in6 = netdev_dev->in6;
/* Fills '*sa' with an AF_INET socket address for 'addr' and port 0.  Every
 * byte of '*sa' not occupied by the family and the address is zero. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    /* Start from all-zeros so that the port and padding are 0. */
    memset(&in4, 0, sizeof in4);
    in4.sin_addr = addr;
    in4.sin_family = AF_INET;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &in4, sizeof in4);
}
2149 do_set_addr(struct netdev *netdev,
2150 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2153 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2154 make_in4_sockaddr(&ifr.ifr_addr, addr);
2156 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2160 /* Adds 'router' as a default IP gateway. */
2162 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2164 struct in_addr any = { INADDR_ANY };
2168 memset(&rt, 0, sizeof rt);
2169 make_in4_sockaddr(&rt.rt_dst, any);
2170 make_in4_sockaddr(&rt.rt_gateway, router);
2171 make_in4_sockaddr(&rt.rt_genmask, any);
2172 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2173 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2175 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
/* Looks up 'host' in the kernel routing table as exposed by /proc/net/route.
 *
 * '*netdev_name' is set to NULL on entry.  Each line read is parsed into 11
 * fields; lines that fail to parse are logged and routes without RTF_UP are
 * skipped.  For the first route whose masked destination equals the masked
 * 'host', '*next_hop' is set to 0 if the host is directly reachable and to the
 * route's gateway otherwise.  '*netdev_name' then receives a copy of the
 * interface name made with xstrdup().
 *
 * NOTE(review): the return values and the closing of 'stream' are on lines
 * not visible here. */
2181 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2184 static const char fn[] = "/proc/net/route";
2189 *netdev_name = NULL;
2190 stream = fopen(fn, "r");
2191 if (stream == NULL) {
2192 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2197 while (fgets(line, sizeof line, stream)) {
2200 ovs_be32 dest, gateway, mask;
2201 int refcnt, metric, mtu;
2202 unsigned int flags, use, window, irtt;
2205 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2207 iface, &dest, &gateway, &flags, &refcnt,
2208 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2210 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2214 if (!(flags & RTF_UP)) {
2215 /* Skip routes that aren't up. */
2219 /* The values of 'dest', 'mask', and 'gateway' were given in
2220 * network byte order, so we don't need any endian
2221 * conversions here. */
2222 if ((dest & mask) == (host->s_addr & mask)) {
2224 /* The host is directly reachable. */
2225 next_hop->s_addr = 0;
2227 /* To reach the host, we must go through a gateway. */
2228 next_hop->s_addr = gateway;
2230 *netdev_name = xstrdup(iface);
2242 netdev_linux_get_status(const struct netdev *netdev, struct shash *sh)
2244 struct ethtool_drvinfo drvinfo;
2247 memset(&drvinfo, 0, sizeof drvinfo);
2248 error = netdev_linux_do_ethtool(netdev_get_name(netdev),
2249 (struct ethtool_cmd *)&drvinfo,
2251 "ETHTOOL_GDRVINFO");
2253 shash_add(sh, "driver_name", xstrdup(drvinfo.driver));
2254 shash_add(sh, "driver_version", xstrdup(drvinfo.version));
2255 shash_add(sh, "firmware_version", xstrdup(drvinfo.fw_version));
2261 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2262 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2263 * returns 0. Otherwise, it returns a positive errno value; in particular,
2264 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'.
 *
 * The lookup is a SIOCGARP ioctl on 'af_inet_sock'.  Failures other than
 * ENXIO are logged (rate-limited); ENXIO is not logged. */
2266 netdev_linux_arp_lookup(const struct netdev *netdev,
2267 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2270 struct sockaddr_in sin;
/* Build the request: protocol address 'ip', Ethernet hardware type, and the
 * name of the device to query. */
2273 memset(&r, 0, sizeof r);
2274 memset(&sin, 0, sizeof sin);
2275 sin.sin_family = AF_INET;
2276 sin.sin_addr.s_addr = ip;
2278 memcpy(&r.arp_pa, &sin, sizeof sin);
2279 r.arp_ha.sa_family = ARPHRD_ETHER;
2281 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2282 COVERAGE_INC(netdev_arp_lookup);
/* errno is captured in the same expression as the failing ioctl(). */
2283 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2285 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2286 } else if (retval != ENXIO) {
2287 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2288 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
2294 nd_to_iff_flags(enum netdev_flags nd)
2297 if (nd & NETDEV_UP) {
2300 if (nd & NETDEV_PROMISC) {
2307 iff_to_nd_flags(int iff)
2309 enum netdev_flags nd = 0;
2313 if (iff & IFF_PROMISC) {
2314 nd |= NETDEV_PROMISC;
2320 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2321 enum netdev_flags on, enum netdev_flags *old_flagsp)
2323 int old_flags, new_flags;
2326 error = get_flags(netdev, &old_flags);
2328 *old_flagsp = iff_to_nd_flags(old_flags);
2329 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2330 if (new_flags != old_flags) {
2331 error = set_flags(netdev, new_flags);
2338 netdev_linux_change_seq(const struct netdev *netdev)
2340 return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
2343 #define NETDEV_LINUX_CLASS(NAME, CREATE, ENUMERATE, GET_STATS, SET_STATS) \
2347 netdev_linux_init, \
2349 netdev_linux_wait, \
2352 netdev_linux_destroy, \
2353 NULL, /* get_config */ \
2354 NULL, /* set_config */ \
2356 netdev_linux_open, \
2357 netdev_linux_close, \
2361 netdev_linux_listen, \
2362 netdev_linux_recv, \
2363 netdev_linux_recv_wait, \
2364 netdev_linux_drain, \
2366 netdev_linux_send, \
2367 netdev_linux_send_wait, \
2369 netdev_linux_set_etheraddr, \
2370 netdev_linux_get_etheraddr, \
2371 netdev_linux_get_mtu, \
2372 netdev_linux_set_mtu, \
2373 netdev_linux_get_ifindex, \
2374 netdev_linux_get_carrier, \
2375 netdev_linux_set_miimon_interval, \
2379 netdev_linux_get_features, \
2380 netdev_linux_set_advertisements, \
2381 netdev_linux_get_vlan_vid, \
2383 netdev_linux_set_policing, \
2384 netdev_linux_get_qos_types, \
2385 netdev_linux_get_qos_capabilities, \
2386 netdev_linux_get_qos, \
2387 netdev_linux_set_qos, \
2388 netdev_linux_get_queue, \
2389 netdev_linux_set_queue, \
2390 netdev_linux_delete_queue, \
2391 netdev_linux_get_queue_stats, \
2392 netdev_linux_dump_queues, \
2393 netdev_linux_dump_queue_stats, \
2395 netdev_linux_get_in4, \
2396 netdev_linux_set_in4, \
2397 netdev_linux_get_in6, \
2398 netdev_linux_add_router, \
2399 netdev_linux_get_next_hop, \
2400 netdev_linux_get_status, \
2401 netdev_linux_arp_lookup, \
2403 netdev_linux_update_flags, \
2405 netdev_linux_change_seq \
2408 const struct netdev_class netdev_linux_class =
2411 netdev_linux_create,
2412 netdev_linux_enumerate,
2413 netdev_linux_get_stats,
2414 NULL); /* set_stats */
2416 const struct netdev_class netdev_tap_class =
2419 netdev_linux_create_tap,
2420 NULL, /* enumerate */
2421 netdev_pseudo_get_stats,
2422 NULL); /* set_stats */
2424 const struct netdev_class netdev_internal_class =
2427 netdev_linux_create,
2428 NULL, /* enumerate */
2429 netdev_pseudo_get_stats,
2430 netdev_vport_set_stats);
2432 /* HTB traffic control class. */
2434 #define HTB_N_QUEUES 0xf000
2438 unsigned int max_rate; /* In bytes/s. */
2442 struct tc_queue tc_queue;
2443 unsigned int min_rate; /* In bytes/s. */
2444 unsigned int max_rate; /* In bytes/s. */
2445 unsigned int burst; /* In bytes. */
2446 unsigned int priority; /* Lower values are higher priorities. */
2450 htb_get__(const struct netdev *netdev)
2452 struct netdev_dev_linux *netdev_dev =
2453 netdev_dev_linux_cast(netdev_get_dev(netdev));
2454 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
2458 htb_install__(struct netdev *netdev, uint64_t max_rate)
2460 struct netdev_dev_linux *netdev_dev =
2461 netdev_dev_linux_cast(netdev_get_dev(netdev));
2464 htb = xmalloc(sizeof *htb);
2465 tc_init(&htb->tc, &tc_ops_htb);
2466 htb->max_rate = max_rate;
2468 netdev_dev->tc = &htb->tc;
2471 /* Create an HTB qdisc.
2473 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
/* Any qdisc already present is removed first with tc_del_qdisc(); its result
 * is not checked.  The request is an RTM_NEWQDISC with NLM_F_EXCL |
 * NLM_F_CREATE, handle 1:0, parent TC_H_ROOT and kind "htb".  The HTB global
 * options are sent as TCA_HTB_INIT nested inside TCA_OPTIONS.  Returns the
 * result of tc_transact().
 * NOTE(review): the handling of a NULL return from tc_make_request() is on
 * lines not visible here. */
2475 htb_setup_qdisc__(struct netdev *netdev)
2478 struct tc_htb_glob opt;
2479 struct ofpbuf request;
2480 struct tcmsg *tcmsg;
2482 tc_del_qdisc(netdev);
2484 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2485 NLM_F_EXCL | NLM_F_CREATE, &request);
2489 tcmsg->tcm_handle = tc_make_handle(1, 0);
2490 tcmsg->tcm_parent = TC_H_ROOT;
2492 nl_msg_put_string(&request, TCA_KIND, "htb");
2494 memset(&opt, 0, sizeof opt);
2495 opt.rate2quantum = 10;
2499 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2500 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2501 nl_msg_end_nested(&request, opt_offset);
2503 return tc_transact(&request, NULL);
2506 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2507 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2509 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2510 unsigned int parent, struct htb_class *class)
2513 struct tc_htb_opt opt;
2514 struct ofpbuf request;
2515 struct tcmsg *tcmsg;
2519 error = netdev_get_mtu(netdev, &mtu);
2521 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2522 netdev_get_name(netdev));
2526 memset(&opt, 0, sizeof opt);
2527 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2528 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2529 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2530 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2531 opt.prio = class->priority;
2533 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2537 tcmsg->tcm_handle = handle;
2538 tcmsg->tcm_parent = parent;
2540 nl_msg_put_string(&request, TCA_KIND, "htb");
2541 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2542 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2543 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2544 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2545 nl_msg_end_nested(&request, opt_offset);
2547 error = tc_transact(&request, NULL);
2549 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2550 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2551 netdev_get_name(netdev),
2552 tc_get_major(handle), tc_get_minor(handle),
2553 tc_get_major(parent), tc_get_minor(parent),
2554 class->min_rate, class->max_rate,
2555 class->burst, class->priority, strerror(error));
2560 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2561 * description of them into 'details'. The description complies with the
2562 * specification given in the vswitch database documentation for linux-htb
/* Decodes the HTB class options nested in 'nl_options' into 'class'.
 *
 * The policy requires a TCA_HTB_PARMS attribute at least as large as struct
 * tc_htb_opt; if nl_parse_nested() rejects the attributes, a rate-limited
 * warning is logged.  Otherwise the rate, ceiling, burst and priority are
 * copied out of the kernel's struct.
 *
 * NOTE(review): this listing has gaps in its line numbering, so the return
 * type and the return statements are not visible here -- confirm them against
 * the complete file. */
2565 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2567 static const struct nl_policy tca_htb_policy[] = {
2568 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2569 .min_len = sizeof(struct tc_htb_opt) },
2572 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2573 const struct tc_htb_opt *htb;
2575 if (!nl_parse_nested(nl_options, tca_htb_policy,
2576 attrs, ARRAY_SIZE(tca_htb_policy))) {
2577 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2581 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2582 class->min_rate = htb->rate.rate;
2583 class->max_rate = htb->ceil.rate;
/* 'buffer' is handed to tc_ticks_to_bytes() as a tick count, evaluated at the
 * guaranteed rate, to recover the burst in bytes. */
2584 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2585 class->priority = htb->prio;
/* Parses the class message 'tcmsg' with tc_parse_class(), which also fills in
 * 'stats'.
 *
 * If that succeeds and 'queue_id' is nonnull, only handles 1:1 through
 * 1:HTB_N_QUEUES are accepted, and '*queue_id' receives the minor number
 * minus one.  If there is still no error and 'options' is nonnull, the HTB
 * options are decoded into '*options'.
 *
 * NOTE(review): the branch taken for a handle outside that range, and the
 * return statement, are not visible in this listing. */
2590 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2591 struct htb_class *options,
2592 struct netdev_queue_stats *stats)
2594 struct nlattr *nl_options;
2595 unsigned int handle;
2598 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2599 if (!error && queue_id) {
2600 unsigned int major = tc_get_major(handle);
2601 unsigned int minor = tc_get_minor(handle);
2602 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2603 *queue_id = minor - 1;
2608 if (!error && options) {
2609 error = htb_parse_tca_options__(nl_options, options);
/* Fills 'hc' with the qdisc-wide rates described by 'details'.
 *
 * "max-rate" is read as a decimal number and divided by 8 (the getters in
 * this file multiply by 8 when reporting, so the stored unit is presumably
 * bytes per second).  When the key is absent or the result is zero, the rate
 * is derived from the device's current link features instead.  The minimum
 * rate is set equal to the maximum rate.
 *
 * NOTE(review): the second argument on the netdev_get_features() line looks
 * like an encoding-mangled "&current" (the declaration of that local is not
 * visible in this listing) -- confirm against the complete file. */
2615 htb_parse_qdisc_details__(struct netdev *netdev,
2616 const struct shash *details, struct htb_class *hc)
2618 const char *max_rate_s;
2620 max_rate_s = shash_find_data(details, "max-rate");
2621 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2622 if (!hc->max_rate) {
2625 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2626 hc->max_rate = netdev_features_to_bps(current) / 8;
2628 hc->min_rate = hc->max_rate;
/* Fills 'hc' with the per-queue settings "min-rate", "max-rate", "burst" and
 * "priority" found in 'details'.
 *
 * Rates and burst are parsed as decimal numbers and divided by 8.  The
 * clamps are applied in this order:
 *
 *   - min_rate: raised to at least the MTU, then capped at the qdisc's
 *     max_rate.
 *   - max_rate: raised to at least min_rate, then capped at the qdisc's
 *     max_rate.
 *   - burst: raised to at least MTU + 64.
 *
 * A rate-limited warning is logged when the MTU cannot be obtained.
 *
 * NOTE(review): the value used when "max-rate" is absent, the declarations
 * of 'mtu' and 'error', and the return statements are not visible in this
 * listing.  None of the strtoull()/strtoul() calls passes an end pointer, so
 * malformed numbers are not detected on the visible lines. */
2634 htb_parse_class_details__(struct netdev *netdev,
2635 const struct shash *details, struct htb_class *hc)
2637 const struct htb *htb = htb_get__(netdev);
2638 const char *min_rate_s = shash_find_data(details, "min-rate");
2639 const char *max_rate_s = shash_find_data(details, "max-rate");
2640 const char *burst_s = shash_find_data(details, "burst");
2641 const char *priority_s = shash_find_data(details, "priority");
2644 error = netdev_get_mtu(netdev, &mtu);
2646 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2647 netdev_get_name(netdev));
2651 /* HTB requires at least an mtu sized min-rate to send any traffic even
2652 * on uncongested links. */
2653 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2654 hc->min_rate = MAX(hc->min_rate, mtu);
2655 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2658 hc->max_rate = (max_rate_s
2659 ? strtoull(max_rate_s, NULL, 10) / 8
2661 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2662 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2666 * According to hints in the documentation that I've read, it is important
2667 * that 'burst' be at least as big as the largest frame that might be
2668 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2669 * but having it a bit too small is a problem. Since netdev_get_mtu()
2670 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2671 * the MTU. We actually add 64, instead of 14, as a guard against
2672 * additional headers get tacked on somewhere that we're not aware of. */
2673 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2674 hc->burst = MAX(hc->burst, mtu + 64);
2677 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Asks the kernel about class 'handle' under 'parent' with tc_query_class(),
 * decodes the reply into 'options' and 'stats' with htb_parse_tcmsg__() (no
 * queue ID is requested), and then frees the reply buffer.
 *
 * NOTE(review): whatever guards the parse on a successful query, and the
 * return statement, are not visible in this listing. */
2683 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2684 unsigned int parent, struct htb_class *options,
2685 struct netdev_queue_stats *stats)
2687 struct ofpbuf *reply;
2690 error = tc_query_class(netdev, handle, parent, &reply);
2692 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2693 ofpbuf_delete(reply);
/* Makes HTB the qdisc on 'netdev': creates the qdisc, derives the default
 * class parameters from 'details', creates class 1:fffe under 1:0 with them,
 * and records the resulting maximum rate through htb_install__().
 *
 * NOTE(review): the error checks between these steps and the return
 * statement are not visible in this listing. */
2699 htb_tc_install(struct netdev *netdev, const struct shash *details)
2703 error = htb_setup_qdisc__(netdev);
2705 struct htb_class hc;
2707 htb_parse_qdisc_details__(netdev, details, &hc);
2708 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2709 tc_make_handle(1, 0), &hc);
2711 htb_install__(netdev, hc.max_rate);
/* Returns the struct htb_class that embeds 'queue' as its 'tc_queue'
 * member. */
2717 static struct htb_class *
2718 htb_class_cast__(const struct tc_queue *queue)
2720 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Copies the parameters in 'hc' into the in-memory record for 'queue_id'.
 *
 * The queue is looked up with tc_find_queue__() using hash_int(queue_id, 0).
 * The visible code also allocates a fresh htb_class and inserts it into
 * htb->tc.queues under the same hash -- presumably only when the lookup finds
 * nothing (the if/else lines are not visible in this listing). */
2724 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2725 const struct htb_class *hc)
2727 struct htb *htb = htb_get__(netdev);
2728 size_t hash = hash_int(queue_id, 0);
2729 struct tc_queue *queue;
2730 struct htb_class *hcp;
2732 queue = tc_find_queue__(netdev, queue_id, hash);
2734 hcp = htb_class_cast__(queue);
2736 hcp = xmalloc(sizeof *hcp);
2737 queue = &hcp->tc_queue;
2738 queue->queue_id = queue_id;
2739 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2742 hcp->min_rate = hc->min_rate;
2743 hcp->max_rate = hc->max_rate;
2744 hcp->burst = hc->burst;
2745 hcp->priority = hc->priority;
/* Rebuilds the in-memory HTB state for 'netdev' from the kernel.
 *
 * Class 1:fffe is queried for the qdisc-wide maximum rate, which is passed to
 * htb_install__().  The class dump is then walked, and every message that
 * parses as an HTB queue is recorded with htb_update_queue__().  'nlmsg' is
 * unused.
 *
 * NOTE(review): the return value of htb_query_class__() is not captured on
 * the visible line; how 'hc.max_rate' is initialised beforehand is not
 * visible in this listing. */
2749 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2752 struct nl_dump dump;
2753 struct htb_class hc;
2755 /* Get qdisc options. */
2757 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2758 htb_install__(netdev, hc.max_rate);
2761 if (!start_queue_dump(netdev, &dump)) {
2764 while (nl_dump_next(&dump, &msg)) {
2765 unsigned int queue_id;
2767 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2768 htb_update_queue__(netdev, queue_id, &hc);
2771 nl_dump_done(&dump);
/* Tears down the HTB state that embeds 'tc': every class is unlinked from
 * htb->tc.queues.  The _SAFE iterator is used because entries are removed
 * while iterating.
 *
 * NOTE(review): the statements that release the classes and the htb object
 * itself are not visible in this listing. */
2777 htb_tc_destroy(struct tc *tc)
2779 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2780 struct htb_class *hc, *next;
2782 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2783 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Adds a "max-rate" entry to 'details': the qdisc's stored maximum rate
 * multiplied by 8, formatted as a decimal string allocated by xasprintf(). */
2791 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2793 const struct htb *htb = htb_get__(netdev);
2794 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
/* Reconfigures the qdisc-wide limits: the rates parsed from 'details' are
 * programmed into class 1:fffe under 1:0, and the new maximum rate is stored
 * in the HTB state.
 *
 * NOTE(review): whether that store is conditional on success, and the return
 * statement, are not visible in this listing. */
2799 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2801 struct htb_class hc;
2804 htb_parse_qdisc_details__(netdev, details, &hc);
2805 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2806 tc_make_handle(1, 0), &hc);
2808 htb_get__(netdev)->max_rate = hc.max_rate;
/* Describes 'queue' in 'details' using freshly allocated strings.
 *
 * "min-rate" is always added; "max-rate" is added only when it differs from
 * the minimum.  Rates and burst are multiplied by 8 before being formatted.
 * 'netdev' is unused.
 *
 * NOTE(review): this listing has gaps in its line numbering, so a guard
 * around the "burst" or "priority" entries may exist without being visible
 * here. */
2814 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2815 const struct tc_queue *queue, struct shash *details)
2817 const struct htb_class *hc = htb_class_cast__(queue);
2819 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2820 if (hc->min_rate != hc->max_rate) {
2821 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2823 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2825 shash_add(details, "priority", xasprintf("%u", hc->priority));
/* Creates or reconfigures queue 'queue_id': parses 'details', programs kernel
 * class 1:(queue_id + 1) under parent 1:fffe, and mirrors the settings into
 * the local queue table.
 *
 * NOTE(review): the error checks between the steps and the return statement
 * are not visible in this listing. */
2831 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2832 const struct shash *details)
2834 struct htb_class hc;
2837 error = htb_parse_class_details__(netdev, details, &hc);
2842 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2843 tc_make_handle(1, 0xfffe), &hc);
2848 htb_update_queue__(netdev, queue_id, &hc);
/* Deletes kernel class 1:(queue_id + 1) for 'queue' and unlinks the queue
 * from htb->tc.queues.
 *
 * NOTE(review): the condition guarding the unlink, the release of the class
 * memory and the return statement are not visible in this listing. */
2853 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2855 struct htb_class *hc = htb_class_cast__(queue);
2856 struct htb *htb = htb_get__(netdev);
2859 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2861 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches statistics for 'queue' into 'stats' by querying kernel class
 * 1:(queue_id + 1) with parent 1:fffe.  Class options are not requested
 * (NULL is passed for them).  Returns whatever htb_query_class__() returns. */
2868 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2869 struct netdev_queue_stats *stats)
2871 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2872 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one class message from a statistics dump: the handle and stats are
 * parsed out of 'nlmsg', and for handles 1:1 through 1:HTB_N_QUEUES the
 * callback 'cb' is invoked with queue ID (minor - 1), the stats and 'aux'.
 * 'netdev' is unused.
 *
 * NOTE(review): the handling of a parse error and the return statements are
 * not visible in this listing. */
2876 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2877 const struct ofpbuf *nlmsg,
2878 netdev_dump_queue_stats_cb *cb, void *aux)
2880 struct netdev_queue_stats stats;
2881 unsigned int handle, major, minor;
2884 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2889 major = tc_get_major(handle);
2890 minor = tc_get_minor(handle);
2891 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2892 (*cb)(minor - 1, &stats, aux);
/* Operations table for the HTB qdisc, tying the kernel name "htb" to the OVS
 * QoS type "linux-htb".  The initializers are positional, so their order
 * must match the member order of struct tc_ops.
 *
 * NOTE(review): the entries between n_queues and htb_class_get_stats are not
 * visible in this listing. */
2897 static const struct tc_ops tc_ops_htb = {
2898 "htb", /* linux_name */
2899 "linux-htb", /* ovs_name */
2900 HTB_N_QUEUES, /* n_queues */
2909 htb_class_get_stats,
2910 htb_class_dump_stats
2913 /* "linux-hfsc" traffic control class. */
/* Number of HFSC queues advertised in tc_ops_hfsc.  It is also the largest
 * class minor number that the HFSC parsers in this file accept as a queue
 * (they test "minor <= HFSC_N_QUEUES" and map it to queue ID minor - 1). */
2915 #define HFSC_N_QUEUES 0xf000
2923 struct tc_queue tc_queue;
/* Returns the struct hfsc that embeds the 'tc' currently attached to
 * 'netdev''s underlying Linux device.
 *
 * NOTE(review): nothing visible here checks that the attached tc really is an
 * HFSC instance; callers presumably guarantee it. */
2928 static struct hfsc *
2929 hfsc_get__(const struct netdev *netdev)
2931 struct netdev_dev_linux *netdev_dev;
2932 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2933 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
/* Returns the struct hfsc_class that embeds 'queue' as its 'tc_queue'
 * member. */
2936 static struct hfsc_class *
2937 hfsc_class_cast__(const struct tc_queue *queue)
2939 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates a new struct hfsc, initialises its embedded tc with tc_ops_hfsc,
 * records 'max_rate' in it, and attaches it to 'netdev''s underlying device.
 *
 * NOTE(review): the visible lines overwrite netdev_dev->tc without releasing
 * a previous value; callers presumably ensure none is attached. */
2943 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2945 struct netdev_dev_linux * netdev_dev;
2948 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2949 hfsc = xmalloc(sizeof *hfsc);
2950 tc_init(&hfsc->tc, &tc_ops_hfsc);
2951 hfsc->max_rate = max_rate;
2952 netdev_dev->tc = &hfsc->tc;
/* Copies the rates in 'hc' into the in-memory record for 'queue_id'.
 *
 * The queue is looked up with tc_find_queue__() using hash_int(queue_id, 0).
 * The visible code also allocates a fresh hfsc_class and inserts it into
 * hfsc->tc.queues under the same hash -- presumably only when the lookup
 * finds nothing (the if/else lines are not visible in this listing). */
2956 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2957 const struct hfsc_class *hc)
2961 struct hfsc_class *hcp;
2962 struct tc_queue *queue;
2964 hfsc = hfsc_get__(netdev);
2965 hash = hash_int(queue_id, 0);
2967 queue = tc_find_queue__(netdev, queue_id, hash);
2969 hcp = hfsc_class_cast__(queue);
2971 hcp = xmalloc(sizeof *hcp);
2972 queue = &hcp->tc_queue;
2973 queue->queue_id = queue_id;
2974 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
2977 hcp->min_rate = hc->min_rate;
2978 hcp->max_rate = hc->max_rate;
/* Decodes the HFSC class options nested in 'nl_options' into 'class'.
 *
 * Three service curves (RSC, FSC, USC) are read, each required to be at
 * least sizeof(struct tc_service_curve).  The class is accepted only when:
 *
 *   - every curve has m1 == 0 and d == 0;
 *   - the RSC and FSC curves have the same m2;
 *   - the RSC m2 does not exceed the USC m2.
 *
 * Each violation is logged with a rate-limited warning.  On acceptance,
 * min_rate is taken from the FSC m2 and max_rate from the USC m2.
 *
 * NOTE(review): the attribute indexes of the policy entries and the return
 * statements are not visible in this listing. */
2982 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2984 const struct tc_service_curve *rsc, *fsc, *usc;
2985 static const struct nl_policy tca_hfsc_policy[] = {
2987 .type = NL_A_UNSPEC,
2989 .min_len = sizeof(struct tc_service_curve),
2992 .type = NL_A_UNSPEC,
2994 .min_len = sizeof(struct tc_service_curve),
2997 .type = NL_A_UNSPEC,
2999 .min_len = sizeof(struct tc_service_curve),
3002 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3004 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3005 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3006 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3010 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3011 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3012 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3014 if (rsc->m1 != 0 || rsc->d != 0 ||
3015 fsc->m1 != 0 || fsc->d != 0 ||
3016 usc->m1 != 0 || usc->d != 0) {
3017 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3018 "Non-linear service curves are not supported.");
3022 if (rsc->m2 != fsc->m2) {
3023 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3024 "Real-time service curves are not supported ");
3028 if (rsc->m2 > usc->m2) {
3029 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3030 "Min-rate service curve is greater than "
3031 "the max-rate service curve.");
3035 class->min_rate = fsc->m2;
3036 class->max_rate = usc->m2;
/* Parses the class message 'tcmsg' with tc_parse_class(), which also fills in
 * 'stats'.  Handles 1:1 through 1:HFSC_N_QUEUES are mapped to queue ID
 * (minor - 1) in '*queue_id', and the HFSC options are decoded into
 * '*options'.
 *
 * NOTE(review): the conditions guarding the queue-ID and option steps (this
 * function is called elsewhere in the file with a null 'queue_id' and with
 * null 'options'), the out-of-range branch and the return statement are not
 * visible in this listing. */
3041 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3042 struct hfsc_class *options,
3043 struct netdev_queue_stats *stats)
3046 unsigned int handle;
3047 struct nlattr *nl_options;
3049 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3055 unsigned int major, minor;
3057 major = tc_get_major(handle);
3058 minor = tc_get_minor(handle);
3059 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3060 *queue_id = minor - 1;
3067 error = hfsc_parse_tca_options__(nl_options, options);
/* Asks the kernel about class 'handle' under 'parent' with tc_query_class(),
 * decodes the reply into 'options' and 'stats' with hfsc_parse_tcmsg__() (no
 * queue ID is requested), and then frees the reply buffer.
 *
 * NOTE(review): the check on the query result and the return statement are
 * not visible in this listing. */
3074 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3075 unsigned int parent, struct hfsc_class *options,
3076 struct netdev_queue_stats *stats)
3079 struct ofpbuf *reply;
3081 error = tc_query_class(netdev, handle, parent, &reply);
3086 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3087 ofpbuf_delete(reply);
/* Fills 'class' with the qdisc-wide rates described by 'details'.
 *
 * "max-rate" is read as a decimal number and divided by 8.  The visible code
 * can also derive the rate from the device's current link features --
 * presumably as the fallback when no usable "max-rate" was given (the guard
 * is not visible in this listing).  Both min_rate and max_rate of 'class'
 * are set to the resulting value.
 *
 * NOTE(review): the second argument on the netdev_get_features() line looks
 * like an encoding-mangled "&current" -- confirm against the complete
 * file. */
3092 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
3093 struct hfsc_class *class)
3096 const char *max_rate_s;
3098 max_rate_s = shash_find_data(details, "max-rate");
3099 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3104 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
3105 max_rate = netdev_features_to_bps(current) / 8;
3108 class->min_rate = max_rate;
3109 class->max_rate = max_rate;
/* Fills 'class' with the per-queue "min-rate" and "max-rate" from 'details'.
 *
 * Both are parsed as decimal numbers and divided by 8.  min_rate is raised
 * to at least 1 and then capped at the qdisc's max_rate; max_rate is raised
 * to at least min_rate and then capped at the qdisc's max_rate.
 *
 * NOTE(review): the value used when "max-rate" is absent and the return
 * statement are not visible in this listing.  The strtoull() results are
 * narrowed to uint32_t without a range check on the visible lines. */
3113 hfsc_parse_class_details__(struct netdev *netdev,
3114 const struct shash *details,
3115 struct hfsc_class * class)
3117 const struct hfsc *hfsc;
3118 uint32_t min_rate, max_rate;
3119 const char *min_rate_s, *max_rate_s;
3121 hfsc = hfsc_get__(netdev);
3122 min_rate_s = shash_find_data(details, "min-rate");
3123 max_rate_s = shash_find_data(details, "max-rate");
3125 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3126 min_rate = MAX(min_rate, 1);
3127 min_rate = MIN(min_rate, hfsc->max_rate);
3129 max_rate = (max_rate_s
3130 ? strtoull(max_rate_s, NULL, 10) / 8
3132 max_rate = MAX(max_rate, min_rate);
3133 max_rate = MIN(max_rate, hfsc->max_rate);
3135 class->min_rate = min_rate;
3136 class->max_rate = max_rate;
3141 /* Create an HFSC qdisc.
3143 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
/* Removes any existing root qdisc with tc_del_qdisc(), then sends an
 * RTM_NEWQDISC request (NLM_F_EXCL | NLM_F_CREATE) for handle 1:0 at
 * TC_H_ROOT, of kind "hfsc", carrying a struct tc_hfsc_qopt as TCA_OPTIONS.
 * Returns the result of tc_transact().
 *
 * NOTE(review): the result of tc_del_qdisc() is not captured on the visible
 * line; the check on tc_make_request() and any fields set in 'opt' after the
 * memset() are not visible in this listing. */
3145 hfsc_setup_qdisc__(struct netdev * netdev)
3147 struct tcmsg *tcmsg;
3148 struct ofpbuf request;
3149 struct tc_hfsc_qopt opt;
3151 tc_del_qdisc(netdev);
3153 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3154 NLM_F_EXCL | NLM_F_CREATE, &request);
3160 tcmsg->tcm_handle = tc_make_handle(1, 0);
3161 tcmsg->tcm_parent = TC_H_ROOT;
3163 memset(&opt, 0, sizeof opt);
3166 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3167 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3169 return tc_transact(&request, NULL);
3172 /* Create an HFSC class.
3174 * Equivalent to "tc class add dev <dev> parent <parent> classid <handle> hfsc
3175 * sc rate <min_rate> ul rate <max_rate>" */
/* Sends an RTM_NEWTCLASS request (NLM_F_CREATE) for class 'handle' under
 * 'parent' on 'netdev'.
 *
 * The TCA_HFSC_RSC and TCA_HFSC_FSC attributes both carry the 'min' curve
 * (m2 = class->min_rate); TCA_HFSC_USC carries the 'max' curve
 * (m2 = class->max_rate).  A failed transaction is reported with a
 * rate-limited warning that includes the handles and both rates.
 *
 * NOTE(review): the assignments to the other curve fields, the check on
 * tc_make_request() and the return statement are not visible in this
 * listing. */
3177 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3178 unsigned int parent, struct hfsc_class *class)
3182 struct tcmsg *tcmsg;
3183 struct ofpbuf request;
3184 struct tc_service_curve min, max;
3186 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3192 tcmsg->tcm_handle = handle;
3193 tcmsg->tcm_parent = parent;
3197 min.m2 = class->min_rate;
3201 max.m2 = class->max_rate;
3203 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3204 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3205 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3206 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3207 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3208 nl_msg_end_nested(&request, opt_offset);
3210 error = tc_transact(&request, NULL);
3212 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3213 "min-rate %ubps, max-rate %ubps (%s)",
3214 netdev_get_name(netdev),
3215 tc_get_major(handle), tc_get_minor(handle),
3216 tc_get_major(parent), tc_get_minor(parent),
3217 class->min_rate, class->max_rate, strerror(error));
/* Makes HFSC the qdisc on 'netdev': creates the qdisc, derives the default
 * class parameters from 'details', creates class 1:fffe under 1:0 with them,
 * and records the resulting maximum rate through hfsc_install__().
 *
 * NOTE(review): the error checks between these steps and the return
 * statement are not visible in this listing. */
3224 hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3227 struct hfsc_class class;
3229 error = hfsc_setup_qdisc__(netdev);
3235 hfsc_parse_qdisc_details__(netdev, details, &class);
3236 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3237 tc_make_handle(1, 0), &class);
3243 hfsc_install__(netdev, class.max_rate);
/* Rebuilds the in-memory HFSC state for 'netdev' from the kernel.
 *
 * Class 1:fffe is queried for the qdisc-wide maximum rate, which is passed to
 * hfsc_install__().  The class dump is then walked, and every message that
 * parses as an HFSC queue is recorded with hfsc_update_queue__().  'nlmsg'
 * is unused.
 *
 * NOTE(review): the return value of hfsc_query_class__() is not captured on
 * the visible line; how 'hc.max_rate' is initialised beforehand is not
 * visible in this listing. */
3248 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3251 struct nl_dump dump;
3252 struct hfsc_class hc;
3255 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3256 hfsc_install__(netdev, hc.max_rate);
3258 if (!start_queue_dump(netdev, &dump)) {
3262 while (nl_dump_next(&dump, &msg)) {
3263 unsigned int queue_id;
3265 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3266 hfsc_update_queue__(netdev, queue_id, &hc);
3270 nl_dump_done(&dump);
/* Tears down the HFSC state that embeds 'tc': every class is unlinked from
 * hfsc->tc.queues.  The _SAFE iterator is used because entries are removed
 * while iterating.
 *
 * NOTE(review): the statements that release the classes and the hfsc object
 * itself are not visible in this listing. */
3275 hfsc_tc_destroy(struct tc *tc)
3278 struct hfsc_class *hc, *next;
3280 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3282 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3283 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Adds a "max-rate" entry to 'details': the qdisc's stored maximum rate
 * multiplied by 8, formatted as a decimal string allocated by xasprintf(). */
3292 hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3294 const struct hfsc *hfsc;
3295 hfsc = hfsc_get__(netdev);
3296 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
/* Reconfigures the qdisc-wide limits: the rates parsed from 'details' are
 * programmed into class 1:fffe under 1:0, and the new maximum rate is stored
 * in the HFSC state.
 *
 * NOTE(review): whether that store is conditional on success, and the return
 * statement, are not visible in this listing. */
3301 hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3304 struct hfsc_class class;
3306 hfsc_parse_qdisc_details__(netdev, details, &class);
3307 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3308 tc_make_handle(1, 0), &class);
3311 hfsc_get__(netdev)->max_rate = class.max_rate;
/* Describes 'queue' in 'details' using freshly allocated strings: "min-rate"
 * is always added, and "max-rate" only when it differs from the minimum.
 * Both are multiplied by 8 before being formatted.  'netdev' is unused. */
3318 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3319 const struct tc_queue *queue, struct shash *details)
3321 const struct hfsc_class *hc;
3323 hc = hfsc_class_cast__(queue);
3324 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3325 if (hc->min_rate != hc->max_rate) {
3326 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
/* Creates or reconfigures queue 'queue_id': parses 'details', programs kernel
 * class 1:(queue_id + 1) under parent 1:fffe, and mirrors the settings into
 * the local queue table.
 *
 * NOTE(review): the error checks between the steps and the return statement
 * are not visible in this listing. */
3332 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3333 const struct shash *details)
3336 struct hfsc_class class;
3338 error = hfsc_parse_class_details__(netdev, details, &class);
3343 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3344 tc_make_handle(1, 0xfffe), &class);
3349 hfsc_update_queue__(netdev, queue_id, &class);
/* Deletes kernel class 1:(queue_id + 1) for 'queue' and unlinks the queue
 * from hfsc->tc.queues.
 *
 * NOTE(review): the condition guarding the unlink, the release of the class
 * memory and the return statement are not visible in this listing. */
3354 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3358 struct hfsc_class *hc;
3360 hc = hfsc_class_cast__(queue);
3361 hfsc = hfsc_get__(netdev);
3363 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3365 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches statistics for 'queue' into 'stats' by querying kernel class
 * 1:(queue_id + 1) with parent 1:fffe.  Class options are not requested
 * (NULL is passed for them).  Returns whatever hfsc_query_class__()
 * returns. */
3372 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3373 struct netdev_queue_stats *stats)
3375 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3376 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one class message from a statistics dump: the handle and stats are
 * parsed out of 'nlmsg', and for handles 1:1 through 1:HFSC_N_QUEUES the
 * callback 'cb' is invoked with queue ID (minor - 1), the stats and 'aux'.
 * 'netdev' is unused.
 *
 * NOTE(review): the handling of a parse error and the return statements are
 * not visible in this listing. */
3380 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3381 const struct ofpbuf *nlmsg,
3382 netdev_dump_queue_stats_cb *cb, void *aux)
3384 struct netdev_queue_stats stats;
3385 unsigned int handle, major, minor;
3388 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3393 major = tc_get_major(handle);
3394 minor = tc_get_minor(handle);
3395 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3396 (*cb)(minor - 1, &stats, aux);
/* Operations table for the HFSC qdisc, tying the kernel name "hfsc" to the
 * OVS QoS type "linux-hfsc".  The initializers are positional, so their
 * order must match the member order of struct tc_ops; each entry is labelled
 * with the member it fills. */
3401 static const struct tc_ops tc_ops_hfsc = {
3402 "hfsc", /* linux_name */
3403 "linux-hfsc", /* ovs_name */
3404 HFSC_N_QUEUES, /* n_queues */
3405 hfsc_tc_install, /* tc_install */
3406 hfsc_tc_load, /* tc_load */
3407 hfsc_tc_destroy, /* tc_destroy */
3408 hfsc_qdisc_get, /* qdisc_get */
3409 hfsc_qdisc_set, /* qdisc_set */
3410 hfsc_class_get, /* class_get */
3411 hfsc_class_set, /* class_set */
3412 hfsc_class_delete, /* class_delete */
3413 hfsc_class_get_stats, /* class_get_stats */
3414 hfsc_class_dump_stats /* class_dump_stats */
3417 /* "linux-default" traffic control class.
3419 * This class represents the default, unnamed Linux qdisc. It corresponds to
3420 * the "" (empty string) QoS type in the OVS database. */
/* Attaches the "default" tc object to 'netdev''s underlying device.
 *
 * 'tc' is a function-level static, so the same object is handed to every
 * device that goes through here.  The visible allocation and tc_init() with
 * tc_ops_default are presumably guarded so that they run only once (the guard
 * is not visible in this listing). */
3423 default_install__(struct netdev *netdev)
3425 struct netdev_dev_linux *netdev_dev =
3426 netdev_dev_linux_cast(netdev_get_dev(netdev));
3427 static struct tc *tc;
3430 tc = xmalloc(sizeof *tc);
3431 tc_init(tc, &tc_ops_default);
3433 netdev_dev->tc = tc;
/* tc_install hook for the default qdisc: ignores 'details' and attaches the
 * shared default tc object to 'netdev'.
 *
 * NOTE(review): the return statement is not visible in this listing. */
3437 default_tc_install(struct netdev *netdev,
3438 const struct shash *details OVS_UNUSED)
3440 default_install__(netdev);
/* tc_load hook for the default qdisc: ignores 'nlmsg' and attaches the shared
 * default tc object to 'netdev'.
 *
 * NOTE(review): the return statement is not visible in this listing. */
3445 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3447 default_install__(netdev);
/* Operations table for the default qdisc.  It has no Linux qdisc name, and
 * all hooks from tc_destroy onward are NULL; code elsewhere in this file
 * checks tc_destroy for NULL before calling it.
 *
 * NOTE(review): the ovs_name, n_queues, tc_install and tc_load entries are
 * not visible in this listing. */
3451 static const struct tc_ops tc_ops_default = {
3452 NULL, /* linux_name */
3457 NULL, /* tc_destroy */
3458 NULL, /* qdisc_get */
3459 NULL, /* qdisc_set */
3460 NULL, /* class_get */
3461 NULL, /* class_set */
3462 NULL, /* class_delete */
3463 NULL, /* class_get_stats */
3464 NULL /* class_dump_stats */
3467 /* "linux-other" traffic control class.
/* tc_load hook for qdiscs this file does not otherwise model: ignores 'nlmsg'
 * and attaches a tc object initialised with tc_ops_other to 'netdev''s
 * underlying device.
 *
 * 'tc' is a function-level static, so the object is shared between devices;
 * the visible allocation is presumably guarded so that it happens only once
 * (the guard and the return statement are not visible in this listing). */
3472 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3474 struct netdev_dev_linux *netdev_dev =
3475 netdev_dev_linux_cast(netdev_get_dev(netdev));
3476 static struct tc *tc;
3479 tc = xmalloc(sizeof *tc);
3480 tc_init(tc, &tc_ops_other);
3482 netdev_dev->tc = tc;
/* Operations table for the "linux-other" QoS type.  It has no Linux qdisc
 * name and no tc_install hook, and every hook from tc_destroy onward is
 * NULL.
 *
 * NOTE(review): the n_queues and tc_load entries are not visible in this
 * listing. */
3486 static const struct tc_ops tc_ops_other = {
3487 NULL, /* linux_name */
3488 "linux-other", /* ovs_name */
3490 NULL, /* tc_install */
3492 NULL, /* tc_destroy */
3493 NULL, /* qdisc_get */
3494 NULL, /* qdisc_set */
3495 NULL, /* class_get */
3496 NULL, /* class_set */
3497 NULL, /* class_delete */
3498 NULL, /* class_get_stats */
3499 NULL /* class_dump_stats */
3502 /* Traffic control. */
3504 /* Number of kernel "tc" ticks per second. */
3505 static double ticks_per_s;
3507 /* Number of kernel "jiffies" per second. This is used for the purpose of
3508 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3509 * one jiffy's worth of data.
3511 * There are two possibilities here:
3513 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3514 * approximate range of 100 to 1024. That means that we really need to
3515 * make sure that the qdisc can buffer that much data.
3517 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3518 * has finely granular timers and there's no need to fudge additional room
3519 * for buffers. (There's no extra effort needed to implement that: the
3520 * large 'buffer_hz' is used as a divisor, so practically any number will
3521 * come out as 0 in the division. Small integer results in the case of
3522 * really high dividends won't have any real effect anyhow.)
3524 static unsigned int buffer_hz;
3526 /* Returns tc handle 'major':'minor'. */
/* 'major' is shifted into the upper 16 bits before being combined with
 * 'minor' by the kernel's TC_H_MAKE macro. */
3528 tc_make_handle(unsigned int major, unsigned int minor)
3530 return TC_H_MAKE(major << 16, minor);
3533 /* Returns the major number from 'handle'. */
/* TC_H_MAJ() isolates the major part in place; the shift brings it down to a
 * plain number, the inverse of the shift done in tc_make_handle(). */
3535 tc_get_major(unsigned int handle)
3537 return TC_H_MAJ(handle) >> 16;
3540 /* Returns the minor number from 'handle'. */
/* The minor part needs no shift: TC_H_MIN() is returned as is. */
3542 tc_get_minor(unsigned int handle)
3544 return TC_H_MIN(handle);
/* Starts a traffic-control Netlink request for 'netdev' in 'request'.
 *
 * 'request' is initialised with a 512-byte initial size, and a Netlink header
 * of message type 'type' with flags NLM_F_REQUEST | 'flags' is appended.  It
 * is followed by a zeroed struct tcmsg whose family is AF_UNSPEC and whose
 * ifindex is the device's.  The handle and parent fields are left for the
 * caller to fill in.
 *
 * NOTE(review): the behaviour when get_ifindex() fails and the return
 * statements are not visible in this listing; callers use the result as the
 * pointer to the embedded tcmsg. */
3547 static struct tcmsg *
3548 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3549 struct ofpbuf *request)
3551 struct tcmsg *tcmsg;
3555 error = get_ifindex(netdev, &ifindex);
3560 ofpbuf_init(request, 512);
3561 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3562 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3563 tcmsg->tcm_family = AF_UNSPEC;
3564 tcmsg->tcm_ifindex = ifindex;
3565 /* Caller should fill in tcmsg->tcm_handle. */
3566 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' over rtnl_sock with nl_sock_transact(), passing 'replyp'
 * through for the reply, and then uninitialises 'request' whatever the
 * outcome -- so callers must not reuse or free it afterwards.
 *
 * NOTE(review): the return statement is not visible in this listing. */
3572 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3574 int error = nl_sock_transact(rtnl_sock, request, replyp);
3575 ofpbuf_uninit(request);
3582 /* The values in psched are not individually very meaningful, but they are
3583 * important. The tables below show some values seen in the wild.
3587 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3588 * (Before that, there are hints that it was 1000000000.)
3590 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3594 * -----------------------------------
3595 * [1] 000c8000 000f4240 000f4240 00000064
3596 * [2] 000003e8 00000400 000f4240 3b9aca00
3597 * [3] 000003e8 00000400 000f4240 3b9aca00
3598 * [4] 000003e8 00000400 000f4240 00000064
3599 * [5] 000003e8 00000040 000f4240 3b9aca00
3600 * [6] 000003e8 00000040 000f4240 000000f9
3602 * a b c d ticks_per_s buffer_hz
3603 * ------- --------- ---------- ------------- ----------- -------------
3604 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3605 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3606 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3607 * [4] 1,000 1,024 1,000,000 100 976,562 100
3608 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3609 * [6] 1,000 64 1,000,000 249 15,625,000 249
3611 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3612 * [2] 2.6.26-1-686-bigmem from Debian lenny
3613 * [3] 2.6.26-2-sparc64 from Debian lenny
3614 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3615 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3616 * [6] 2.6.34 from kernel.org on KVM
/* Body of the routine that reads the four hexadecimal scheduler parameters
 * from /proc/net/psched and derives 'ticks_per_s' as a * c / b, computed in
 * floating point.  Failures to open or parse the file, and parameters that
 * are rejected, are reported with warnings.
 *
 * NOTE(review): the function's name line, the validity conditions, the
 * assignment of 'buffer_hz', the fallback values and the fclose() call are
 * not visible in this listing. */
3618 static const char fn[] = "/proc/net/psched";
3619 unsigned int a, b, c, d;
3625 stream = fopen(fn, "r");
3627 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3631 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3632 VLOG_WARN("%s: read failed", fn);
3636 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3640 VLOG_WARN("%s: invalid scheduler parameters", fn);
3644 ticks_per_s = (double) a * c / b;
3648 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3651 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3654 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3655 * rate of 'rate' bytes per second. */
/* NOTE(review): 'rate * ticks' is evaluated in unsigned int arithmetic before
 * the division by the double 'ticks_per_s', so a large rate combined with a
 * large tick count wraps around instead of producing the true product.
 * Widening one operand first would avoid that -- confirm whether callers can
 * reach such values. */
3657 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3662 return (rate * ticks) / ticks_per_s;
3665 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3666 * rate of 'rate' bytes per second. */
/* A zero 'rate' yields 0 rather than dividing by zero.  'ticks_per_s' is
 * truncated to an unsigned long long before the multiplication, so the
 * product is computed in 64-bit integer arithmetic. */
3668 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3673 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3676 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3677 * a transmission rate of 'rate' bytes per second. */
/* Integer division by 'buffer_hz'.
 *
 * NOTE(review): 'buffer_hz' must be nonzero by the time the division runs;
 * the lines that would ensure this are not visible in this listing. */
3679 tc_buffer_per_jiffy(unsigned int rate)
3684 return rate / buffer_hz;
3687 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3688 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3689 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3690 * stores NULL into it if it is absent.
3692 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3695 * Returns 0 if successful, otherwise a positive errno value. */
/* Attributes are parsed starting just past the Netlink header and the struct
 * tcmsg.  TCA_KIND is mandatory and must be a string; TCA_OPTIONS is an
 * optional nested attribute.  A message that does not fit the policy is
 * reported with a rate-limited warning.
 *
 * NOTE(review): the null checks on 'kind' and 'options' and the return
 * statements are not visible in this listing. */
3697 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3698 struct nlattr **options)
3700 static const struct nl_policy tca_policy[] = {
3701 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3702 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3704 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3706 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3707 tca_policy, ta, ARRAY_SIZE(ta))) {
3708 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3713 *kind = nl_attr_get_string(ta[TCA_KIND]);
3717 *options = ta[TCA_OPTIONS];
3732 /* Given Netlink 'msg' that describes a class, extracts the class's handle (its
3733 * class ID, major and minor) into '*handlep', its TCA_OPTIONS attribute
3734 * into '*options', and its queue statistics into '*stats'. Any of the output
3735 * arguments may be null.
3737 * Returns 0 if successful, otherwise a positive errno value. */
/* Attributes are parsed starting just past the Netlink header and the struct
 * tcmsg; both TCA_OPTIONS and TCA_STATS2 are mandatory nested attributes.
 *
 * The handle stored in '*handlep' is the raw tcm_handle from the tcmsg
 * header.  Statistics come from the TCA_STATS_BASIC and TCA_STATS_QUEUE
 * attributes nested in TCA_STATS2: tx_bytes and tx_packets from the former,
 * and tx_errors from the drop counter of the latter.
 *
 * The final memset() zeroes '*stats'; it is presumably the error path (the
 * label and the guards around it are not visible in this listing, nor are
 * the null checks on the output arguments or the return statements). */
3739 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3740 struct nlattr **options, struct netdev_queue_stats *stats)
3742 static const struct nl_policy tca_policy[] = {
3743 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3744 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3746 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3748 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3749 tca_policy, ta, ARRAY_SIZE(ta))) {
3750 VLOG_WARN_RL(&rl, "failed to parse class message");
3755 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3756 *handlep = tc->tcm_handle;
3760 *options = ta[TCA_OPTIONS];
3764 const struct gnet_stats_queue *gsq;
3765 struct gnet_stats_basic gsb;
3767 static const struct nl_policy stats_policy[] = {
3768 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3769 .min_len = sizeof gsb },
3770 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3771 .min_len = sizeof *gsq },
3773 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3775 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3776 sa, ARRAY_SIZE(sa))) {
3777 VLOG_WARN_RL(&rl, "failed to parse class stats");
3781 /* Alignment issues screw up the length of struct gnet_stats_basic on
3782 * some arch/bitsize combinations. Newer versions of Linux have a
3783 * struct gnet_stats_basic_packed, but we can't depend on that. The
3784 * easiest thing to do is just to make a copy. */
3785 memset(&gsb, 0, sizeof gsb);
3786 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3787 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3788 stats->tx_bytes = gsb.bytes;
3789 stats->tx_packets = gsb.packets;
3791 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3792 stats->tx_errors = gsq->drops;
3802 memset(stats, 0, sizeof *stats);
3807 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
/* Builds an RTM_GETTCLASS request with NLM_F_ECHO for 'handle' and 'parent'
 * on 'netdev', and transacts it with the reply going to '*replyp'.  The
 * visible warning reports a failed query together with the device name and
 * both handles.
 *
 * NOTE(review): the check on tc_make_request(), the condition around the
 * warning and the return statement are not visible in this listing. */
3810 tc_query_class(const struct netdev *netdev,
3811 unsigned int handle, unsigned int parent,
3812 struct ofpbuf **replyp)
3814 struct ofpbuf request;
3815 struct tcmsg *tcmsg;
3818 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3822 tcmsg->tcm_handle = handle;
3823 tcmsg->tcm_parent = parent;
3825 error = tc_transact(&request, replyp);
3827 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3828 netdev_get_name(netdev),
3829 tc_get_major(handle), tc_get_minor(handle),
3830 tc_get_major(parent), tc_get_minor(parent),
3836 /* Equivalent to "tc class del dev <name> handle <handle>". */
/* Builds an RTM_DELTCLASS request for 'handle' on 'netdev' with a zero
 * parent, and transacts it without asking for a reply.  The visible warning
 * reports a failed deletion together with the device name and handle.
 *
 * NOTE(review): the check on tc_make_request(), the condition around the
 * warning and the return statement are not visible in this listing. */
3838 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3840 struct ofpbuf request;
3841 struct tcmsg *tcmsg;
3844 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3848 tcmsg->tcm_handle = handle;
3849 tcmsg->tcm_parent = 0;
3851 error = tc_transact(&request, NULL);
3853 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3854 netdev_get_name(netdev),
3855 tc_get_major(handle), tc_get_minor(handle),
3861 /* Equivalent to "tc qdisc del dev <name> root". */
/* Sends RTM_DELQDISC for handle 1:0 at TC_H_ROOT on 'netdev'.  EINVAL from
 * the kernel gets special treatment, as explained by the comment inside the
 * function.
 *
 * When the deletion counts as successful and a tc object is attached to the
 * device, the object's tc_destroy hook is called if it has one, and the
 * device's tc pointer is cleared.
 *
 * NOTE(review): the statement executed in the EINVAL branch, the check on
 * tc_make_request() and the return statement are not visible in this
 * listing. */
3863 tc_del_qdisc(struct netdev *netdev)
3865 struct netdev_dev_linux *netdev_dev =
3866 netdev_dev_linux_cast(netdev_get_dev(netdev));
3867 struct ofpbuf request;
3868 struct tcmsg *tcmsg;
3871 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
3875 tcmsg->tcm_handle = tc_make_handle(1, 0);
3876 tcmsg->tcm_parent = TC_H_ROOT;
3878 error = tc_transact(&request, NULL);
3879 if (error == EINVAL) {
3880 /* EINVAL probably means that the default qdisc was in use, in which
3881 * case we've accomplished our purpose. */
3884 if (!error && netdev_dev->tc) {
3885 if (netdev_dev->tc->ops->tc_destroy) {
3886 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3888 netdev_dev->tc = NULL;
3893 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3894 * kernel to determine what they are. Returns 0 if successful, otherwise a
3895 * positive errno value. */
/* Outline of the visible steps:
 *
 *   1. Checks whether a tc object is already attached to the device
 *      (presumably returning early if so; the branch body is not visible in
 *      this listing).
 *   2. Sends RTM_GETQDISC with NLM_F_ECHO for handle 1:0, parent 0.
 *   3. Chooses an ops table.  On a successful reply, it is looked up by the
 *      qdisc kind, with tc_ops_other used when the reply cannot be parsed or
 *      the kind is unknown.  tc_ops_default is used on ENOENT, and
 *      tc_ops_other, after a warning, on any other error.
 *   4. Calls the chosen table's tc_load hook, casting away the const on
 *      'netdev'.  It asserts that a tc object is attached exactly when the
 *      load reported success, then deletes the reply buffer.
 *   5. Returns the query error if there was one, otherwise the load error.
 *
 * NOTE(review): how 'qdisc' is set on the paths where the transaction fails
 * is not visible in this listing; it is passed to ofpbuf_delete() on every
 * path, so it presumably ends up NULL there -- confirm. */
3897 tc_query_qdisc(const struct netdev *netdev)
3899 struct netdev_dev_linux *netdev_dev =
3900 netdev_dev_linux_cast(netdev_get_dev(netdev));
3901 struct ofpbuf request, *qdisc;
3902 const struct tc_ops *ops;
3903 struct tcmsg *tcmsg;
3907 if (netdev_dev->tc) {
3911 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3912 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3913 * 2.6.35 without that fix backported to it.
3915 * To avoid the OOPS, we must not make a request that would attempt to dump
3916 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3917 * few others. There are a few ways that I can see to do this, but most of
3918 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3919 * technique chosen here is to assume that any non-default qdisc that we
3920 * create will have a class with handle 1:0. The built-in qdiscs only have
3921 * a class with handle 0:0.
3923 * We could check for Linux 2.6.35+ and use a more straightforward method
3925 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
3929 tcmsg->tcm_handle = tc_make_handle(1, 0);
3930 tcmsg->tcm_parent = 0;
3932 /* Figure out what tc class to instantiate. */
3933 error = tc_transact(&request, &qdisc);
3937 error = tc_parse_qdisc(qdisc, &kind, NULL);
3939 ops = &tc_ops_other;
3941 ops = tc_lookup_linux_name(kind);
3943 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3944 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3946 ops = &tc_ops_other;
3949 } else if (error == ENOENT) {
3950 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3951 * other entity that doesn't have a handle 1:0. We will assume
3952 * that it's the system default qdisc. */
3953 ops = &tc_ops_default;
3956 /* Who knows? Maybe the device got deleted. */
3957 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3958 netdev_get_name(netdev), strerror(error));
3959 ops = &tc_ops_other;
3962 /* Instantiate it. */
3963 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
3964 assert((load_error == 0) == (netdev_dev->tc != NULL));
3965 ofpbuf_delete(qdisc);
3967 return error ? error : load_error;
3970 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3971 approximate the time to transmit packets of various lengths. For an MTU of
3972 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3973 represents two possible packet lengths; for a MTU of 513 through 1024, four
3974 possible lengths; and so on.
3976 Returns, for the specified 'mtu', the number of bits that packet lengths
3977 need to be shifted right to fit within such a 256-entry table. */
3979 tc_calc_cell_log(unsigned int mtu)
3984 mtu = ETH_PAYLOAD_MAX;
/* Add the Ethernet and VLAN header lengths to 'mtu' before computing the
 * shift. */
3986 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
/* 'cell_log' counts the loop iterations that run while 'mtu' is still 256
 * or more. */
3988 for (cell_log = 0; mtu >= 256; cell_log++) {
/* Implementation notes for tc_fill_rate(): the whole struct is zeroed first,
 * so any field that is not assigned explicitly is left at 0, and 'cell_log'
 * is derived from 'mtu' by tc_calc_cell_log(). */
3995 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
3998 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4000 memset(rate, 0, sizeof *rate);
4001 rate->cell_log = tc_calc_cell_log(mtu);
4002 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4003 /* rate->cell_align = 0; */ /* distro headers. */
/* 'mpu' acts as a lower bound on the packet size when tc_put_rtab() builds a
 * rate table from this ratespec. */
4004 rate->mpu = ETH_TOTAL_MIN;
4008 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4009 * attribute of the specified "type".
4011 * See tc_calc_cell_log() above for a description of "rtab"s. */
4013 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
/* Reserve TC_RTAB_SIZE bytes of attribute payload in 'msg'; every entry is
 * written by the loop below. */
4018 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4019 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
/* Entry 'i' is computed for a packet of (i + 1) << cell_log bytes, raised to
 * 'mpu' if it would otherwise be smaller. */
4020 unsigned packet_size = (i + 1) << rate->cell_log;
4021 if (packet_size < rate->mpu) {
4022 packet_size = rate->mpu;
4024 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* Implementation notes for tc_calc_buffer(): the burst actually used is the
 * larger of 'burst_bytes' and tc_buffer_per_jiffy(Bps) + mtu, and that byte
 * count is then passed to tc_bytes_to_ticks() together with 'Bps'. */
4028 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
4029 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
4030 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
4033 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
4035 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
4036 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
4039 /* Copies 'src' into 'dst', performing format conversion in the process. */
4041 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4042 const struct rtnl_link_stats *src)
/* Plain member-by-member assignment: every counter copied here has the same
 * name in both structs. */
4044 dst->rx_packets = src->rx_packets;
4045 dst->tx_packets = src->tx_packets;
4046 dst->rx_bytes = src->rx_bytes;
4047 dst->tx_bytes = src->tx_bytes;
4048 dst->rx_errors = src->rx_errors;
4049 dst->tx_errors = src->tx_errors;
4050 dst->rx_dropped = src->rx_dropped;
4051 dst->tx_dropped = src->tx_dropped;
4052 dst->multicast = src->multicast;
4053 dst->collisions = src->collisions;
/* Detailed receive error counters. */
4054 dst->rx_length_errors = src->rx_length_errors;
4055 dst->rx_over_errors = src->rx_over_errors;
4056 dst->rx_crc_errors = src->rx_crc_errors;
4057 dst->rx_frame_errors = src->rx_frame_errors;
4058 dst->rx_fifo_errors = src->rx_fifo_errors;
4059 dst->rx_missed_errors = src->rx_missed_errors;
/* Detailed transmit error counters. */
4060 dst->tx_aborted_errors = src->tx_aborted_errors;
4061 dst->tx_carrier_errors = src->tx_carrier_errors;
4062 dst->tx_fifo_errors = src->tx_fifo_errors;
4063 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4064 dst->tx_window_errors = src->tx_window_errors;
4068 /* Utility functions. */
/* Fetches link statistics for the interface whose index is 'ifindex' by
 * sending an RTM_GETLINK request on 'rtnl_sock', then converts the IFLA_STATS
 * attribute of the reply into '*stats' with
 * netdev_stats_from_rtnl_link_stats().
 *
 * A reply without IFLA_STATS is reported with a rate-limited warning. The
 * reply buffer is deleted on each path that obtained one. */
4071 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4073 /* Policy for RTNLGRP_LINK messages.
4075 * There are *many* more fields in these messages, but currently we only
4076 * care about these fields. */
4077 static const struct nl_policy rtnlgrp_link_policy[] = {
4078 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4079 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4080 .min_len = sizeof(struct rtnl_link_stats) },
4083 struct ofpbuf request;
4084 struct ofpbuf *reply;
4085 struct ifinfomsg *ifi;
4086 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
/* Build an RTM_GETLINK request that selects the device by interface index. */
4089 ofpbuf_init(&request, 0);
4090 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4091 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4092 ifi->ifi_family = PF_UNSPEC;
4093 ifi->ifi_index = ifindex;
4094 error = nl_sock_transact(rtnl_sock, &request, &reply);
/* The request buffer is released as soon as the transaction returns. */
4095 ofpbuf_uninit(&request);
/* Parse the attributes that follow the netlink header and the ifinfomsg. */
4100 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4101 rtnlgrp_link_policy,
4102 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4103 ofpbuf_delete(reply);
/* IFLA_STATS is optional in the policy, so its presence is checked here. */
4107 if (!attrs[IFLA_STATS]) {
4108 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4109 ofpbuf_delete(reply);
4113 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4115 ofpbuf_delete(reply);
/* Reads statistics for the device named 'netdev_name' from /proc/net/dev into
 * '*stats'.
 *
 * The file is scanned line by line with sscanf(); a line that does not yield
 * the expected 15 conversions is reported as a parse error. Rate-limited
 * warnings are also logged when the file cannot be opened and when no stats
 * are found for 'netdev_name'. */
4121 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4123 static const char fn[] = "/proc/net/dev";
4128 stream = fopen(fn, "r");
4130 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4135 while (fgets(line, sizeof line, stream)) {
/* X64 is the scanf conversion for one uint64_t counter. */
4138 #define X64 "%"SCNu64
4141 X64 X64 X64 X64 X64 X64 X64 "%*u"
4142 X64 X64 X64 X64 X64 X64 X64 "%*u",
4148 &stats->rx_fifo_errors,
4149 &stats->rx_frame_errors,
4155 &stats->tx_fifo_errors,
4157 &stats->tx_carrier_errors) != 15) {
4158 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4159 } else if (!strcmp(devname, netdev_name)) {
/* These counters are set to UINT64_MAX instead of being parsed; presumably
 * /proc/net/dev does not report them -- TODO confirm. */
4160 stats->rx_length_errors = UINT64_MAX;
4161 stats->rx_over_errors = UINT64_MAX;
4162 stats->rx_crc_errors = UINT64_MAX;
4163 stats->rx_missed_errors = UINT64_MAX;
4164 stats->tx_aborted_errors = UINT64_MAX;
4165 stats->tx_heartbeat_errors = UINT64_MAX;
4166 stats->tx_window_errors = UINT64_MAX;
4172 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Queries the interface flags of 'netdev' with a SIOCGIFFLAGS ioctl issued
 * through netdev_linux_do_ioctl() and stores the resulting ifr_flags value in
 * '*flags'. */
4178 get_flags(const struct netdev *netdev, int *flags)
4183 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
4185 *flags = ifr.ifr_flags;
/* Sets the interface flags of 'netdev' to 'flags' with a SIOCSIFFLAGS ioctl.
 * Returns whatever netdev_linux_do_ioctl() returns. */
4190 set_flags(struct netdev *netdev, int flags)
4194 ifr.ifr_flags = flags;
4195 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
/* Looks up the interface index of the device named 'netdev_name' with a
 * SIOCGIFINDEX ioctl on 'af_inet_sock' and returns ifr_ifindex when the ioctl
 * succeeds. An ioctl failure is logged with a rate-limited warning. Every
 * call increments the netdev_get_ifindex coverage counter. */
4200 do_get_ifindex(const char *netdev_name)
4204 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4205 COVERAGE_INC(netdev_get_ifindex);
4206 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4207 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4208 netdev_name, strerror(errno));
4211 return ifr.ifr_ifindex;
/* Stores the interface index of 'netdev_' in '*ifindexp', using the value
 * cached in the underlying netdev_dev_linux when it is valid.
 *
 * The kernel is queried through do_get_ifindex() only while the VALID_IFINDEX
 * bit is clear in 'cache_valid'; the result is then cached and the bit is
 * set. */
4215 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4217 struct netdev_dev_linux *netdev_dev =
4218 netdev_dev_linux_cast(netdev_get_dev(netdev_));
4220 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4221 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4225 netdev_dev->cache_valid |= VALID_IFINDEX;
4226 netdev_dev->ifindex = ifindex;
4228 *ifindexp = netdev_dev->ifindex;
/* Reads the hardware address of the device named 'netdev_name' with a
 * SIOCGIFHWADDR ioctl on 'af_inet_sock' and copies ETH_ADDR_LEN bytes of it
 * into 'ea'.
 *
 * An ioctl failure is logged at INFO level for ENODEV and at ERR level
 * otherwise. A hardware address family other than AF_UNSPEC or ARPHRD_ETHER
 * is reported with a warning. */
4233 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4238 memset(&ifr, 0, sizeof ifr);
4239 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4240 COVERAGE_INC(netdev_get_hwaddr);
4241 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4242 /* ENODEV probably means that a vif disappeared asynchronously and
4243 * hasn't been removed from the database yet, so reduce the log level
4244 * to INFO for that case. */
4245 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4246 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4247 netdev_name, strerror(errno));
4250 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4251 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4252 VLOG_WARN("%s device has unknown hardware address family %d",
4253 netdev_name, hwaddr_family);
4255 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware address of the device named 'netdev_name' to the
 * ETH_ADDR_LEN bytes in 'mac', tagged with address family 'hwaddr_family',
 * using a SIOCSIFHWADDR ioctl on 'af_inet_sock'. An ioctl failure is logged
 * at ERR level. */
4260 set_etheraddr(const char *netdev_name, int hwaddr_family,
4261 const uint8_t mac[ETH_ADDR_LEN])
4265 memset(&ifr, 0, sizeof ifr);
4266 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4267 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4268 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4269 COVERAGE_INC(netdev_set_hwaddr);
4270 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4271 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4272 netdev_name, strerror(errno));
/* Issues the ethtool request held in 'ecmd' for the device named 'name'
 * through a SIOCETHTOOL ioctl on 'af_inet_sock', with ifr_data pointing at
 * 'ecmd'.
 *
 * A failure other than EOPNOTSUPP is logged with a rate-limited warning that
 * names the command as 'cmd_name'; EOPNOTSUPP is not logged. */
4279 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4280 int cmd, const char *cmd_name)
4284 memset(&ifr, 0, sizeof ifr);
4285 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4286 ifr.ifr_data = (caddr_t) ecmd;
4289 COVERAGE_INC(netdev_ethtool);
4290 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4293 if (errno != EOPNOTSUPP) {
4294 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4295 "failed: %s", cmd_name, name, strerror(errno));
4297 /* The device doesn't support this operation. That's pretty
4298 * common, so there's no point in logging anything. */
4304 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4305 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4307 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4308 const char *flag_name, bool enable)
4310 const char *netdev_name = netdev_get_name(netdev);
4311 struct ethtool_value evalue;
/* Step 1: fetch the current flags. netdev_linux_do_ethtool() takes a
 * struct ethtool_cmd pointer, hence the cast of the ethtool_value. */
4315 memset(&evalue, 0, sizeof evalue);
4316 error = netdev_linux_do_ethtool(netdev_name,
4317 (struct ethtool_cmd *)&evalue,
4318 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* Step 2: clear 'flag', set it again only when 'enable' is true, and write
 * the result back. 'new_flags' is kept for the check in step 3. */
4323 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4324 error = netdev_linux_do_ethtool(netdev_name,
4325 (struct ethtool_cmd *)&evalue,
4326 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
/* Step 3: read the flags back and compare them with what was requested. */
4331 memset(&evalue, 0, sizeof evalue);
4332 error = netdev_linux_do_ethtool(netdev_name,
4333 (struct ethtool_cmd *)&evalue,
4334 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4339 if (new_flags != evalue.data) {
4340 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4341 "device %s failed", enable ? "enable" : "disable",
4342 flag_name, netdev_name);
/* Copies 'name' into ifr->ifr_name and issues ioctl request 'cmd' with 'ifr'
 * on 'af_inet_sock'. When ioctl() returns -1, the failure is logged at debug
 * level (rate-limited), with 'cmd_name' identifying the request in the
 * message. */
4350 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4351 const char *cmd_name)
4353 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4354 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4355 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
/* Runs ioctl request 'cmd' (described by 'cmd_name' for logging) for 'netdev'
 * through netdev_linux_do_ioctl(), with the request's address family set to
 * AF_INET, and copies the sin_addr found in the reply's ifr_addr into
 * '*ip'. */
4363 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4364 int cmd, const char *cmd_name)
4369 ifr.ifr_addr.sa_family = AF_INET;
4370 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
/* ifr_addr is reinterpreted as a sockaddr_in to reach the IPv4 address. */
4372 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4373 *ip = sin->sin_addr;
4378 /* Returns an AF_PACKET raw socket or a negative errno value. */
4380 af_packet_sock(void)
4382 static int sock = INT_MIN;
4384 if (sock == INT_MIN) {
4385 sock = socket(AF_PACKET, SOCK_RAW, 0);
4387 set_nonblocking(sock);
4390 VLOG_ERR("failed to create packet socket: %s", strerror(errno));