2 * Copyright (c) 2009, 2010, 2011 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
24 #include <arpa/inet.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_sched.h>
33 #include <linux/rtnetlink.h>
34 #include <linux/sockios.h>
35 #include <linux/version.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <netpacket/packet.h>
40 #include <net/ethernet.h>
42 #include <linux/if_tunnel.h>
43 #include <net/if_arp.h>
44 #include <net/if_packet.h>
45 #include <net/route.h>
46 #include <netinet/in.h>
53 #include "dpif-linux.h"
54 #include "dynamic-string.h"
55 #include "fatal-signal.h"
58 #include "netdev-provider.h"
59 #include "netdev-vport.h"
61 #include "netlink-notifier.h"
62 #include "netlink-socket.h"
64 #include "openflow/openflow.h"
66 #include "poll-loop.h"
67 #include "rtnetlink-link.h"
68 #include "socket-util.h"
/* Logging module and coverage counters declared for this file. */
74 VLOG_DEFINE_THIS_MODULE(netdev_linux);
76 COVERAGE_DEFINE(netdev_get_vlan_vid);
77 COVERAGE_DEFINE(netdev_set_policing);
78 COVERAGE_DEFINE(netdev_arp_lookup);
79 COVERAGE_DEFINE(netdev_get_ifindex);
80 COVERAGE_DEFINE(netdev_get_hwaddr);
81 COVERAGE_DEFINE(netdev_set_hwaddr);
82 COVERAGE_DEFINE(netdev_ethtool);
84 /* These were introduced in Linux 2.6.14, so they might be missing if we have
 * old headers. */
86 #ifndef ADVERTISED_Pause
87 #define ADVERTISED_Pause (1 << 13)
89 #ifndef ADVERTISED_Asym_Pause
90 #define ADVERTISED_Asym_Pause (1 << 14)
93 /* These were introduced in Linux 2.6.24, so they might be missing if we
94 * have old headers. */
95 #ifndef ETHTOOL_GFLAGS
96 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
98 #ifndef ETHTOOL_SFLAGS
99 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
102 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
 * headers. */
105 #define TC_RTAB_SIZE 1024
108 static struct nln_notifier netdev_linux_cache_notifier;
109 static int cache_notifier_refcount;
112 VALID_IFINDEX = 1 << 0,
113 VALID_ETHERADDR = 1 << 1,
117 VALID_CARRIER = 1 << 5,
118 VALID_IS_PSEUDO = 1 << 6, /* Represents is_internal and is_tap. */
119 VALID_POLICING = 1 << 7,
120 VALID_HAVE_VPORT_STATS = 1 << 8
128 /* Traffic control. */
130 /* An instance of a traffic control class. Always associated with a particular
 * network device.
 *
133 * Each TC implementation subclasses this with whatever additional data it
 * needs. */
136 const struct tc_ops *ops;
137 struct hmap queues; /* Contains "struct tc_queue"s.
138 * Read by generic TC layer.
139 * Written only by TC implementation. */
142 /* One traffic control queue.
 *
144 * Each TC implementation subclasses this with whatever additional data it
 * needs. */
147 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
148 unsigned int queue_id; /* OpenFlow queue ID. */
151 /* A particular kind of traffic control. Each implementation generally maps to
152 * one particular Linux qdisc class.
154 * The functions below return 0 if successful or a positive errno value on
155 * failure, except where otherwise noted. All of them must be provided, except
156 * where otherwise noted. */
158 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
159 * This is null for tc_ops_default and tc_ops_other, for which there are no
160 * appropriate values. */
161 const char *linux_name;
163 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
164 const char *ovs_name;
166 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
167 * queues. The queues are numbered 0 through n_queues - 1. */
168 unsigned int n_queues;
170 /* Called to install this TC class on 'netdev'. The implementation should
171 * make the Netlink calls required to set up 'netdev' with the right qdisc
172 * and configure it according to 'details'. The implementation may assume
173 * that the current qdisc is the default; that is, there is no need for it
174 * to delete the current qdisc before installing itself.
176 * The contents of 'details' should be documented as valid for 'ovs_name'
177 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
178 * (which is built as ovs-vswitchd.conf.db(8)).
180 * This function must return 0 if and only if it sets 'netdev->tc' to an
181 * initialized 'struct tc'.
183 * (This function is null for tc_ops_other, which cannot be installed. For
184 * other TC classes it should always be nonnull.) */
185 int (*tc_install)(struct netdev *netdev, const struct shash *details);
187 /* Called when the netdev code determines (through a Netlink query) that
188 * this TC class's qdisc is installed on 'netdev', but we didn't install
189 * it ourselves and so don't know any of the details.
191 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
192 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
193 * implementation should parse the other attributes of 'nlmsg' as
194 * necessary to determine its configuration. If necessary it should also
195 * use Netlink queries to determine the configuration of queues on
 * 'netdev'.
198 * This function must return 0 if and only if it sets 'netdev->tc' to an
199 * initialized 'struct tc'. */
200 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
202 /* Destroys the data structures allocated by the implementation as part of
203 * 'tc'. (This includes destroying 'tc->queues' by calling
 * tc_destroy(tc).)
206 * The implementation should not need to perform any Netlink calls. If
207 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
208 * (But it may not be desirable.)
210 * This function may be null if 'tc' is trivial. */
211 void (*tc_destroy)(struct tc *tc);
213 /* Retrieves details of 'netdev->tc' configuration into 'details'.
215 * The implementation should not need to perform any Netlink calls, because
216 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
217 * cached the configuration.
219 * The contents of 'details' should be documented as valid for 'ovs_name'
220 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
221 * (which is built as ovs-vswitchd.conf.db(8)).
223 * This function may be null if 'tc' is not configurable.
 */
225 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
227 /* Reconfigures 'netdev->tc' according to 'details', performing any
228 * required Netlink calls to complete the reconfiguration.
230 * The contents of 'details' should be documented as valid for 'ovs_name'
231 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
232 * (which is built as ovs-vswitchd.conf.db(8)).
234 * This function may be null if 'tc' is not configurable.
 */
236 int (*qdisc_set)(struct netdev *, const struct shash *details);
238 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
239 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
241 * The contents of 'details' should be documented as valid for 'ovs_name'
242 * in the "other_config" column in the "Queue" table in
243 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
245 * The implementation should not need to perform any Netlink calls, because
246 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
247 * cached the queue configuration.
249 * This function may be null if 'tc' does not have queues ('n_queues' is
 * 0). */
251 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
252 struct shash *details);
254 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
255 * 'details', performing any required Netlink calls to complete the
256 * reconfiguration. The caller ensures that 'queue_id' is less than
 * 'n_queues'.
259 * The contents of 'details' should be documented as valid for 'ovs_name'
260 * in the "other_config" column in the "Queue" table in
261 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
263 * This function may be null if 'tc' does not have queues or its queues are
264 * not configurable. */
265 int (*class_set)(struct netdev *, unsigned int queue_id,
266 const struct shash *details);
268 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
269 * tc_queue's within 'netdev->tc->queues'.
271 * This function may be null if 'tc' does not have queues or its queues
272 * cannot be deleted. */
273 int (*class_delete)(struct netdev *, struct tc_queue *queue);
275 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
276 * 'struct tc_queue's within 'netdev->tc->queues'.
278 * On success, initializes '*stats'.
280 * This function may be null if 'tc' does not have queues or if it cannot
281 * report queue statistics. */
282 int (*class_get_stats)(const struct netdev *netdev,
283 const struct tc_queue *queue,
284 struct netdev_queue_stats *stats);
286 /* Extracts queue stats from 'nlmsg', which is a response to a
287 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
289 * This function may be null if 'tc' does not have queues or if it cannot
290 * report queue statistics. */
291 int (*class_dump_stats)(const struct netdev *netdev,
292 const struct ofpbuf *nlmsg,
293 netdev_dump_queue_stats_cb *cb, void *aux);
/* Prepares 'tc' for use by the implementation described by 'ops'.  The
 * visible part of the body initializes 'tc->queues' as an empty hmap.
 * NOTE(review): the statement that consumes 'ops' is not visible here. */
297 tc_init(struct tc *tc, const struct tc_ops *ops)
300 hmap_init(&tc->queues);
/* Releases the 'queues' hmap embedded in 'tc' by calling hmap_destroy(). */
304 tc_destroy(struct tc *tc)
306 hmap_destroy(&tc->queues);
/* Forward declarations of the traffic-control implementations that the table
 * below refers to. */
309 static const struct tc_ops tc_ops_htb;
310 static const struct tc_ops tc_ops_hfsc;
311 static const struct tc_ops tc_ops_default;
312 static const struct tc_ops tc_ops_other;
/* Table of pointers to every TC implementation known to this file. */
314 static const struct tc_ops *tcs[] = {
315 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
316 &tc_ops_hfsc, /* Hierarchical fair service curve. */
317 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
318 &tc_ops_other, /* Some other qdisc. */
/* Forward declarations of the traffic-control helper functions used by the
 * TC implementations in this file. */
322 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
323 static unsigned int tc_get_major(unsigned int handle);
324 static unsigned int tc_get_minor(unsigned int handle);
326 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
327 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
328 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
/* Building and sending TC Netlink requests. */
330 static struct tcmsg *tc_make_request(const struct netdev *, int type,
331 unsigned int flags, struct ofpbuf *);
332 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
/* Parsing replies and querying or deleting classes and qdiscs. */
334 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
335 struct nlattr **options);
336 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
337 struct nlattr **options,
338 struct netdev_queue_stats *);
339 static int tc_query_class(const struct netdev *,
340 unsigned int handle, unsigned int parent,
341 struct ofpbuf **replyp);
342 static int tc_delete_class(const struct netdev *, unsigned int handle);
344 static int tc_del_qdisc(struct netdev *netdev);
345 static int tc_query_qdisc(const struct netdev *netdev);
/* Rate-table and buffer-size calculations. */
347 static int tc_calc_cell_log(unsigned int mtu);
348 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
349 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
350 const struct tc_ratespec *rate);
351 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
/* Linux-specific device state.  Embeds the generic 'struct netdev_dev' so that
 * netdev_dev_linux_cast() can recover this structure with CONTAINER_OF(). */
353 struct netdev_dev_linux {
354 struct netdev_dev netdev_dev;
356 struct shash_node *shash_node;
/* Bitmap of VALID_* bits; cleared by netdev_dev_linux_changed(). */
357 unsigned int cache_valid;
358 unsigned int change_seq;
360 bool miimon; /* Link status of last poll. */
361 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
362 struct timer miimon_timer;
364 /* The following are figured out "on demand" only. They are only valid
365 * when the corresponding VALID_* bit in 'cache_valid' is set. */
367 uint8_t etheraddr[ETH_ADDR_LEN];
368 struct in_addr address, netmask;
372 bool is_internal; /* Is this an openvswitch internal device? */
373 bool is_tap; /* Is this a tuntap device? */
374 uint32_t kbits_rate; /* Policing data. */
375 uint32_t kbits_burst;
376 bool have_vport_stats;
/* State used only by tap devices (see netdev_linux_create_tap()). */
380 struct tap_state tap;
/* An open handle on a Linux network device.  Embeds the generic 'struct
 * netdev' so that netdev_linux_cast() can recover this structure with
 * CONTAINER_OF(). */
384 struct netdev_linux {
385 struct netdev netdev;
389 /* Sockets used for ioctl operations. */
/* Created by netdev_linux_init(); -1 until then. */
390 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
392 /* A Netlink routing socket that is not subscribed to any multicast groups. */
/* Created by netdev_linux_init() with nl_sock_create(NETLINK_ROUTE, ...). */
393 static struct nl_sock *rtnl_sock;
395 /* This is set pretty low because we probably won't learn anything from the
396 * additional log messages. */
397 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
/* Forward declarations of helpers defined elsewhere in this file. */
399 static int netdev_linux_init(void);
/* ethtool and ioctl wrappers. */
401 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
402 int cmd, const char *cmd_name);
403 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
404 const char *cmd_name);
405 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
406 int cmd, const char *cmd_name);
/* Interface flags, ifindex and address helpers. */
407 static int get_flags(const struct netdev *, int *flagsp);
408 static int set_flags(struct netdev *, int flags);
409 static int do_get_ifindex(const char *netdev_name);
410 static int get_ifindex(const struct netdev *, int *ifindexp);
411 static int do_set_addr(struct netdev *netdev,
412 int ioctl_nr, const char *ioctl_name,
413 struct in_addr addr);
414 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
415 static int set_etheraddr(const char *netdev_name, int hwaddr_family,
416 const uint8_t[ETH_ADDR_LEN]);
/* Statistics sources, the shared AF_PACKET socket, and MII monitoring. */
417 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
418 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
419 static int af_packet_sock(void);
420 static void netdev_linux_miimon_run(void);
421 static void netdev_linux_miimon_wait(void);
/* Returns whether 'netdev_class' uses netdev_linux_init() as its 'init'
 * callback.  The cast helpers in this file assert on this before converting
 * generic objects to their Linux-specific types. */
424 is_netdev_linux_class(const struct netdev_class *netdev_class)
426 return netdev_class->init == netdev_linux_init;
/* Converts generic 'netdev_dev' into the 'struct netdev_dev_linux' that
 * contains it.  Asserts that the device's class passes
 * is_netdev_linux_class(). */
429 static struct netdev_dev_linux *
430 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
432 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
433 assert(is_netdev_linux_class(netdev_class));
435 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
/* Converts generic 'netdev' into the 'struct netdev_linux' that contains it.
 * Asserts that the class of the underlying device passes
 * is_netdev_linux_class(). */
438 static struct netdev_linux *
439 netdev_linux_cast(const struct netdev *netdev)
441 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
442 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
443 assert(is_netdev_linux_class(netdev_class));
445 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
/* Creates the helper sockets shared by this file: 'af_inet_sock' (an AF_INET
 * datagram socket) and the rtnetlink socket 'rtnl_sock'.  Failures are logged.
 * NOTE(review): 'status' is static, presumably so that initialization runs
 * only once and later calls reuse the result; the guard is not visible
 * here. */
449 netdev_linux_init(void)
451 static int status = -1;
453 /* Create AF_INET socket. */
454 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
/* On failure 'status' holds the errno value from socket(). */
455 status = af_inet_sock >= 0 ? 0 : errno;
457 VLOG_ERR("failed to create inet socket: %s", strerror(status));
460 /* Create rtnetlink socket. */
462 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
464 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
/* Performs periodic work for this file: runs the rtnetlink link notifier and
 * then the MII link-status polling. */
473 netdev_linux_run(void)
475 rtnetlink_link_notifier_run();
476 netdev_linux_miimon_run();
/* Counterpart of netdev_linux_run(): calls the wait functions of the
 * rtnetlink link notifier and of the MII polling. */
480 netdev_linux_wait(void)
482 rtnetlink_link_notifier_wait();
483 netdev_linux_miimon_wait();
/* Records that 'dev' has changed.  The visible lines special-case a
 * 'change_seq' of zero and clear 'cache_valid', so every cached field is
 * treated as stale afterwards.
 * NOTE(review): the statements that update 'change_seq' are not visible
 * here. */
487 netdev_dev_linux_changed(struct netdev_dev_linux *dev)
490 if (!dev->change_seq) {
493 dev->cache_valid = 0;
/* Link-change callback registered by netdev_linux_create().  'aux' is unused.
 *
 * One path looks up the device named by 'change->ifname' and, if its class
 * belongs to this file, marks it changed.  The other path collects every
 * device of 'netdev_linux_class' and marks each one changed.
 * NOTE(review): the condition that selects between the two paths is not
 * visible here. */
497 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
498 void *aux OVS_UNUSED)
500 struct netdev_dev_linux *dev;
502 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
504 const struct netdev_class *netdev_class =
505 netdev_dev_get_class(base_dev);
/* Only devices owned by this file may be cast to netdev_dev_linux. */
507 if (is_netdev_linux_class(netdev_class)) {
508 dev = netdev_dev_linux_cast(base_dev);
509 netdev_dev_linux_changed(dev);
513 struct shash device_shash;
514 struct shash_node *node;
516 shash_init(&device_shash);
517 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
518 SHASH_FOR_EACH (node, &device_shash) {
520 netdev_dev_linux_changed(dev);
522 shash_destroy(&device_shash);
526 /* Creates system and internal devices. */
/* Allocates a zeroed 'struct netdev_dev_linux' named 'name' of class 'class',
 * starts its 'change_seq' at 1, and stores the embedded generic device in
 * '*netdev_devp'. */
528 netdev_linux_create(const struct netdev_class *class, const char *name,
529 struct netdev_dev **netdev_devp)
531 struct netdev_dev_linux *netdev_dev;
/* The first device created registers the shared link-change notifier. */
534 if (!cache_notifier_refcount) {
535 error = rtnetlink_link_notifier_register(&netdev_linux_cache_notifier,
536 netdev_linux_cache_cb, NULL);
/* Counted so that netdev_linux_destroy() knows when to unregister. */
541 cache_notifier_refcount++;
543 netdev_dev = xzalloc(sizeof *netdev_dev);
544 netdev_dev->change_seq = 1;
545 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
547 *netdev_devp = &netdev_dev->netdev_dev;
551 /* For most types of netdevs we open the device for each call of
552 * netdev_open(). However, this is not the case with tap devices,
553 * since it is only possible to open the device once. In this
554 * situation we share a single file descriptor, and consequently
555 * buffers, across all readers. Therefore once data is read it will
556 * be unavailable to other reads for tap devices. */
/* Creates tap device 'name': opens "/dev/net/tun", issues TUNSETIFF with
 * IFF_TAP | IFF_NO_PI, makes the descriptor non-blocking, and initializes the
 * device with 'netdev_tap_class' ('class' itself is unused).  The new device
 * is stored in '*netdev_devp'. */
558 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
559 const char *name, struct netdev_dev **netdev_devp)
561 struct netdev_dev_linux *netdev_dev;
562 struct tap_state *state;
563 static const char tap_dev[] = "/dev/net/tun";
567 netdev_dev = xzalloc(sizeof *netdev_dev);
568 state = &netdev_dev->state.tap;
570 /* Open tap device. */
571 state->fd = open(tap_dev, O_RDWR);
574 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
578 /* Create tap device. */
579 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
580 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
581 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
582 VLOG_WARN("%s: creating tap device failed: %s", name,
588 /* Make non-blocking. */
589 error = set_nonblocking(state->fd);
594 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
595 *netdev_devp = &netdev_dev->netdev_dev;
/* Releases the tap state of 'netdev_dev'.  Acts only when the tap descriptor
 * is valid (>= 0).
 * NOTE(review): the action taken on the descriptor is not visible here. */
604 destroy_tap(struct netdev_dev_linux *netdev_dev)
606 struct tap_state *state = &netdev_dev->state.tap;
608 if (state->fd >= 0) {
613 /* Destroys the netdev device 'netdev_dev_'. */
/* Destroys any attached traffic-control state through its 'tc_destroy'
 * callback (which may be null).  System and internal devices then drop their
 * reference on the shared link-change notifier, which is unregistered when
 * the last reference goes away.  Tap devices release their tap state
 * instead. */
615 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
617 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
618 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
620 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
621 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
624 if (class == &netdev_linux_class || class == &netdev_internal_class) {
625 cache_notifier_refcount--;
627 if (!cache_notifier_refcount) {
628 rtnetlink_link_notifier_unregister(&netdev_linux_cache_notifier);
630 } else if (class == &netdev_tap_class) {
631 destroy_tap(netdev_dev);
/* Opens a new handle on 'netdev_dev_' and stores it in '*netdevp'.  For
 * non-internal devices the device's existence is checked by reading its
 * flags.  The first handle opened on a "tap" device receives the tap file
 * descriptor.  If opening fails, the handle is released with
 * netdev_uninit(). */
640 netdev_linux_open(struct netdev_dev *netdev_dev_, struct netdev **netdevp)
642 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
643 struct netdev_linux *netdev;
644 enum netdev_flags flags;
647 /* Allocate network device. */
648 netdev = xzalloc(sizeof *netdev);
650 netdev_init(&netdev->netdev, netdev_dev_);
652 /* Verify that the device really exists, by attempting to read its flags.
653 * (The flags might be cached, in which case this won't actually do an
 * ioctl.)
 *
656 * Don't do this for "internal" netdevs, though, because those have to be
657 * created as netdev objects before they exist in the kernel, because
658 * creating them in the kernel happens by passing a netdev object to
659 * dpif_port_add(). */
660 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
661 error = netdev_get_flags(&netdev->netdev, &flags);
662 if (error == ENODEV) {
667 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
668 !netdev_dev->state.tap.opened) {
670 /* We assume that the first user of the tap device is the primary user
671 * and give them the tap FD. Subsequent users probably just expect
672 * this to be a system device so open it normally to avoid send/receive
673 * directions appearing to be reversed. */
674 netdev->fd = netdev_dev->state.tap.fd;
675 netdev_dev->state.tap.opened = true;
678 *netdevp = &netdev->netdev;
682 netdev_uninit(&netdev->netdev, true);
686 /* Closes and destroys 'netdev'. */
/* The guarded branch applies to handles that hold a valid descriptor and whose
 * device type is not "tap" (tap handles borrow the descriptor stored in the
 * device's tap state; see netdev_linux_open()).  A descriptor is valid when it
 * is >= 0, matching the checks used by the other functions in this file;
 * descriptor 0 is a legitimate value. */
688 netdev_linux_close(struct netdev *netdev_)
690 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
692 if (netdev->fd >= 0 && strcmp(netdev_get_type(netdev_), "tap")) {
698 /* Initializes 'sset' with a list of the names of all known network devices. */
/* The names come from if_nameindex(); each one is added to 'sset' and the
 * array is released with if_freenameindex().  A warning is logged when the
 * list cannot be obtained. */
700 netdev_linux_enumerate(struct sset *sset)
702 struct if_nameindex *names;
704 names = if_nameindex();
/* The array returned by if_nameindex() ends with a null 'if_name'. */
708 for (i = 0; names[i].if_name != NULL; i++) {
709 sset_add(sset, names[i].if_name);
711 if_freenameindex(names);
714 VLOG_WARN("could not obtain list of network device names: %s",
/* Prepares 'netdev_' for receiving packets.  The first check is on whether
 * the handle already has a descriptor (fd >= 0).  The remaining steps create
 * a PF_PACKET raw socket, make it non-blocking, and bind it to the device's
 * ifindex for all protocols (ETH_P_ALL).  Failures are logged. */
721 netdev_linux_listen(struct netdev *netdev_)
723 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
724 struct sockaddr_ll sll;
729 if (netdev->fd >= 0) {
733 /* Create file descriptor. */
734 fd = socket(PF_PACKET, SOCK_RAW, 0);
737 VLOG_ERR("failed to create raw socket (%s)", strerror(error));
741 /* Set non-blocking mode. */
742 error = set_nonblocking(fd);
747 /* Get ethernet device index. */
748 error = get_ifindex(&netdev->netdev, &ifindex);
753 /* Bind to specific ethernet device. */
754 memset(&sll, 0, sizeof sll);
755 sll.sll_family = AF_PACKET;
756 sll.sll_ifindex = ifindex;
757 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
758 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
760 VLOG_ERR("%s: failed to bind raw socket (%s)",
761 netdev_get_name(netdev_), strerror(error));
/* Reads a packet from the descriptor of 'netdev_' into 'data', which has room
 * for 'size' bytes.  A handle without a descriptor (fd < 0) is not listening.
 * A read() failure with EINTR is not treated as an error, and EAGAIN is not
 * logged; any other failure is logged with the device name and the error
 * text. */
776 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
778 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
780 if (netdev->fd < 0) {
781 /* Device is not listening. */
786 ssize_t retval = read(netdev->fd, data, size);
789 } else if (errno != EINTR) {
790 if (errno != EAGAIN) {
/* Argument order follows the format string: device name first, then
 * the error description (same order as in netdev_linux_send()). */
791 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
792 netdev_get_name(netdev_), strerror(errno));
799 /* Registers with the poll loop to wake up from the next call to poll_block()
800 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
/* Does nothing for a handle that has no descriptor (fd < 0). */
802 netdev_linux_recv_wait(struct netdev *netdev_)
804 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
805 if (netdev->fd >= 0) {
806 poll_fd_wait(netdev->fd, POLLIN);
810 /* Discards all packets waiting to be received from 'netdev'. */
/* Three cases: a handle without a descriptor; a "tap" device, for which the
 * queue length obtained with SIOCGIFTXQLEN is passed to drain_fd(); and any
 * other device, which is drained with drain_rcvbuf(). */
812 netdev_linux_drain(struct netdev *netdev_)
814 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
815 if (netdev->fd < 0) {
817 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
819 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
820 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
824 drain_fd(netdev->fd, ifr.ifr_qlen);
827 return drain_rcvbuf(netdev->fd);
831 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
832 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
833 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
834 * the packet is too big or too small to transmit on the device.
 *
836 * The caller retains ownership of 'buffer' in all cases.
 *
838 * The kernel maintains a packet transmission queue, so the caller is not
839 * expected to do additional queuing of packets. */
/* Here the buffer is passed as 'data' with length 'size'.  A handle without
 * its own descriptor sends through the shared AF_PACKET socket with
 * sendmsg(), addressed by ifindex; otherwise the handle's descriptor is
 * written directly. */
841 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
843 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
847 if (netdev->fd < 0) {
848 /* Use our AF_PACKET socket to send to this device. */
849 struct sockaddr_ll sll;
856 sock = af_packet_sock();
861 error = get_ifindex(netdev_, &ifindex);
866 /* We don't bother setting most fields in sockaddr_ll because the
867 * kernel ignores them for SOCK_RAW. */
868 memset(&sll, 0, sizeof sll);
869 sll.sll_family = AF_PACKET;
870 sll.sll_ifindex = ifindex;
/* The cast drops 'const' because iov_base is a plain 'void *'. */
872 iov.iov_base = (void *) data;
876 msg.msg_namelen = sizeof sll;
879 msg.msg_control = NULL;
880 msg.msg_controllen = 0;
883 retval = sendmsg(sock, &msg, 0);
885 /* Use the netdev's own fd to send to this device. This is
886 * essential for tap devices, because packets sent to a tap device
887 * with an AF_PACKET socket will loop back to be *received* again
888 * on the tap device. */
889 retval = write(netdev->fd, data, size);
893 /* The Linux AF_PACKET implementation never blocks waiting for room
894 * for packets, instead returning ENOBUFS. Translate this into
895 * EAGAIN for the caller. */
896 if (errno == ENOBUFS) {
898 } else if (errno == EINTR) {
900 } else if (errno != EAGAIN) {
901 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
902 netdev_get_name(netdev_), strerror(errno));
/* Fewer bytes accepted than requested: log the partial send. */
905 } else if (retval != size) {
906 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
907 "%zu) on %s", retval, size, netdev_get_name(netdev_));
915 /* Registers with the poll loop to wake up from the next call to poll_block()
916 * when the packet transmission queue has sufficient room to transmit a packet
917 * with netdev_send().
 *
919 * The kernel maintains a packet transmission queue, so the client is not
920 * expected to do additional queuing of packets. Thus, this function is
921 * unlikely to ever be used. It is included for completeness. */
/* Non-tap handles that own a descriptor wait for POLLOUT on it; tap handles
 * request an immediate wake-up. */
923 netdev_linux_send_wait(struct netdev *netdev_)
925 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
926 if (netdev->fd < 0) {
928 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
929 poll_fd_wait(netdev->fd, POLLOUT);
931 /* TAP device always accepts packets.*/
932 poll_immediate_wake();
936 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
937 * otherwise a positive errno value. */
/* The kernel is only asked to change the address when no valid cached address
 * exists or the cached address differs from 'mac'.  The cache is then updated
 * to hold 'mac' and marked valid. */
939 netdev_linux_set_etheraddr(struct netdev *netdev_,
940 const uint8_t mac[ETH_ADDR_LEN])
942 struct netdev_dev_linux *netdev_dev =
943 netdev_dev_linux_cast(netdev_get_dev(netdev_));
946 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
947 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
948 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
950 netdev_dev->cache_valid |= VALID_ETHERADDR;
951 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
959 /* Copies 'netdev''s MAC address into the caller-provided buffer 'mac'.  If
 * the cached address is not valid it is first fetched with get_etheraddr()
 * and the cache is marked valid. */
962 netdev_linux_get_etheraddr(const struct netdev *netdev_,
963 uint8_t mac[ETH_ADDR_LEN])
965 struct netdev_dev_linux *netdev_dev =
966 netdev_dev_linux_cast(netdev_get_dev(netdev_));
967 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
968 int error = get_etheraddr(netdev_get_name(netdev_),
969 netdev_dev->etheraddr);
973 netdev_dev->cache_valid |= VALID_ETHERADDR;
/* Hand the caller a copy of the cached address. */
975 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
979 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
980 * in bytes, not including the hardware header; thus, this is typically 1500
981 * bytes for Ethernet devices. */
/* The value is stored in '*mtup'.  It is read with the SIOCGIFMTU ioctl when
 * no valid cached value exists, and cached afterwards. */
983 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
985 struct netdev_dev_linux *netdev_dev =
986 netdev_dev_linux_cast(netdev_get_dev(netdev_));
987 if (!(netdev_dev->cache_valid & VALID_MTU)) {
991 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
992 SIOCGIFMTU, "SIOCGIFMTU");
996 netdev_dev->mtu = ifr.ifr_mtu;
997 netdev_dev->cache_valid |= VALID_MTU;
999 *mtup = netdev_dev->mtu;
1003 /* Sets the maximum size of transmitted (MTU) for given device using linux
1004 * networking ioctl interface.
1007 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1009 struct netdev_dev_linux *netdev_dev =
1010 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1015 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1016 SIOCSIFMTU, "SIOCSIFMTU");
1021 netdev_dev->mtu = ifr.ifr_mtu;
1022 netdev_dev->cache_valid |= VALID_MTU;
1026 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1027 * On failure, returns a negative errno value. */
1029 netdev_linux_get_ifindex(const struct netdev *netdev)
1033 error = get_ifindex(netdev, &ifindex);
/* get_ifindex() reports a positive errno value; negate it for the caller. */
1034 return error ? -error : ifindex;
/* Reports the link carrier state of 'netdev_' in '*carrier'.
 *
 * When MII monitoring is enabled ('miimon_interval' > 0) the result of the
 * last MII poll is used.  Otherwise the state is read from
 * /sys/class/net/<name>/carrier, whose first character must be '0' or '1',
 * and cached under VALID_CARRIER. */
1038 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1040 struct netdev_dev_linux *netdev_dev =
1041 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1046 if (netdev_dev->miimon_interval > 0) {
1047 *carrier = netdev_dev->miimon;
1051 if (!(netdev_dev->cache_valid & VALID_CARRIER)) {
1055 fn = xasprintf("/sys/class/net/%s/carrier",
1056 netdev_get_name(netdev_));
1057 fd = open(fn, O_RDONLY);
1060 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
1064 retval = read(fd, line, sizeof line);
1067 if (error == EINVAL) {
1068 /* This is the normal return value when we try to check carrier
1069 * if the network device is not up. */
1071 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
1074 } else if (retval == 0) {
1076 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
/* Anything other than '0' or '1' is rejected with a warning. */
1080 if (line[0] != '0' && line[0] != '1') {
1082 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)",
1086 netdev_dev->carrier = line[0] != '0';
1087 netdev_dev->cache_valid |= VALID_CARRIER;
1089 *carrier = netdev_dev->carrier;
/* Issues MII ioctl 'cmd' (named 'cmd_name' for logging) on device 'name'.
 * The request in '*data' is copied into a zeroed ifreq at the location of its
 * 'ifr_data' member before the call, and copied back into '*data'
 * afterwards. */
1101 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1102 struct mii_ioctl_data *data)
1107 memset(&ifr, 0, sizeof ifr);
/* The MII request occupies the storage of 'ifr_data' itself; it is not
 * passed through the pointer. */
1108 memcpy(&ifr.ifr_data, data, sizeof *data);
1109 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1110 memcpy(data, &ifr.ifr_data, sizeof *data);
/* Determines the link status of device 'name' and stores it in '*miimon'.
 *
 * The MII path issues SIOCGMIIPHY, then reads register MII_BMSR with
 * SIOCGMIIREG and tests BMSR_LSTATUS.  The fallback path uses ethtool's
 * ETHTOOL_GLINK and treats a nonzero 'data' value as link up.
 * NOTE(review): the conditions that select each path are not visible
 * here. */
1116 netdev_linux_get_miimon(const char *name, bool *miimon)
1118 struct mii_ioctl_data data;
1123 memset(&data, 0, sizeof data);
1124 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1126 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1127 data.reg_num = MII_BMSR;
1128 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1132 *miimon = !!(data.val_out & BMSR_LSTATUS);
1134 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1137 struct ethtool_cmd ecmd;
1139 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1142 memset(&ecmd, 0, sizeof ecmd);
1143 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1146 struct ethtool_value eval;
/* The ETHTOOL_GLINK result is read back as a 'struct ethtool_value'. */
1148 memcpy(&eval, &ecmd, sizeof eval);
1149 *miimon = !!eval.data;
1151 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
/* Sets the MII polling interval of 'netdev_' to 'interval'.  Positive values
 * are raised to at least 100; zero and negative values become 0, which
 * disables polling.  When the stored interval changes, the polling timer is
 * marked expired. */
1159 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1160 long long int interval)
1162 struct netdev_dev_linux *netdev_dev;
1164 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
1166 interval = interval > 0 ? MAX(interval, 100) : 0;
1167 if (netdev_dev->miimon_interval != interval) {
1168 netdev_dev->miimon_interval = interval;
1169 timer_set_expired(&netdev_dev->miimon_timer);
/* Polls MII link status for the devices of 'netdev_linux_class'.  Devices with
 * polling disabled, or whose timer has not expired, are filtered by the check
 * below.  For the others the link status is queried; a status different from
 * the stored one is saved and the device is marked changed.  The timer is then
 * restarted with the device's interval. */
1176 netdev_linux_miimon_run(void)
1178 struct shash device_shash;
1179 struct shash_node *node;
1181 shash_init(&device_shash);
1182 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1183 SHASH_FOR_EACH (node, &device_shash) {
1184 struct netdev_dev_linux *dev = node->data;
1187 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1191 netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
1192 if (miimon != dev->miimon) {
1193 dev->miimon = miimon;
1194 netdev_dev_linux_changed(dev);
1197 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1200 shash_destroy(&device_shash);
/* Calls timer_wait() on the MII polling timer of every device of
 * 'netdev_linux_class' that has polling enabled ('miimon_interval' > 0). */
1204 netdev_linux_miimon_wait(void)
1206 struct shash device_shash;
1207 struct shash_node *node;
1209 shash_init(&device_shash);
1210 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1211 SHASH_FOR_EACH (node, &device_shash) {
1212 struct netdev_dev_linux *dev = node->data;
1214 if (dev->miimon_interval > 0) {
1215 timer_wait(&dev->miimon_timer);
1218 shash_destroy(&device_shash);
1221 /* Check whether we can use RTM_GETLINK to get network device statistics.
1222 * In pre-2.6.19 kernels, this was only available with wireless extensions. */
1225 check_for_working_netlink_stats(void)
/* Probes get_stats_via_netlink() on the loopback device "lo" and logs which
 * statistics source will be used.  netdev_linux_get_stats() stores the result
 * in 'use_netlink_stats' and uses the netlink path when it is nonzero.
 * NOTE(review): the return statements are not visible in this listing. */
1227 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1228 * preferable, so if that works, we'll use it. */
1229 int ifindex = do_get_ifindex("lo");
1231 VLOG_WARN("failed to get ifindex for lo, "
1232 "obtaining netdev stats from proc");
1235 struct netdev_stats stats;
/* 'stats' is only a scratch buffer; just the error code of the probe is
 * examined in the visible code. */
1236 int error = get_stats_via_netlink(ifindex, &stats);
1238 VLOG_DBG("obtaining netdev stats via rtnetlink");
1241 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1242 "via proc (you are probably running a pre-2.6.19 "
1243 "kernel)", strerror(error));
1249 /* Brings the 'is_internal' and 'is_tap' members of 'netdev_dev' up-to-date.
 *
 * The members are recomputed only while VALID_IS_PSEUDO is clear in
 * 'cache_valid'; the bit is set afterwards.  'is_tap' is true when the device
 * type string equals "tap".  'is_internal' is true for a non-tap device that
 * dpif_linux_is_internal_device() reports as internal. */
1251 netdev_linux_update_is_pseudo(struct netdev_dev_linux *netdev_dev)
1253 if (!(netdev_dev->cache_valid & VALID_IS_PSEUDO)) {
1254 const char *name = netdev_dev_get_name(&netdev_dev->netdev_dev);
1255 const char *type = netdev_dev_get_type(&netdev_dev->netdev_dev);
1257 netdev_dev->is_tap = !strcmp(type, "tap");
/* The '&&' short-circuits, so the dpif query is made only for non-tap
 * devices. */
1258 netdev_dev->is_internal = (!netdev_dev->is_tap
1259 && dpif_linux_is_internal_device(name));
1260 netdev_dev->cache_valid |= VALID_IS_PSEUDO;
/* Takes pointers to two uint64_t values.  netdev_linux_get_stats() calls it
 * on matching rx/tx counter pairs.
 * NOTE(review): the body is not visible in this listing; judging by the name
 * it exchanges '*a' and '*b'. */
1265 swap_uint64(uint64_t *a, uint64_t *b)
1272 /* Retrieves current device stats for 'netdev'.
 *
 * netdev_vport_get_stats() is tried while it has worked before, or when it
 * has not been tried yet (VALID_HAVE_VPORT_STATS clear); the outcome is
 * cached in 'have_vport_stats'.  Without vport stats the function falls back
 * to get_stats_via_netlink() or get_stats_via_proc().  That choice is made
 * once by check_for_working_netlink_stats() and kept in a function-local
 * static.
 *
 * For internal and tap devices whose stats did not come from the vport layer,
 * rx and tx packets/bytes/errors/dropped are exchanged and the detailed error
 * counters are zeroed. */
1274 netdev_linux_get_stats(const struct netdev *netdev_,
1275 struct netdev_stats *stats)
1277 struct netdev_dev_linux *netdev_dev =
1278 netdev_dev_linux_cast(netdev_get_dev(netdev_));
/* -1 means "not probed yet"; see the 'use_netlink_stats < 0' test below. */
1279 static int use_netlink_stats = -1;
1282 if (netdev_dev->have_vport_stats ||
1283 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1285 error = netdev_vport_get_stats(netdev_, stats);
1286 netdev_dev->have_vport_stats = !error;
1287 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
1290 if (!netdev_dev->have_vport_stats) {
1291 if (use_netlink_stats < 0) {
1292 use_netlink_stats = check_for_working_netlink_stats();
1294 if (use_netlink_stats) {
1297 error = get_ifindex(netdev_, &ifindex);
1299 error = get_stats_via_netlink(ifindex, stats);
1302 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1306 /* If this port is an internal port then the transmit and receive stats
1307 * will appear to be swapped relative to the other ports since we are the
1308 * one sending the data, not a remote computer. For consistency, we swap
1309 * them back here. This does not apply if we are getting stats from the
1310 * vport layer because it always tracks stats from the perspective of the
1312 netdev_linux_update_is_pseudo(netdev_dev);
1313 if (!error && !netdev_dev->have_vport_stats &&
1314 (netdev_dev->is_internal || netdev_dev->is_tap)) {
1315 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1316 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1317 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1318 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1319 stats->rx_length_errors = 0;
1320 stats->rx_over_errors = 0;
1321 stats->rx_crc_errors = 0;
1322 stats->rx_frame_errors = 0;
1323 stats->rx_fifo_errors = 0;
1324 stats->rx_missed_errors = 0;
1325 stats->tx_aborted_errors = 0;
1326 stats->tx_carrier_errors = 0;
1327 stats->tx_fifo_errors = 0;
1328 stats->tx_heartbeat_errors = 0;
1329 stats->tx_window_errors = 0;
1335 /* Stores the features supported by 'netdev' into each of '*current',
1336 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1337 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
1338 * successful, otherwise a positive errno value. */
1340 netdev_linux_get_features(const struct netdev *netdev,
1341 uint32_t *current, uint32_t *advertised,
1342 uint32_t *supported, uint32_t *peer)
1344 struct ethtool_cmd ecmd;
/* Everything below is decoded from the single ETHTOOL_GSET reply in 'ecmd'. */
1347 memset(&ecmd, 0, sizeof ecmd);
1348 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1349 ETHTOOL_GSET, "ETHTOOL_GSET");
1354 /* Supported features. */
/* Each SUPPORTED_* bit of 'ecmd.supported' maps to one OFPPF_* bit. */
1356 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1357 *supported |= OFPPF_10MB_HD;
1359 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1360 *supported |= OFPPF_10MB_FD;
1362 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1363 *supported |= OFPPF_100MB_HD;
1365 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1366 *supported |= OFPPF_100MB_FD;
1368 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1369 *supported |= OFPPF_1GB_HD;
1371 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1372 *supported |= OFPPF_1GB_FD;
1374 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1375 *supported |= OFPPF_10GB_FD;
1377 if (ecmd.supported & SUPPORTED_TP) {
1378 *supported |= OFPPF_COPPER;
1380 if (ecmd.supported & SUPPORTED_FIBRE) {
1381 *supported |= OFPPF_FIBER;
1383 if (ecmd.supported & SUPPORTED_Autoneg) {
1384 *supported |= OFPPF_AUTONEG;
1386 if (ecmd.supported & SUPPORTED_Pause) {
1387 *supported |= OFPPF_PAUSE;
1389 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1390 *supported |= OFPPF_PAUSE_ASYM;
1393 /* Advertised features. */
/* Same mapping, from the ADVERTISED_* bits of 'ecmd.advertising'. */
1395 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1396 *advertised |= OFPPF_10MB_HD;
1398 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1399 *advertised |= OFPPF_10MB_FD;
1401 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1402 *advertised |= OFPPF_100MB_HD;
1404 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1405 *advertised |= OFPPF_100MB_FD;
1407 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1408 *advertised |= OFPPF_1GB_HD;
1410 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1411 *advertised |= OFPPF_1GB_FD;
1413 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1414 *advertised |= OFPPF_10GB_FD;
1416 if (ecmd.advertising & ADVERTISED_TP) {
1417 *advertised |= OFPPF_COPPER;
1419 if (ecmd.advertising & ADVERTISED_FIBRE) {
1420 *advertised |= OFPPF_FIBER;
1422 if (ecmd.advertising & ADVERTISED_Autoneg) {
1423 *advertised |= OFPPF_AUTONEG;
1425 if (ecmd.advertising & ADVERTISED_Pause) {
1426 *advertised |= OFPPF_PAUSE;
1428 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1429 *advertised |= OFPPF_PAUSE_ASYM;
1432 /* Current settings. */
/* Speed and duplex select exactly one rate bit; a nonzero 'ecmd.duplex' picks
 * the full-duplex variant.  10G is always reported as full duplex. */
1433 if (ecmd.speed == SPEED_10) {
1434 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1435 } else if (ecmd.speed == SPEED_100) {
1436 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1437 } else if (ecmd.speed == SPEED_1000) {
1438 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1439 } else if (ecmd.speed == SPEED_10000) {
1440 *current = OFPPF_10GB_FD;
1445 if (ecmd.port == PORT_TP) {
1446 *current |= OFPPF_COPPER;
1447 } else if (ecmd.port == PORT_FIBRE) {
1448 *current |= OFPPF_FIBER;
/* NOTE(review): the condition guarding this assignment is not visible in this
 * listing. */
1452 *current |= OFPPF_AUTONEG;
1455 /* Peer advertisements. */
1456 *peer = 0; /* XXX */
1461 /* Set the features advertised by 'netdev' to 'advertise'.
 *
 * 'advertise' is a bitmap of OFPPF_* bits.  The current settings are read
 * with ETHTOOL_GSET, the 'advertising' mask is rebuilt from scratch by
 * translating each OFPPF_* bit to its ADVERTISED_* counterpart, and the
 * structure is written back with ETHTOOL_SSET.  The value returned is that of
 * the ETHTOOL_SSET call. */
1463 netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1465 struct ethtool_cmd ecmd;
1468 memset(&ecmd, 0, sizeof ecmd);
1469 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1470 ETHTOOL_GSET, "ETHTOOL_GSET");
/* Only 'advertising' is replaced; the other fields keep the values just read
 * from the device. */
1475 ecmd.advertising = 0;
1476 if (advertise & OFPPF_10MB_HD) {
1477 ecmd.advertising |= ADVERTISED_10baseT_Half;
1479 if (advertise & OFPPF_10MB_FD) {
1480 ecmd.advertising |= ADVERTISED_10baseT_Full;
1482 if (advertise & OFPPF_100MB_HD) {
1483 ecmd.advertising |= ADVERTISED_100baseT_Half;
1485 if (advertise & OFPPF_100MB_FD) {
1486 ecmd.advertising |= ADVERTISED_100baseT_Full;
1488 if (advertise & OFPPF_1GB_HD) {
1489 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1491 if (advertise & OFPPF_1GB_FD) {
1492 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1494 if (advertise & OFPPF_10GB_FD) {
1495 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1497 if (advertise & OFPPF_COPPER) {
1498 ecmd.advertising |= ADVERTISED_TP;
1500 if (advertise & OFPPF_FIBER) {
1501 ecmd.advertising |= ADVERTISED_FIBRE;
1503 if (advertise & OFPPF_AUTONEG) {
1504 ecmd.advertising |= ADVERTISED_Autoneg;
1506 if (advertise & OFPPF_PAUSE) {
1507 ecmd.advertising |= ADVERTISED_Pause;
1509 if (advertise & OFPPF_PAUSE_ASYM) {
1510 ecmd.advertising |= ADVERTISED_Asym_Pause;
1512 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1513 ETHTOOL_SSET, "ETHTOOL_SSET");
1516 /* If 'netdev_name' is the name of a VLAN network device (e.g. one created with
1517 * vconfig(8)), sets '*vlan_vid' to the VLAN VID associated with that device
1518 * and returns 0. Otherwise returns an errno value (specifically ENOENT if
1519 * 'netdev_name' is the name of a network device that is not a VLAN device) and
1520 * sets '*vlan_vid' to -1.
 *
 * The VID is taken from the first line of /proc/net/vlan/<netdev_name>. */
1522 netdev_linux_get_vlan_vid(const struct netdev *netdev, int *vlan_vid)
1524 const char *netdev_name = netdev_get_name(netdev);
1525 struct ds line = DS_EMPTY_INITIALIZER;
1526 FILE *stream = NULL;
1530 COVERAGE_INC(netdev_get_vlan_vid);
/* 'fn' is heap-allocated by xasprintf().
 * NOTE(review): the matching free is not visible in this listing. */
1531 fn = xasprintf("/proc/net/vlan/%s", netdev_name);
1532 stream = fopen(fn, "r");
/* A nonzero result from ds_get_line() is handled as a read failure: either a
 * stream error or an unexpected end of file. */
1538 if (ds_get_line(&line, stream)) {
1539 if (ferror(stream)) {
1541 VLOG_ERR_RL(&rl, "error reading \"%s\": %s", fn, strerror(errno));
1544 VLOG_ERR_RL(&rl, "unexpected end of file reading \"%s\"", fn);
/* NOTE(review): '!sscanf(...)' detects only a return value of 0.  sscanf()
 * returns EOF, which is nonzero, on an input failure before the first
 * conversion (for instance an empty line), and that case would be accepted
 * here with '*vlan_vid' unset.  Comparing the result against 1 would cover
 * both cases. */
1549 if (!sscanf(ds_cstr(&line), "%*s VID: %d", vlan_vid)) {
1551 VLOG_ERR_RL(&rl, "parse error reading \"%s\" line 1: \"%s\"",
1552 fn, ds_cstr(&line));
/* Shell command templates used by netdev_linux_set_policing(), which formats
 * them with snprintf() and runs them with system().  POLICE_ADD_CMD takes the
 * device name.  POLICE_CONFIG_CMD takes the device name, then the rate (used
 * as "%dkbit") and the burst (used as "%dk"). */
1570 #define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress"
1571 #define POLICE_CONFIG_CMD "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 u32 match ip src 0.0.0.0/0 police rate %dkbit burst %dk mtu 65535 drop flowid :1"
1573 /* Removes ingress policing from 'netdev'. Returns 0 if successful, otherwise a
1574 * positive errno value.
1576 * This function is equivalent to running
1577 * /sbin/tc qdisc del dev %s handle ffff: ingress
1578 * but it is much, much faster. */
1581 netdev_linux_remove_policing(struct netdev *netdev)
1583 struct netdev_dev_linux *netdev_dev =
1584 netdev_dev_linux_cast(netdev_get_dev(netdev));
1585 const char *netdev_name = netdev_get_name(netdev);
1587 struct ofpbuf request;
1588 struct tcmsg *tcmsg;
/* Build an RTM_DELQDISC request for the "ingress" qdisc: handle ffff:0,
 * parent TC_H_INGRESS, with an empty TCA_OPTIONS attribute. */
1591 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
1595 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
1596 tcmsg->tcm_parent = TC_H_INGRESS;
1597 nl_msg_put_string(&request, TCA_KIND, "ingress");
1598 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
1600 error = tc_transact(&request, NULL);
/* ENOENT and EINVAL are not logged; presumably they mean that there was no
 * ingress qdisc to delete (TODO confirm). */
1601 if (error && error != ENOENT && error != EINVAL) {
1602 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1603 netdev_name, strerror(error));
/* Record "no policing" in the cache that netdev_linux_set_policing()
 * consults. */
1607 netdev_dev->kbits_rate = 0;
1608 netdev_dev->kbits_burst = 0;
1609 netdev_dev->cache_valid |= VALID_POLICING;
1613 /* Attempts to set input rate limiting (policing) policy.
 *
 * 'kbits_rate' and 'kbits_burst' are the requested rate and burst.  A zero
 * rate forces the burst to 0, and a zero burst with a nonzero rate defaults
 * to 1000.  Nothing is reconfigured when the cached values (VALID_POLICING)
 * already match.  Otherwise any existing policing is removed first and the
 * POLICE_ADD_CMD and POLICE_CONFIG_CMD shell commands are run. */
1615 netdev_linux_set_policing(struct netdev *netdev,
1616 uint32_t kbits_rate, uint32_t kbits_burst)
1618 struct netdev_dev_linux *netdev_dev =
1619 netdev_dev_linux_cast(netdev_get_dev(netdev));
1620 const char *netdev_name = netdev_get_name(netdev);
1623 COVERAGE_INC(netdev_set_policing);
1625 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1626 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1627 : kbits_burst); /* Stick with user-specified value. */
1629 if (netdev_dev->cache_valid & VALID_POLICING
1630 && netdev_dev->kbits_rate == kbits_rate
1631 && netdev_dev->kbits_burst == kbits_burst) {
1632 /* Assume that settings haven't changed since we last set them. */
1636 netdev_linux_remove_policing(netdev);
/* NOTE(review): the device name is interpolated unquoted into a command line
 * that system() hands to the shell.  Confirm that device names can never
 * contain shell metacharacters, or replace this with a netlink request as
 * netdev_linux_remove_policing() does. */
1638 snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
1639 if (system(command) != 0) {
1640 VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
/* NOTE(review): POLICE_CONFIG_CMD formats both values with "%d" although they
 * are uint32_t; values above INT_MAX would print as negative numbers. */
1644 snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
1645 kbits_rate, kbits_burst);
1646 if (system(command) != 0) {
1647 VLOG_WARN_RL(&rl, "%s: problem configuring policing",
/* Remember what was configured so that an identical request can return
 * early. */
1652 netdev_dev->kbits_rate = kbits_rate;
1653 netdev_dev->kbits_burst = kbits_burst;
1654 netdev_dev->cache_valid |= VALID_POLICING;
/* Adds to 'types' the 'ovs_name' of every entry in the NULL-terminated 'tcs'
 * array that has a 'tc_install' callback and a nonempty name.  'netdev' is
 * unused. */
1661 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1664 const struct tc_ops **opsp;
1666 for (opsp = tcs; *opsp != NULL; opsp++) {
1667 const struct tc_ops *ops = *opsp;
1668 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1669 sset_add(types, ops->ovs_name);
/* Scans the NULL-terminated 'tcs' array for an entry whose 'ovs_name' equals
 * 'name'.  NOTE(review): the return statements are not visible in this
 * listing; callers test the result against NULL. */
1675 static const struct tc_ops *
1676 tc_lookup_ovs_name(const char *name)
1678 const struct tc_ops **opsp;
1680 for (opsp = tcs; *opsp != NULL; opsp++) {
1681 const struct tc_ops *ops = *opsp;
1682 if (!strcmp(name, ops->ovs_name)) {
/* Scans the NULL-terminated 'tcs' array for an entry whose 'linux_name'
 * equals 'name'.  Entries with a null 'linux_name' never match. */
1689 static const struct tc_ops *
1690 tc_lookup_linux_name(const char *name)
1692 const struct tc_ops **opsp;
1694 for (opsp = tcs; *opsp != NULL; opsp++) {
1695 const struct tc_ops *ops = *opsp;
1696 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Searches the 'hash' bucket of the 'queues' hmap of the traffic-control
 * state of 'netdev' for a queue whose 'queue_id' equals 'queue_id'.
 * 'netdev_dev->tc' is dereferenced without a null check here; the callers in
 * this file run tc_query_qdisc() first. */
1703 static struct tc_queue *
1704 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1707 struct netdev_dev_linux *netdev_dev =
1708 netdev_dev_linux_cast(netdev_get_dev(netdev));
1709 struct tc_queue *queue;
1711 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1712 if (queue->queue_id == queue_id) {
/* Looks up 'queue_id' on 'netdev' with tc_find_queue__(), using
 * hash_int(queue_id, 0) as the hash. */
1719 static struct tc_queue *
1720 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1722 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Looks up the QoS type named 'type' with tc_lookup_ovs_name() and copies its
 * 'n_queues' into 'caps->n_queues'.  'netdev' is unused.
 * NOTE(review): the handling of an unknown 'type' is not visible in this
 * listing. */
1726 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1728 struct netdev_qos_capabilities *caps)
1730 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1734 caps->n_queues = ops->n_queues;
/* Queries the qdisc of 'netdev' with tc_query_qdisc(), stores the
 * 'ovs_name' of its implementation in '*typep', and, when the implementation
 * provides 'qdisc_get', calls it with 'details'. */
1739 netdev_linux_get_qos(const struct netdev *netdev,
1740 const char **typep, struct shash *details)
1742 struct netdev_dev_linux *netdev_dev =
1743 netdev_dev_linux_cast(netdev_get_dev(netdev));
1746 error = tc_query_qdisc(netdev);
1751 *typep = netdev_dev->tc->ops->ovs_name;
/* 'qdisc_get' is optional; the alternative arm of this conditional is not
 * visible in this listing. */
1752 return (netdev_dev->tc->ops->qdisc_get
1753 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
/* Configures QoS of type 'type' on 'netdev' using 'details'.
 *
 * 'type' must name an implementation that has a 'tc_install' callback.  If
 * the device already uses that implementation, only its optional 'qdisc_set'
 * callback is invoked (0 is returned when there is none).  Otherwise the
 * existing qdisc is deleted and the new one is installed. */
1758 netdev_linux_set_qos(struct netdev *netdev,
1759 const char *type, const struct shash *details)
1761 struct netdev_dev_linux *netdev_dev =
1762 netdev_dev_linux_cast(netdev_get_dev(netdev));
1763 const struct tc_ops *new_ops;
1766 new_ops = tc_lookup_ovs_name(type);
1767 if (!new_ops || !new_ops->tc_install) {
1771 error = tc_query_qdisc(netdev);
1776 if (new_ops == netdev_dev->tc->ops) {
1777 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1779 /* Delete existing qdisc. */
1780 error = tc_del_qdisc(netdev);
/* The code relies on tc_del_qdisc() having cleared 'netdev_dev->tc'. */
1784 assert(netdev_dev->tc == NULL);
1786 /* Install new qdisc. */
1787 error = new_ops->tc_install(netdev, details);
/* Invariant: 'tc_install' sets 'netdev_dev->tc' exactly when it succeeds. */
1788 assert((error == 0) == (netdev_dev->tc != NULL));
/* Queries the qdisc of 'netdev', looks up queue 'queue_id' with
 * tc_find_queue(), and passes it with 'details' to the 'class_get' callback
 * of the qdisc implementation.
 * NOTE(review): the condition of the final conditional expression and its
 * alternative arm are not visible in this listing. */
1795 netdev_linux_get_queue(const struct netdev *netdev,
1796 unsigned int queue_id, struct shash *details)
1798 struct netdev_dev_linux *netdev_dev =
1799 netdev_dev_linux_cast(netdev_get_dev(netdev));
1802 error = tc_query_qdisc(netdev);
1806 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1808 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
/* Queries the qdisc of 'netdev' and forwards 'queue_id' and 'details' to the
 * 'class_set' callback of the qdisc implementation.  A separate branch
 * handles a 'queue_id' at or above the implementation's 'n_queues' and a
 * missing 'class_set' callback; the value returned there is not visible in
 * this listing. */
1814 netdev_linux_set_queue(struct netdev *netdev,
1815 unsigned int queue_id, const struct shash *details)
1817 struct netdev_dev_linux *netdev_dev =
1818 netdev_dev_linux_cast(netdev_get_dev(netdev));
1821 error = tc_query_qdisc(netdev);
1824 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1825 || !netdev_dev->tc->ops->class_set) {
1829 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
/* Queries the qdisc of 'netdev', looks up queue 'queue_id' with
 * tc_find_queue(), and passes it to the 'class_delete' callback of the qdisc
 * implementation.  An implementation without 'class_delete' is handled by a
 * separate branch whose body is not visible in this listing. */
1833 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1835 struct netdev_dev_linux *netdev_dev =
1836 netdev_dev_linux_cast(netdev_get_dev(netdev));
1839 error = tc_query_qdisc(netdev);
1842 } else if (!netdev_dev->tc->ops->class_delete) {
1845 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1847 ? netdev_dev->tc->ops->class_delete(netdev, queue)
/* Queries the qdisc of 'netdev', looks up queue 'queue_id' with
 * tc_find_queue(), and passes it with 'stats' to the 'class_get_stats'
 * callback of the qdisc implementation.  An implementation without
 * 'class_get_stats' is handled by a separate branch whose body is not visible
 * in this listing. */
1853 netdev_linux_get_queue_stats(const struct netdev *netdev,
1854 unsigned int queue_id,
1855 struct netdev_queue_stats *stats)
1857 struct netdev_dev_linux *netdev_dev =
1858 netdev_dev_linux_cast(netdev_get_dev(netdev));
1861 error = tc_query_qdisc(netdev);
1864 } else if (!netdev_dev->tc->ops->class_get_stats) {
1867 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1869 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
/* Starts a Netlink dump of the traffic classes of 'netdev': builds an
 * RTM_GETTCLASS request with 'tcm_parent' 0, starts 'dump' on 'rtnl_sock',
 * and releases the request buffer.
 * netdev_linux_dump_queue_stats() treats a zero result as failure. */
1875 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1877 struct ofpbuf request;
1878 struct tcmsg *tcmsg;
1880 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1884 tcmsg->tcm_parent = 0;
1885 nl_dump_start(dump, rtnl_sock, &request);
1886 ofpbuf_uninit(&request);
/* Iterates over the queues in the 'queues' hmap of the traffic-control state
 * of 'netdev'.  For each queue, 'details' is cleared and refilled by the
 * 'class_get' callback of the qdisc implementation, and 'cb' is invoked with
 * the queue id, the details and 'aux'.
 * NOTE(review): the condition under which 'cb' is invoked is not visible in
 * this listing. */
1891 netdev_linux_dump_queues(const struct netdev *netdev,
1892 netdev_dump_queues_cb *cb, void *aux)
1894 struct netdev_dev_linux *netdev_dev =
1895 netdev_dev_linux_cast(netdev_get_dev(netdev));
1896 struct tc_queue *queue;
1897 struct shash details;
1901 error = tc_query_qdisc(netdev);
1904 } else if (!netdev_dev->tc->ops->class_get) {
/* A single shash is reused for every queue. */
1909 shash_init(&details);
1910 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
1911 shash_clear(&details);
1913 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
1915 (*cb)(queue->queue_id, &details, aux);
1920 shash_destroy(&details);
/* Dumps the traffic classes of 'netdev' over Netlink and passes every reply,
 * together with 'cb' and 'aux', to the 'class_dump_stats' callback of the
 * qdisc implementation.
 *
 * An error from nl_dump_done() takes precedence in the return value;
 * otherwise 'last_error' is returned.
 * NOTE(review): the assignment to 'last_error' is not visible in this
 * listing. */
1926 netdev_linux_dump_queue_stats(const struct netdev *netdev,
1927 netdev_dump_queue_stats_cb *cb, void *aux)
1929 struct netdev_dev_linux *netdev_dev =
1930 netdev_dev_linux_cast(netdev_get_dev(netdev));
1931 struct nl_dump dump;
1936 error = tc_query_qdisc(netdev);
1939 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1944 if (!start_queue_dump(netdev, &dump)) {
1947 while (nl_dump_next(&dump, &msg)) {
1948 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1954 error = nl_dump_done(&dump);
1955 return error ? error : last_error;
/* Stores the IPv4 address and netmask of 'netdev_' in '*address' and
 * '*netmask'.
 *
 * The values are fetched with SIOCGIFADDR and SIOCGIFNETMASK while VALID_IN4
 * is clear in the device's 'cache_valid', then served from the cache.
 * Returns EADDRNOTAVAIL when the address is INADDR_ANY, otherwise 0. */
1959 netdev_linux_get_in4(const struct netdev *netdev_,
1960 struct in_addr *address, struct in_addr *netmask)
1962 struct netdev_dev_linux *netdev_dev =
1963 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1965 if (!(netdev_dev->cache_valid & VALID_IN4)) {
1968 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
1969 SIOCGIFADDR, "SIOCGIFADDR");
1974 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
1975 SIOCGIFNETMASK, "SIOCGIFNETMASK");
1980 netdev_dev->cache_valid |= VALID_IN4;
1982 *address = netdev_dev->address;
1983 *netmask = netdev_dev->netmask;
1984 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
/* Assigns IPv4 'address' and 'netmask' to 'netdev_'.
 *
 * The address is set with SIOCSIFADDR and the cached address and netmask are
 * updated.  The netmask is pushed to the kernel with SIOCSIFNETMASK only when
 * 'address' is not INADDR_ANY.
 * NOTE(review): the condition guarding the cache update is not visible in
 * this listing. */
1988 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
1989 struct in_addr netmask)
1991 struct netdev_dev_linux *netdev_dev =
1992 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1995 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
1997 netdev_dev->cache_valid |= VALID_IN4;
1998 netdev_dev->address = address;
1999 netdev_dev->netmask = netmask;
2000 if (address.s_addr != INADDR_ANY) {
2001 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2002 "SIOCSIFNETMASK", netmask);
/* Parses one line in the format that netdev_linux_get_in6() reads from
 * /proc/net/if_inet6: 16 two-digit hexadecimal bytes stored into
 * 'in6->s6_addr', four hexadecimal fields that are skipped, and an interface
 * name of at most 16 characters stored into 'ifname'.
 * netdev_linux_get_in6() treats a nonzero result as a successful parse. */
2009 parse_if_inet6_line(const char *line,
2010 struct in6_addr *in6, char ifname[16 + 1])
2012 uint8_t *s6 = in6->s6_addr;
/* X8 scans exactly one byte written as at most two hexadecimal digits. */
2013 #define X8 "%2"SCNx8
2015 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2016 "%*x %*x %*x %*x %16s\n",
2017 &s6[0], &s6[1], &s6[2], &s6[3],
2018 &s6[4], &s6[5], &s6[6], &s6[7],
2019 &s6[8], &s6[9], &s6[10], &s6[11],
2020 &s6[12], &s6[13], &s6[14], &s6[15],
2024 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2025 * 'in6' is non-null) and returns true. Otherwise, returns false.
 *
 * The address is looked up in /proc/net/if_inet6 while VALID_IN6 is clear in
 * the device's 'cache_valid', and is served from the cache afterwards.  The
 * cached value starts out as in6addr_any. */
2027 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2029 struct netdev_dev_linux *netdev_dev =
2030 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2031 if (!(netdev_dev->cache_valid & VALID_IN6)) {
2035 netdev_dev->in6 = in6addr_any;
2037 file = fopen("/proc/net/if_inet6", "r");
2039 const char *name = netdev_get_name(netdev_);
/* Scan for a line that parses and whose interface name matches this
 * device. */
2040 while (fgets(line, sizeof line, file)) {
2041 struct in6_addr in6_tmp;
2042 char ifname[16 + 1];
2043 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2044 && !strcmp(name, ifname))
2046 netdev_dev->in6 = in6_tmp;
2052 netdev_dev->cache_valid |= VALID_IN6;
2054 *in6 = netdev_dev->in6;
/* Fills '*sa' with an AF_INET socket address holding 'addr'.  The address is
 * first built in a zeroed struct sockaddr_in, then '*sa' is zeroed and the
 * sockaddr_in is copied over it. */
2059 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2061 struct sockaddr_in sin;
2062 memset(&sin, 0, sizeof sin);
2063 sin.sin_family = AF_INET;
2064 sin.sin_addr = addr;
2067 memset(sa, 0, sizeof *sa);
2068 memcpy(sa, &sin, sizeof sin);
/* Issues the address-setting ioctl 'ioctl_nr' for 'netdev' with 'addr'
 * encoded as an AF_INET socket address in 'ifr.ifr_addr'.  'ioctl_name' is
 * not used in the visible lines (presumably it is passed on for logging).
 * Returns the result of netdev_linux_do_ioctl(). */
2072 do_set_addr(struct netdev *netdev,
2073 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2076 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2077 make_in4_sockaddr(&ifr.ifr_addr, addr);
2079 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2083 /* Adds 'router' as a default IP gateway.
 *
 * The route is installed with the SIOCADDRT ioctl on 'af_inet_sock':
 * destination 0.0.0.0, netmask 0.0.0.0, gateway 'router', flags
 * RTF_UP | RTF_GATEWAY.  'netdev' is unused. */
2085 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2087 struct in_addr any = { INADDR_ANY };
2091 memset(&rt, 0, sizeof rt);
2092 make_in4_sockaddr(&rt.rt_dst, any);
2093 make_in4_sockaddr(&rt.rt_gateway, router);
2094 make_in4_sockaddr(&rt.rt_genmask, any);
2095 rt.rt_flags = RTF_UP | RTF_GATEWAY;
/* 'errno' is read in the same expression as the ioctl, before any other call
 * can overwrite it. */
2096 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2098 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
/* Looks up 'host' in /proc/net/route.
 *
 * '*netdev_name' is set to NULL first.  Each line is parsed into 11 fields.
 * For a route that is up and whose masked destination matches 'host',
 * '*next_hop' receives either 0 (directly reachable) or the gateway, and
 * '*netdev_name' receives an xstrdup() copy of the interface name. */
2104 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2107 static const char fn[] = "/proc/net/route";
2112 *netdev_name = NULL;
2113 stream = fopen(fn, "r");
2114 if (stream == NULL) {
2115 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2120 while (fgets(line, sizeof line, stream)) {
2123 ovs_be32 dest, gateway, mask;
2124 int refcnt, metric, mtu;
2125 unsigned int flags, use, window, irtt;
/* A line counts as parsed only when all 11 fields convert. */
2128 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2130 iface, &dest, &gateway, &flags, &refcnt,
2131 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2133 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2137 if (!(flags & RTF_UP)) {
2138 /* Skip routes that aren't up. */
2142 /* The output of 'dest', 'mask', and 'gateway' were given in
2143 * network byte order, so we don't need any endian
2144 * conversions here. */
2145 if ((dest & mask) == (host->s_addr & mask)) {
2147 /* The host is directly reachable. */
2148 next_hop->s_addr = 0;
2150 /* To reach the host, we must go through a gateway. */
2151 next_hop->s_addr = gateway;
/* The caller receives ownership of this heap-allocated copy. */
2153 *netdev_name = xstrdup(iface);
/* Queries driver information for 'netdev' with the ETHTOOL_GDRVINFO request
 * and adds heap-allocated copies of the driver name, driver version and
 * firmware version to 'sh' under the keys "driver_name", "driver_version" and
 * "firmware_version". */
2165 netdev_linux_get_status(const struct netdev *netdev, struct shash *sh)
2167 struct ethtool_drvinfo drvinfo;
2170 memset(&drvinfo, 0, sizeof drvinfo);
/* netdev_linux_do_ethtool() takes a struct ethtool_cmd pointer, hence the
 * cast of the drvinfo buffer. */
2171 error = netdev_linux_do_ethtool(netdev_get_name(netdev),
2172 (struct ethtool_cmd *)&drvinfo,
2174 "ETHTOOL_GDRVINFO");
2176 shash_add(sh, "driver_name", xstrdup(drvinfo.driver));
2177 shash_add(sh, "driver_version", xstrdup(drvinfo.version));
2178 shash_add(sh, "firmware_version", xstrdup(drvinfo.fw_version));
2184 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2185 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2186 * returns 0. Otherwise, it returns a positive errno value; in particular,
2187 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'.
 *
 * The lookup uses the SIOCGARP ioctl on 'af_inet_sock'.  ENXIO is not
 * logged; other failures produce a rate-limited warning. */
2189 netdev_linux_arp_lookup(const struct netdev *netdev,
2190 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2193 struct sockaddr_in sin;
2196 memset(&r, 0, sizeof r);
2197 memset(&sin, 0, sizeof sin);
2198 sin.sin_family = AF_INET;
2199 sin.sin_addr.s_addr = ip;
/* The request carries the protocol address as a sockaddr_in and expects an
 * Ethernet hardware address in reply. */
2201 memcpy(&r.arp_pa, &sin, sizeof sin);
2202 r.arp_ha.sa_family = ARPHRD_ETHER;
2204 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2205 COVERAGE_INC(netdev_arp_lookup);
2206 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2208 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2209 } else if (retval != ENXIO) {
2210 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2211 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
/* Converts the "enum netdev_flags" bitmap 'nd' for use with the interface
 * flags handled by get_flags() and set_flags(); NETDEV_UP and NETDEV_PROMISC
 * are examined.
 * NOTE(review): the assignments and the return statement are not visible in
 * this listing. */
2217 nd_to_iff_flags(enum netdev_flags nd)
2220 if (nd & NETDEV_UP) {
2223 if (nd & NETDEV_PROMISC) {
/* Converts the interface flags 'iff' to an "enum netdev_flags" bitmap that
 * starts out as 0.  IFF_PROMISC maps to NETDEV_PROMISC.
 * NOTE(review): the remaining mappings are not visible in this listing. */
2230 iff_to_nd_flags(int iff)
2232 enum netdev_flags nd = 0;
2236 if (iff & IFF_PROMISC) {
2237 nd |= NETDEV_PROMISC;
/* Turns the flags in 'off' off and the flags in 'on' on for 'netdev'.
 *
 * The current interface flags are read with get_flags() and reported through
 * '*old_flagsp' as netdev flags.  set_flags() is called only when the
 * computed flags differ from the current ones. */
2243 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2244 enum netdev_flags on, enum netdev_flags *old_flagsp)
2246 int old_flags, new_flags;
2249 error = get_flags(netdev, &old_flags);
2251 *old_flagsp = iff_to_nd_flags(old_flags);
/* Clear the 'off' bits first, then set the 'on' bits, so 'on' wins for a
 * flag that appears in both. */
2252 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2253 if (new_flags != old_flags) {
2254 error = set_flags(netdev, new_flags);
/* Returns the 'change_seq' member of the netdev_dev_linux behind 'netdev'. */
2261 netdev_linux_change_seq(const struct netdev *netdev)
2263 return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
/* Expands to a list of netdev provider callbacks shared by the classes
 * defined below this macro.  The visible entries are all netdev_linux_*
 * functions or NULL.  NAME, CREATE, ENUMERATE and SET_STATS are the
 * parameters that vary between classes.
 * NOTE(review): several entries, including the uses of the four parameters,
 * are not visible in this listing. */
2266 #define NETDEV_LINUX_CLASS(NAME, CREATE, ENUMERATE, SET_STATS) \
2270 netdev_linux_init, \
2272 netdev_linux_wait, \
2275 netdev_linux_destroy, \
2276 NULL, /* get_config */ \
2277 NULL, /* set_config */ \
2279 netdev_linux_open, \
2280 netdev_linux_close, \
2284 netdev_linux_listen, \
2285 netdev_linux_recv, \
2286 netdev_linux_recv_wait, \
2287 netdev_linux_drain, \
2289 netdev_linux_send, \
2290 netdev_linux_send_wait, \
2292 netdev_linux_set_etheraddr, \
2293 netdev_linux_get_etheraddr, \
2294 netdev_linux_get_mtu, \
2295 netdev_linux_set_mtu, \
2296 netdev_linux_get_ifindex, \
2297 netdev_linux_get_carrier, \
2298 netdev_linux_set_miimon_interval, \
2299 netdev_linux_get_stats, \
2302 netdev_linux_get_features, \
2303 netdev_linux_set_advertisements, \
2304 netdev_linux_get_vlan_vid, \
2306 netdev_linux_set_policing, \
2307 netdev_linux_get_qos_types, \
2308 netdev_linux_get_qos_capabilities, \
2309 netdev_linux_get_qos, \
2310 netdev_linux_set_qos, \
2311 netdev_linux_get_queue, \
2312 netdev_linux_set_queue, \
2313 netdev_linux_delete_queue, \
2314 netdev_linux_get_queue_stats, \
2315 netdev_linux_dump_queues, \
2316 netdev_linux_dump_queue_stats, \
2318 netdev_linux_get_in4, \
2319 netdev_linux_set_in4, \
2320 netdev_linux_get_in6, \
2321 netdev_linux_add_router, \
2322 netdev_linux_get_next_hop, \
2323 netdev_linux_get_status, \
2324 netdev_linux_arp_lookup, \
2326 netdev_linux_update_flags, \
2328 netdev_linux_change_seq \
/* Device class whose devices are created by netdev_linux_create() and listed
 * by netdev_linux_enumerate(); it has no set_stats hook.  The MII monitoring
 * functions iterate over the devices of this class. */
2331 const struct netdev_class netdev_linux_class =
2334 netdev_linux_create,
2335 netdev_linux_enumerate,
2336 NULL); /* set_stats */
/* Device class whose devices are created by netdev_linux_create_tap(); it has
 * neither an enumerate nor a set_stats hook. */
2338 const struct netdev_class netdev_tap_class =
2341 netdev_linux_create_tap,
2342 NULL, /* enumerate */
2343 NULL); /* set_stats */
/* Device class whose devices are created by netdev_linux_create(); it has no
 * enumerate hook and uses netdev_vport_set_stats() as its set_stats hook. */
2345 const struct netdev_class netdev_internal_class =
2348 netdev_linux_create,
2349 NULL, /* enumerate */
2350 netdev_vport_set_stats);
2352 /* HTB traffic control class. */
/* Upper bound on HTB class minor numbers that are treated as queues:
 * htb_parse_tcmsg__() maps minor numbers 1 through HTB_N_QUEUES of major 1 to
 * queue ids 0 through HTB_N_QUEUES - 1. */
2354 #define HTB_N_QUEUES 0xf000
/* NOTE(review): the struct headers are not visible in this listing.  The
 * next field is presumably the 'max_rate' of struct htb, which
 * htb_install__() assigns. */
2358 unsigned int max_rate; /* In bytes/s. */
/* The remaining fields are those that the htb_* functions access through
 * 'struct htb_class'; htb_class_cast__() recovers the structure from its
 * embedded 'tc_queue'. */
2362 struct tc_queue tc_queue;
2363 unsigned int min_rate; /* In bytes/s. */
2364 unsigned int max_rate; /* In bytes/s. */
2365 unsigned int burst; /* In bytes. */
2366 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the struct htb that embeds the 'tc' of the device behind 'netdev'.
 * This is a CONTAINER_OF() conversion with no check that the device's qdisc
 * really is HTB. */
2370 htb_get__(const struct netdev *netdev)
2372 struct netdev_dev_linux *netdev_dev =
2373 netdev_dev_linux_cast(netdev_get_dev(netdev));
2374 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
/* Allocates a struct htb, initializes its 'tc' with 'tc_ops_htb', records
 * 'max_rate', and attaches it to the device behind 'netdev' as its
 * traffic-control state. */
2378 htb_install__(struct netdev *netdev, uint64_t max_rate)
2380 struct netdev_dev_linux *netdev_dev =
2381 netdev_dev_linux_cast(netdev_get_dev(netdev));
2384 htb = xmalloc(sizeof *htb);
2385 tc_init(&htb->tc, &tc_ops_htb);
/* NOTE(review): 'max_rate' is uint64_t here, while the 'max_rate' field
 * visible in the HTB definitions is unsigned int; confirm that the narrowing
 * is intended. */
2386 htb->max_rate = max_rate;
2388 netdev_dev->tc = &htb->tc;
2391 /* Create an HTB qdisc.
2393 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1".
 *
 * Any existing qdisc is deleted first.  Returns the result of
 * tc_transact(). */
2395 htb_setup_qdisc__(struct netdev *netdev)
2398 struct tc_htb_glob opt;
2399 struct ofpbuf request;
2400 struct tcmsg *tcmsg;
/* The result of the deletion is ignored. */
2402 tc_del_qdisc(netdev);
2404 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2405 NLM_F_EXCL | NLM_F_CREATE, &request);
/* Handle 1:0 attached at the root. */
2409 tcmsg->tcm_handle = tc_make_handle(1, 0);
2410 tcmsg->tcm_parent = TC_H_ROOT;
2412 nl_msg_put_string(&request, TCA_KIND, "htb");
2414 memset(&opt, 0, sizeof opt);
2415 opt.rate2quantum = 10;
/* The HTB global options travel as TCA_HTB_INIT nested inside TCA_OPTIONS. */
2419 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2420 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2421 nl_msg_end_nested(&request, opt_offset);
2423 return tc_transact(&request, NULL);
2426 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2427 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>".
 *
 * The rate tables and buffer sizes depend on the device MTU, so a device
 * whose MTU cannot be obtained is rejected with a warning. */
2429 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2430 unsigned int parent, struct htb_class *class)
2433 struct tc_htb_opt opt;
2434 struct ofpbuf request;
2435 struct tcmsg *tcmsg;
2439 error = netdev_get_mtu(netdev, &mtu);
2441 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2442 netdev_get_name(netdev));
/* 'rate' is filled from 'min_rate' and 'ceil' from 'max_rate'.  Both buffers
 * are computed from the same 'burst'. */
2446 memset(&opt, 0, sizeof opt);
2447 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2448 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2449 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2450 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2451 opt.prio = class->priority;
2453 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2457 tcmsg->tcm_handle = handle;
2458 tcmsg->tcm_parent = parent;
/* TCA_OPTIONS nests the HTB parameters plus one rate table each for 'rate'
 * and 'ceil'. */
2460 nl_msg_put_string(&request, TCA_KIND, "htb");
2461 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2462 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2463 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2464 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2465 nl_msg_end_nested(&request, opt_offset);
2467 error = tc_transact(&request, NULL);
2469 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2470 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2471 netdev_get_name(netdev),
2472 tc_get_major(handle), tc_get_minor(handle),
2473 tc_get_major(parent), tc_get_minor(parent),
2474 class->min_rate, class->max_rate,
2475 class->burst, class->priority, strerror(error));
2480 /* Parses Netlink attributes in 'nl_options' for HTB parameters and stores a
2481 * description of them into 'class'. The description complies with the
2482 * specification given in the vswitch database documentation for linux-htb. */
2485 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
/* Only TCA_HTB_PARMS is looked at.  It is mandatory and must be at least as
 * large as struct tc_htb_opt. */
2487 static const struct nl_policy tca_htb_policy[] = {
2488 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2489 .min_len = sizeof(struct tc_htb_opt) },
2492 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2493 const struct tc_htb_opt *htb;
2495 if (!nl_parse_nested(nl_options, tca_htb_policy,
2496 attrs, ARRAY_SIZE(tca_htb_policy))) {
2497 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
/* Copy the kernel's parameters into 'class'.  'burst' is derived from the
 * buffer size with tc_ticks_to_bytes() at the configured rate. */
2501 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2502 class->min_rate = htb->rate.rate;
2503 class->max_rate = htb->ceil.rate;
2504 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2505 class->priority = htb->prio;
/* Parses the traffic-class message 'tcmsg' with tc_parse_class().
 *
 * 'queue_id' and 'options' may each be null.  When 'queue_id' is nonnull, a
 * class handle with major 1 and a minor from 1 to HTB_N_QUEUES yields queue
 * id 'minor - 1'.  When 'options' is nonnull, the HTB options are parsed into
 * it with htb_parse_tca_options__().  'stats' is handed to tc_parse_class()
 * unchanged. */
2510 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2511 struct htb_class *options,
2512 struct netdev_queue_stats *stats)
2514 struct nlattr *nl_options;
2515 unsigned int handle;
2518 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2519 if (!error && queue_id) {
2520 unsigned int major = tc_get_major(handle);
2521 unsigned int minor = tc_get_minor(handle);
/* NOTE(review): the handling of handles outside this range is not visible in
 * this listing. */
2522 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2523 *queue_id = minor - 1;
2528 if (!error && options) {
2529 error = htb_parse_tca_options__(nl_options, options);
/* Fills 'hc' with the parameters of the HTB root class from 'details'.
 *
 * The "max-rate" detail is parsed as a decimal number and divided by 8
 * (presumably bits/s converted to the bytes/s that struct htb_class stores).
 * When it is absent or yields 0, the rate is derived from the device's
 * current link features instead.  'min_rate' is set equal to 'max_rate'. */
2535 htb_parse_qdisc_details__(struct netdev *netdev,
2536 const struct shash *details, struct htb_class *hc)
2538 const char *max_rate_s;
2540 max_rate_s = shash_find_data(details, "max-rate");
2541 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2542 if (!hc->max_rate) {
/* NOTE(review): the second argument below is mis-encoded in this listing.
 * The next line passes 'current' to netdev_features_to_bps(), so the
 * argument was almost certainly the address of 'current'. */
2545 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2546 hc->max_rate = netdev_features_to_bps(current) / 8;
2548 hc->min_rate = hc->max_rate;
/* Fills 'hc' with the parameters of one HTB class from 'details'.
 *
 * The "min-rate", "max-rate" and "burst" details are parsed as decimal
 * numbers and divided by 8; "priority" is parsed as a decimal number.
 * Missing details are treated as 0 before clamping, except that the fallback
 * for a missing "max-rate" is not visible in this listing.  The rates are
 * clamped against the device MTU and the 'max_rate' of the qdisc; 'burst' is
 * raised to at least the MTU plus 64. */
2554 htb_parse_class_details__(struct netdev *netdev,
2555 const struct shash *details, struct htb_class *hc)
2557 const struct htb *htb = htb_get__(netdev);
2558 const char *min_rate_s = shash_find_data(details, "min-rate");
2559 const char *max_rate_s = shash_find_data(details, "max-rate");
2560 const char *burst_s = shash_find_data(details, "burst");
2561 const char *priority_s = shash_find_data(details, "priority");
2564 error = netdev_get_mtu(netdev, &mtu);
2566 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2567 netdev_get_name(netdev));
2571 /* HTB requires at least an mtu sized min-rate to send any traffic even
2572 * on uncongested links. */
2573 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2574 hc->min_rate = MAX(hc->min_rate, mtu);
2575 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
/* 'max_rate' ends up between 'min_rate' and the 'max_rate' of the qdisc. */
2578 hc->max_rate = (max_rate_s
2579 ? strtoull(max_rate_s, NULL, 10) / 8
2581 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2582 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2586 * According to hints in the documentation that I've read, it is important
2587 * that 'burst' be at least as big as the largest frame that might be
2588 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2589 * but having it a bit too small is a problem. Since netdev_get_mtu()
2590 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2591 * the MTU. We actually add 64, instead of 14, as a guard against
2592 * additional headers get tacked on somewhere that we're not aware of. */
2593 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2594 hc->burst = MAX(hc->burst, mtu + 64);
2597 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for class 'handle' with parent 'parent' on 'netdev' via
 * tc_query_class(), then decodes the reply into '*options' and '*stats' with
 * htb_parse_tcmsg__() (no queue ID is requested).  The reply buffer is
 * released with ofpbuf_delete() once it has been parsed. */
2603 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2604 unsigned int parent, struct htb_class *options,
2605 struct netdev_queue_stats *stats)
2607 struct ofpbuf *reply;
2610 error = tc_query_class(netdev, handle, parent, &reply);
2612 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2613 ofpbuf_delete(reply);
/* Sets up HTB on 'netdev' from 'details'.
 *
 * The visible steps are: create the qdisc with htb_setup_qdisc__(), derive
 * the qdisc-level class settings with htb_parse_qdisc_details__(), create
 * class 1:fffe under parent 1:0 with htb_setup_class__(), and record the
 * resulting max rate with htb_install__().
 *
 * NOTE(review): the error checks between these steps are not visible in this
 * listing. */
2619 htb_tc_install(struct netdev *netdev, const struct shash *details)
2623 error = htb_setup_qdisc__(netdev);
2625 struct htb_class hc;
2627 htb_parse_qdisc_details__(netdev, details, &hc);
2628 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2629 tc_make_handle(1, 0), &hc);
2631 htb_install__(netdev, hc.max_rate);
/* Returns the struct htb_class that contains 'queue' as its 'tc_queue'
 * member. */
2637 static struct htb_class *
2638 htb_class_cast__(const struct tc_queue *queue)
2640 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Records the settings in '*hc' for queue 'queue_id' in 'netdev''s HTB state.
 *
 * The queue is looked up with tc_find_queue__() using hash_int(queue_id, 0).
 * The listing shows both a path that reuses the found entry and a path that
 * allocates a new htb_class and inserts it into 'htb->tc.queues' (presumably
 * the found / not-found branches; the branch lines themselves are elided).
 * Finally min_rate, max_rate, burst and priority are copied from '*hc'. */
2644 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2645 const struct htb_class *hc)
2647 struct htb *htb = htb_get__(netdev);
2648 size_t hash = hash_int(queue_id, 0);
2649 struct tc_queue *queue;
2650 struct htb_class *hcp;
2652 queue = tc_find_queue__(netdev, queue_id, hash);
2654 hcp = htb_class_cast__(queue);
2656 hcp = xmalloc(sizeof *hcp);
2657 queue = &hcp->tc_queue;
2658 queue->queue_id = queue_id;
2659 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2662 hcp->min_rate = hc->min_rate;
2663 hcp->max_rate = hc->max_rate;
2664 hcp->burst = hc->burst;
2665 hcp->priority = hc->priority;
/* Rebuilds 'netdev''s in-memory HTB state from what the kernel reports.
 *
 * Class 1:fffe is queried for the qdisc-level max rate, which is handed to
 * htb_install__().  The class dump is then walked, and every message that
 * htb_parse_tcmsg__() decodes successfully is recorded with
 * htb_update_queue__().  'nlmsg' is unused. */
2669 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2672 struct nl_dump dump;
2673 struct htb_class hc;
2675 /* Get qdisc options. */
/* The result of this query is not checked here; 'hc.max_rate' is used
 * directly on the next line. */
2677 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2678 htb_install__(netdev, hc.max_rate);
2681 if (!start_queue_dump(netdev, &dump)) {
2684 while (nl_dump_next(&dump, &msg)) {
2685 unsigned int queue_id;
2687 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2688 htb_update_queue__(netdev, queue_id, &hc);
2691 nl_dump_done(&dump);
/* Tears down the struct htb that embeds 'tc'.
 *
 * Each htb_class is unlinked from 'htb->tc.queues'; the _SAFE iterator is
 * used because entries are removed during the walk.
 *
 * NOTE(review): the statements that release the classes and the htb itself
 * are not visible in this listing. */
2697 htb_tc_destroy(struct tc *tc)
2699 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2700 struct htb_class *hc, *next;
2702 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2703 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Reports 'netdev''s qdisc-level HTB configuration: adds "max-rate" to
 * 'details' as a decimal string equal to 8 times the stored 'max_rate'.  The
 * multiplication is done in unsigned long long (8ULL). */
2711 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2713 const struct htb *htb = htb_get__(netdev);
2714 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
/* Applies new qdisc-level settings from 'details' to 'netdev'.
 *
 * The settings are derived with htb_parse_qdisc_details__(), class 1:fffe
 * under parent 1:0 is (re)configured with htb_setup_class__(), and the cached
 * 'max_rate' in 'netdev''s HTB state is updated to match.
 *
 * NOTE(review): whether the cache update is conditional on success is not
 * visible in this listing. */
2719 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2721 struct htb_class hc;
2724 htb_parse_qdisc_details__(netdev, details, &hc);
2725 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2726 tc_make_handle(1, 0), &hc);
2728 htb_get__(netdev)->max_rate = hc.max_rate;
/* Reports the configuration of 'queue' into 'details'.
 *
 * "min-rate", "max-rate" and "burst" are emitted as decimal strings equal to
 * 8 times the stored values.  "max-rate" is emitted only when it differs from
 * the min rate.  "priority" is emitted with "%u".  'netdev' is unused. */
2734 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2735 const struct tc_queue *queue, struct shash *details)
2737 const struct htb_class *hc = htb_class_cast__(queue);
2739 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2740 if (hc->min_rate != hc->max_rate) {
2741 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2743 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2745 shash_add(details, "priority", xasprintf("%u", hc->priority));
/* Configures queue 'queue_id' on 'netdev' from 'details'.
 *
 * The settings are parsed with htb_parse_class_details__(), the kernel class
 * 1:(queue_id + 1) under parent 1:fffe is set up with htb_setup_class__(),
 * and the in-memory copy is refreshed with htb_update_queue__().
 *
 * NOTE(review): the error checks between these steps are not visible in this
 * listing. */
2751 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2752 const struct shash *details)
2754 struct htb_class hc;
2757 error = htb_parse_class_details__(netdev, details, &hc);
2762 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2763 tc_make_handle(1, 0xfffe), &hc);
2768 htb_update_queue__(netdev, queue_id, &hc);
/* Deletes 'queue' from 'netdev': asks the kernel to delete class
 * 1:(queue_id + 1) with tc_delete_class() and unlinks the matching htb_class
 * from 'htb->tc.queues'.
 *
 * NOTE(review): the success check and the release of the htb_class are not
 * visible in this listing. */
2773 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2775 struct htb_class *hc = htb_class_cast__(queue);
2776 struct htb *htb = htb_get__(netdev);
2779 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2781 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches statistics for 'queue' into '*stats' by querying class
 * 1:(queue_id + 1), parent 1:fffe, with htb_query_class__().  No class
 * options are requested (the 'options' argument is NULL). */
2788 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2789 struct netdev_queue_stats *stats)
2791 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2792 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one message of a class dump: parses 'nlmsg' with tc_parse_class()
 * and, when the handle is 1:N with 1 <= N <= HTB_N_QUEUES, invokes 'cb' with
 * queue ID N - 1, the parsed statistics and 'aux'.  'netdev' is unused. */
2796 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2797 const struct ofpbuf *nlmsg,
2798 netdev_dump_queue_stats_cb *cb, void *aux)
2800 struct netdev_queue_stats stats;
2801 unsigned int handle, major, minor;
2804 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2809 major = tc_get_major(handle);
2810 minor = tc_get_minor(handle);
2811 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2812 (*cb)(minor - 1, &stats, aux);
/* Operations table for the "linux-htb" QoS type (kernel qdisc name "htb").
 * The slot labels on the last two entries follow the parallel layout of
 * tc_ops_hfsc. */
2817 static const struct tc_ops tc_ops_htb = {
2818 "htb", /* linux_name */
2819 "linux-htb", /* ovs_name */
2820 HTB_N_QUEUES, /* n_queues */
2829 htb_class_get_stats, /* class_get_stats */
2830 htb_class_dump_stats /* class_dump_stats */
2833 /* "linux-hfsc" traffic control class. */
/* Largest class minor that is treated as an HFSC queue: minors
 * 1..HFSC_N_QUEUES map to queue IDs 0..HFSC_N_QUEUES - 1.  This leaves minor
 * 0xfffe, used for the qdisc-level class, outside the queue range. */
2835 #define HFSC_N_QUEUES 0xf000
2843 struct tc_queue tc_queue;
/* Returns the struct hfsc that embeds the 'tc' currently attached to
 * 'netdev''s underlying netdev_dev_linux.
 *
 * NOTE(review): this assumes the attached tc really is an HFSC instance;
 * nothing here checks that. */
2848 static struct hfsc *
2849 hfsc_get__(const struct netdev *netdev)
2851 struct netdev_dev_linux *netdev_dev;
2852 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2853 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
/* Returns the struct hfsc_class that contains 'queue' as its 'tc_queue'
 * member. */
2856 static struct hfsc_class *
2857 hfsc_class_cast__(const struct tc_queue *queue)
2859 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates a fresh struct hfsc, initializes its embedded tc with
 * tc_ops_hfsc, stores 'max_rate' in it and attaches it to 'netdev''s
 * netdev_dev_linux as the current tc. */
2863 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2865 struct netdev_dev_linux * netdev_dev;
2868 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2869 hfsc = xmalloc(sizeof *hfsc);
2870 tc_init(&hfsc->tc, &tc_ops_hfsc);
2871 hfsc->max_rate = max_rate;
2872 netdev_dev->tc = &hfsc->tc;
/* Records the rates in '*hc' for queue 'queue_id' in 'netdev''s HFSC state.
 *
 * The queue is looked up with tc_find_queue__() using hash_int(queue_id, 0).
 * The listing shows both a path that reuses the found entry and a path that
 * allocates a new hfsc_class and inserts it into 'hfsc->tc.queues'
 * (presumably the found / not-found branches; the branch lines themselves
 * are elided).  Finally min_rate and max_rate are copied from '*hc'. */
2876 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2877 const struct hfsc_class *hc)
2881 struct hfsc_class *hcp;
2882 struct tc_queue *queue;
2884 hfsc = hfsc_get__(netdev);
2885 hash = hash_int(queue_id, 0);
2887 queue = tc_find_queue__(netdev, queue_id, hash);
2889 hcp = hfsc_class_cast__(queue);
2891 hcp = xmalloc(sizeof *hcp);
2892 queue = &hcp->tc_queue;
2893 queue->queue_id = queue_id;
2894 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
2897 hcp->min_rate = hc->min_rate;
2898 hcp->max_rate = hc->max_rate;
/* Decodes the nested HFSC class options 'nl_options' into '*class'.
 *
 * Three attributes are read, TCA_HFSC_RSC, TCA_HFSC_FSC and TCA_HFSC_USC,
 * each required by the policy to be at least sizeof(struct tc_service_curve)
 * bytes.  A warning is logged when:
 *
 *   - the nested attributes fail to parse;
 *   - any curve has a nonzero 'm1' or 'd' (reported as a non-linear curve);
 *   - the RSC and FSC 'm2' values differ;
 *   - the RSC 'm2' exceeds the USC 'm2'.
 *
 * Otherwise 'class->min_rate' is taken from the FSC 'm2' and
 * 'class->max_rate' from the USC 'm2'. */
2902 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2904 const struct tc_service_curve *rsc, *fsc, *usc;
2905 static const struct nl_policy tca_hfsc_policy[] = {
2907 .type = NL_A_UNSPEC,
2909 .min_len = sizeof(struct tc_service_curve),
2912 .type = NL_A_UNSPEC,
2914 .min_len = sizeof(struct tc_service_curve),
2917 .type = NL_A_UNSPEC,
2919 .min_len = sizeof(struct tc_service_curve),
2922 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
2924 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
2925 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
2926 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
2930 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
2931 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
2932 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
/* Only curves described by 'm2' alone are accepted. */
2934 if (rsc->m1 != 0 || rsc->d != 0 ||
2935 fsc->m1 != 0 || fsc->d != 0 ||
2936 usc->m1 != 0 || usc->d != 0) {
2937 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2938 "Non-linear service curves are not supported.");
/* hfsc_setup_class__() writes the same curve to RSC and FSC, so a mismatch
 * means the class was not configured by that function. */
2942 if (rsc->m2 != fsc->m2) {
2943 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2944 "Real-time service curves are not supported ");
2948 if (rsc->m2 > usc->m2) {
2949 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2950 "Min-rate service curve is greater than "
2951 "the max-rate service curve.");
2955 class->min_rate = fsc->m2;
2956 class->max_rate = usc->m2;
/* Decodes the class message 'tcmsg' with tc_parse_class().
 *
 * A handle of the form 1:N with 1 <= N <= HFSC_N_QUEUES is translated to
 * queue ID N - 1 in '*queue_id'; the class options are decoded into
 * '*options' with hfsc_parse_tca_options__().  'stats' is passed through to
 * tc_parse_class() unchanged.
 *
 * NOTE(review): the conditions guarding the queue-ID and options steps are
 * not visible in this listing. */
2961 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2962 struct hfsc_class *options,
2963 struct netdev_queue_stats *stats)
2966 unsigned int handle;
2967 struct nlattr *nl_options;
2969 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2975 unsigned int major, minor;
2977 major = tc_get_major(handle);
2978 minor = tc_get_minor(handle);
2979 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
2980 *queue_id = minor - 1;
2987 error = hfsc_parse_tca_options__(nl_options, options);
/* Queries the kernel for class 'handle' with parent 'parent' on 'netdev' via
 * tc_query_class(), then decodes the reply into '*options' and '*stats' with
 * hfsc_parse_tcmsg__() (no queue ID is requested).  The reply buffer is
 * released with ofpbuf_delete() once it has been parsed. */
2994 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
2995 unsigned int parent, struct hfsc_class *options,
2996 struct netdev_queue_stats *stats)
2999 struct ofpbuf *reply;
3001 error = tc_query_class(netdev, handle, parent, &reply);
3006 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3007 ofpbuf_delete(reply);
/* Fills in '*class' for the qdisc-level HFSC class from 'details'.
 *
 * "max-rate" is looked up in 'details', parsed as a decimal number and
 * divided by 8.  The listing also shows a fallback that takes the rate from
 * the 'current' value reported by netdev_get_features(), converted with
 * netdev_features_to_bps() and divided by 8.  Both 'class->min_rate' and
 * 'class->max_rate' are set to the resulting rate. */
3012 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
3013 struct hfsc_class *class)
3016 const char *max_rate_s;
3018 max_rate_s = shash_find_data(details, "max-rate");
3019 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
/* netdev_get_features() writes through the pointer, so it needs the address
 * of 'current'. */
3024 netdev_get_features(netdev, &current, NULL, NULL, NULL);
3025 max_rate = netdev_features_to_bps(current) / 8;
3028 class->min_rate = max_rate;
3029 class->max_rate = max_rate;
/* Fills in '*class' from the per-queue 'details' keys "min-rate" and
 * "max-rate".
 *
 * Each value is parsed as a decimal number and divided by 8.  The visible
 * clamping is:
 *
 *   - min_rate: at least 1, at most the qdisc's 'max_rate' (from
 *     hfsc_get__()); an absent key starts from 0.
 *   - max_rate: at least 'min_rate', at most the qdisc's 'max_rate'.
 *
 * The intermediate values are uint32_t, so a parsed value that does not fit
 * is truncated on assignment.
 *
 * NOTE(review): the fallback used when "max-rate" is absent is not visible in
 * this listing. */
3033 hfsc_parse_class_details__(struct netdev *netdev,
3034 const struct shash *details,
3035 struct hfsc_class * class)
3037 const struct hfsc *hfsc;
3038 uint32_t min_rate, max_rate;
3039 const char *min_rate_s, *max_rate_s;
3041 hfsc = hfsc_get__(netdev);
3042 min_rate_s = shash_find_data(details, "min-rate");
3043 max_rate_s = shash_find_data(details, "max-rate");
3045 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3046 min_rate = MAX(min_rate, 1);
3047 min_rate = MIN(min_rate, hfsc->max_rate);
3049 max_rate = (max_rate_s
3050 ? strtoull(max_rate_s, NULL, 10) / 8
3052 max_rate = MAX(max_rate, min_rate);
3053 max_rate = MIN(max_rate, hfsc->max_rate);
3055 class->min_rate = min_rate;
3056 class->max_rate = max_rate;
3061 /* Create an HFSC qdisc.
3063 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
/* Any existing root qdisc is first removed with tc_del_qdisc().  The request
 * is an RTM_NEWQDISC with NLM_F_EXCL | NLM_F_CREATE, handle 1:0 and parent
 * TC_H_ROOT, carrying TCA_KIND "hfsc" and a tc_hfsc_qopt that is zeroed
 * before use.  Returns the result of tc_transact(). */
3065 hfsc_setup_qdisc__(struct netdev * netdev)
3067 struct tcmsg *tcmsg;
3068 struct ofpbuf request;
3069 struct tc_hfsc_qopt opt;
3071 tc_del_qdisc(netdev);
3073 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3074 NLM_F_EXCL | NLM_F_CREATE, &request);
3080 tcmsg->tcm_handle = tc_make_handle(1, 0);
3081 tcmsg->tcm_parent = TC_H_ROOT;
3083 memset(&opt, 0, sizeof opt);
3086 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3087 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3089 return tc_transact(&request, NULL);
3092 /* Create an HFSC class.
3094 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3095 * sc rate <min_rate> ul rate <max_rate>" */
/* The request is an RTM_NEWTCLASS with NLM_F_CREATE for class 'handle' under
 * 'parent'.  Inside TCA_OPTIONS, the 'min' curve (m2 = class->min_rate) is
 * sent as both TCA_HFSC_RSC and TCA_HFSC_FSC, and the 'max' curve
 * (m2 = class->max_rate) as TCA_HFSC_USC.  A rate-limited warning naming the
 * device, handles and rates accompanies a failed transaction. */
3097 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3098 unsigned int parent, struct hfsc_class *class)
3102 struct tcmsg *tcmsg;
3103 struct ofpbuf request;
3104 struct tc_service_curve min, max;
3106 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3112 tcmsg->tcm_handle = handle;
3113 tcmsg->tcm_parent = parent;
3117 min.m2 = class->min_rate;
3121 max.m2 = class->max_rate;
3123 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3124 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3125 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3126 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3127 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3128 nl_msg_end_nested(&request, opt_offset);
3130 error = tc_transact(&request, NULL);
3132 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3133 "min-rate %ubps, max-rate %ubps (%s)",
3134 netdev_get_name(netdev),
3135 tc_get_major(handle), tc_get_minor(handle),
3136 tc_get_major(parent), tc_get_minor(parent),
3137 class->min_rate, class->max_rate, strerror(error));
/* Sets up HFSC on 'netdev' from 'details'.
 *
 * The visible steps are: create the qdisc with hfsc_setup_qdisc__(), derive
 * the qdisc-level class settings with hfsc_parse_qdisc_details__(), create
 * class 1:fffe under parent 1:0 with hfsc_setup_class__(), and record the
 * resulting max rate with hfsc_install__().
 *
 * NOTE(review): the error checks between these steps are not visible in this
 * listing. */
3144 hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3147 struct hfsc_class class;
3149 error = hfsc_setup_qdisc__(netdev);
3155 hfsc_parse_qdisc_details__(netdev, details, &class);
3156 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3157 tc_make_handle(1, 0), &class);
3163 hfsc_install__(netdev, class.max_rate);
/* Rebuilds 'netdev''s in-memory HFSC state from what the kernel reports.
 *
 * Class 1:fffe is queried for the qdisc-level max rate, which is handed to
 * hfsc_install__().  The class dump is then walked, and every message that
 * hfsc_parse_tcmsg__() decodes successfully is recorded with
 * hfsc_update_queue__().  'nlmsg' is unused. */
3168 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3171 struct nl_dump dump;
3172 struct hfsc_class hc;
/* The result of this query is not checked here; 'hc.max_rate' is used
 * directly on the next line. */
3175 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3176 hfsc_install__(netdev, hc.max_rate);
3178 if (!start_queue_dump(netdev, &dump)) {
3182 while (nl_dump_next(&dump, &msg)) {
3183 unsigned int queue_id;
3185 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3186 hfsc_update_queue__(netdev, queue_id, &hc);
3190 nl_dump_done(&dump);
/* Tears down the struct hfsc that embeds 'tc'.
 *
 * Each hfsc_class is unlinked from 'hfsc->tc.queues'; the _SAFE iterator is
 * used because entries are removed during the walk.
 *
 * NOTE(review): the statements that release the classes and the hfsc itself
 * are not visible in this listing. */
3195 hfsc_tc_destroy(struct tc *tc)
3198 struct hfsc_class *hc, *next;
3200 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3202 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3203 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Reports 'netdev''s qdisc-level HFSC configuration: adds "max-rate" to
 * 'details' as a decimal string equal to 8 times the stored 'max_rate'.  The
 * multiplication is done in unsigned long long (8ULL). */
3212 hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3214 const struct hfsc *hfsc;
3215 hfsc = hfsc_get__(netdev);
3216 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
/* Applies new qdisc-level settings from 'details' to 'netdev'.
 *
 * The settings are derived with hfsc_parse_qdisc_details__(), class 1:fffe
 * under parent 1:0 is (re)configured with hfsc_setup_class__(), and the
 * cached 'max_rate' in 'netdev''s HFSC state is updated to match.
 *
 * NOTE(review): whether the cache update is conditional on success is not
 * visible in this listing. */
3221 hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3224 struct hfsc_class class;
3226 hfsc_parse_qdisc_details__(netdev, details, &class);
3227 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3228 tc_make_handle(1, 0), &class);
3231 hfsc_get__(netdev)->max_rate = class.max_rate;
/* Reports the configuration of 'queue' into 'details'.
 *
 * "min-rate" is always emitted; "max-rate" is emitted only when it differs
 * from the min rate.  Both are decimal strings equal to 8 times the stored
 * values.  'netdev' is unused. */
3238 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3239 const struct tc_queue *queue, struct shash *details)
3241 const struct hfsc_class *hc;
3243 hc = hfsc_class_cast__(queue);
3244 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3245 if (hc->min_rate != hc->max_rate) {
3246 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
/* Configures queue 'queue_id' on 'netdev' from 'details'.
 *
 * The settings are parsed with hfsc_parse_class_details__(), the kernel class
 * 1:(queue_id + 1) under parent 1:fffe is set up with hfsc_setup_class__(),
 * and the in-memory copy is refreshed with hfsc_update_queue__().
 *
 * NOTE(review): the error checks between these steps are not visible in this
 * listing. */
3252 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3253 const struct shash *details)
3256 struct hfsc_class class;
3258 error = hfsc_parse_class_details__(netdev, details, &class);
3263 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3264 tc_make_handle(1, 0xfffe), &class);
3269 hfsc_update_queue__(netdev, queue_id, &class);
/* Deletes 'queue' from 'netdev': asks the kernel to delete class
 * 1:(queue_id + 1) with tc_delete_class() and unlinks the matching
 * hfsc_class from 'hfsc->tc.queues'.
 *
 * NOTE(review): the success check and the release of the hfsc_class are not
 * visible in this listing. */
3274 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3278 struct hfsc_class *hc;
3280 hc = hfsc_class_cast__(queue);
3281 hfsc = hfsc_get__(netdev);
3283 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3285 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches statistics for 'queue' into '*stats' by querying class
 * 1:(queue_id + 1), parent 1:fffe, with hfsc_query_class__().  No class
 * options are requested (the 'options' argument is NULL). */
3292 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3293 struct netdev_queue_stats *stats)
3295 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3296 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one message of a class dump: parses 'nlmsg' with tc_parse_class()
 * and, when the handle is 1:N with 1 <= N <= HFSC_N_QUEUES, invokes 'cb' with
 * queue ID N - 1, the parsed statistics and 'aux'.  'netdev' is unused. */
3300 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3301 const struct ofpbuf *nlmsg,
3302 netdev_dump_queue_stats_cb *cb, void *aux)
3304 struct netdev_queue_stats stats;
3305 unsigned int handle, major, minor;
3308 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3313 major = tc_get_major(handle);
3314 minor = tc_get_minor(handle);
3315 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3316 (*cb)(minor - 1, &stats, aux);
/* Operations table for the "linux-hfsc" QoS type (kernel qdisc name "hfsc"),
 * binding every tc_ops slot to the corresponding hfsc_* function. */
3321 static const struct tc_ops tc_ops_hfsc = {
3322 "hfsc", /* linux_name */
3323 "linux-hfsc", /* ovs_name */
3324 HFSC_N_QUEUES, /* n_queues */
3325 hfsc_tc_install, /* tc_install */
3326 hfsc_tc_load, /* tc_load */
3327 hfsc_tc_destroy, /* tc_destroy */
3328 hfsc_qdisc_get, /* qdisc_get */
3329 hfsc_qdisc_set, /* qdisc_set */
3330 hfsc_class_get, /* class_get */
3331 hfsc_class_set, /* class_set */
3332 hfsc_class_delete, /* class_delete */
3333 hfsc_class_get_stats, /* class_get_stats */
3334 hfsc_class_dump_stats /* class_dump_stats */
3337 /* "linux-default" traffic control class.
3339 * This class represents the default, unnamed Linux qdisc. It corresponds to
3340 * the "" (empty string) QoS type in the OVS database. */
/* Attaches the "default" tc object to 'netdev''s netdev_dev_linux.
 *
 * 'tc' is function-static, so its value persists across calls; the object is
 * created with xmalloc() and initialized with tc_ops_default.
 *
 * NOTE(review): the guard that decides when the allocation runs is not
 * visible in this listing; presumably the object is created once and shared
 * by every device. */
3343 default_install__(struct netdev *netdev)
3345 struct netdev_dev_linux *netdev_dev =
3346 netdev_dev_linux_cast(netdev_get_dev(netdev));
3347 static struct tc *tc;
3350 tc = xmalloc(sizeof *tc);
3351 tc_init(tc, &tc_ops_default);
3353 netdev_dev->tc = tc;
/* Handles installation of the default qdisc type on 'netdev' by delegating
 * to default_install__().  'details' is unused. */
3357 default_tc_install(struct netdev *netdev,
3358 const struct shash *details OVS_UNUSED)
3360 default_install__(netdev);
/* Handles loading of the default qdisc type for 'netdev' by delegating to
 * default_install__().  'nlmsg' is unused. */
3365 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3367 default_install__(netdev);
/* Operations table for the default qdisc type.  It has no kernel qdisc name
 * (linux_name is NULL), and every slot from tc_destroy onward is NULL. */
3371 static const struct tc_ops tc_ops_default = {
3372 NULL, /* linux_name */
3377 NULL, /* tc_destroy */
3378 NULL, /* qdisc_get */
3379 NULL, /* qdisc_set */
3380 NULL, /* class_get */
3381 NULL, /* class_set */
3382 NULL, /* class_delete */
3383 NULL, /* class_get_stats */
3384 NULL /* class_dump_stats */
3387 /* "linux-other" traffic control class. */
/* Attaches the "linux-other" tc object to 'netdev''s netdev_dev_linux.
 * 'nlmsg' is unused.
 *
 * 'tc' is function-static, so its value persists across calls; the object is
 * created with xmalloc() and initialized with tc_ops_other.
 *
 * NOTE(review): the guard that decides when the allocation runs is not
 * visible in this listing; presumably the object is created once and shared
 * by every device. */
3392 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3394 struct netdev_dev_linux *netdev_dev =
3395 netdev_dev_linux_cast(netdev_get_dev(netdev));
3396 static struct tc *tc;
3399 tc = xmalloc(sizeof *tc);
3400 tc_init(tc, &tc_ops_other);
3402 netdev_dev->tc = tc;
/* Operations table for the "linux-other" QoS type.  It has no kernel qdisc
 * name (linux_name is NULL) and cannot be installed (tc_install is NULL);
 * every slot from tc_destroy onward is NULL as well. */
3406 static const struct tc_ops tc_ops_other = {
3407 NULL, /* linux_name */
3408 "linux-other", /* ovs_name */
3410 NULL, /* tc_install */
3412 NULL, /* tc_destroy */
3413 NULL, /* qdisc_get */
3414 NULL, /* qdisc_set */
3415 NULL, /* class_get */
3416 NULL, /* class_set */
3417 NULL, /* class_delete */
3418 NULL, /* class_get_stats */
3419 NULL /* class_dump_stats */
3422 /* Traffic control. */
3424 /* Number of kernel "tc" ticks per second. */
/* Computed from the first three fields of /proc/net/psched as a * c / b, and
 * used by tc_ticks_to_bytes() and tc_bytes_to_ticks(). */
3425 static double ticks_per_s;
3427 /* Number of kernel "jiffies" per second. This is used for the purpose of
3428 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3429 * one jiffy's worth of data.
3431 * There are two possibilities here:
3433 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3434 * approximate range of 100 to 1024. That means that we really need to
3435 * make sure that the qdisc can buffer that much data.
3437 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3438 * has finely granular timers and there's no need to fudge additional room
3439 * for buffers. (There's no extra effort needed to implement that: the
3440 * large 'buffer_hz' is used as a divisor, so practically any number will
3441 * come out as 0 in the division. Small integer results in the case of
3442 * really high dividends won't have any real effect anyhow.) */
/* Divisor applied to a byte rate in tc_buffer_per_jiffy(). */
3444 static unsigned int buffer_hz;
3446 /* Returns tc handle 'major':'minor'. */
/* 'major' is shifted into the upper 16 bits before being combined with
 * 'minor' by TC_H_MAKE(). */
3448 tc_make_handle(unsigned int major, unsigned int minor)
3450 return TC_H_MAKE(major << 16, minor);
3453 /* Returns the major number from 'handle'. */
/* TC_H_MAJ() leaves the major in the upper 16 bits, so it is shifted back
 * down; this is the inverse of the shift in tc_make_handle(). */
3455 tc_get_major(unsigned int handle)
3457 return TC_H_MAJ(handle) >> 16;
3460 /* Returns the minor number from 'handle'. */
/* Unlike the major, the value from TC_H_MIN() is returned without any
 * shift. */
3462 tc_get_minor(unsigned int handle)
3464 return TC_H_MIN(handle);
/* Begins a traffic-control Netlink request for 'netdev' in 'request'.
 *
 * 'request' is initialized with 512 bytes, given a Netlink header of 'type'
 * with flags NLM_F_REQUEST | 'flags', and followed by a zero-filled struct
 * tcmsg whose tcm_family is AF_UNSPEC and whose tcm_ifindex is the index
 * obtained from get_ifindex().  The caller is expected to fill in tcm_handle
 * and tcm_parent on the tcmsg.
 *
 * NOTE(review): handling of a get_ifindex() failure and the return statement
 * are not visible in this listing; callers assign the result to a
 * 'struct tcmsg *'. */
3467 static struct tcmsg *
3468 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3469 struct ofpbuf *request)
3471 struct tcmsg *tcmsg;
3475 error = get_ifindex(netdev, &ifindex);
3480 ofpbuf_init(request, 512);
3481 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3482 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3483 tcmsg->tcm_family = AF_UNSPEC;
3484 tcmsg->tcm_ifindex = ifindex;
3485 /* Caller should fill in tcmsg->tcm_handle. */
3486 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on 'rtnl_sock' with nl_sock_transact(), passing 'replyp'
 * through for the reply, and then uninitializes 'request' whatever the
 * outcome.  Callers therefore must not reuse or uninitialize 'request'
 * afterwards. */
3492 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3494 int error = nl_sock_transact(rtnl_sock, request, replyp);
3495 ofpbuf_uninit(request);
3502 /* The values in psched are not individually very meaningful, but they are
3503 * important. The tables below show some values seen in the wild.
3507 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3508 * (Before that, there are hints that it was 1000000000.)
3510 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3514 * -----------------------------------
3515 * [1] 000c8000 000f4240 000f4240 00000064
3516 * [2] 000003e8 00000400 000f4240 3b9aca00
3517 * [3] 000003e8 00000400 000f4240 3b9aca00
3518 * [4] 000003e8 00000400 000f4240 00000064
3519 * [5] 000003e8 00000040 000f4240 3b9aca00
3520 * [6] 000003e8 00000040 000f4240 000000f9
3522 * a b c d ticks_per_s buffer_hz
3523 * ------- --------- ---------- ------------- ----------- -------------
3524 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3525 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3526 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3527 * [4] 1,000 1,024 1,000,000 100 976,562 100
3528 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3529 * [6] 1,000 64 1,000,000 249 15,625,000 249
3531 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3532 * [2] 2.6.26-1-686-bigmem from Debian lenny
3533 * [3] 2.6.26-2-sparc64 from Debian lenny
3534 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3535 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3536 * [6] 2.6.34 from kernel.org on KVM */
/* Reads four hexadecimal fields a, b, c, d from /proc/net/psched and derives
 * 'ticks_per_s' as a * c / b, computed in double precision.  Warnings are
 * logged for an open failure, a short read, invalid parameters and
 * unexpected parameters; the parsed and derived values are logged at debug
 * level.
 *
 * NOTE(review): the validity test preceding the "invalid scheduler
 * parameters" warning is not visible in this listing; confirm that it
 * rejects b == 0, since 'b' is the divisor below. */
3538 static const char fn[] = "/proc/net/psched";
3539 unsigned int a, b, c, d;
3545 stream = fopen(fn, "r");
3547 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3551 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3552 VLOG_WARN("%s: read failed", fn);
3556 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3560 VLOG_WARN("%s: invalid scheduler parameters", fn);
3564 ticks_per_s = (double) a * c / b;
3568 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3571 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3574 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3575 * rate of 'rate' bytes per second. */
/* The product is formed in 64 bits before the division.  In plain unsigned
 * int arithmetic 'rate' * 'ticks' wraps as soon as it exceeds UINT_MAX (for
 * example 125,000,000 bytes/s over 1,000 ticks), which would yield a far too
 * small byte count.  tc_bytes_to_ticks() widens its product the same way. */
3577 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3582 return ((unsigned long long int) rate * ticks) / ticks_per_s;
3585 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3586 * rate of 'rate' bytes per second. */
/* Returns 0 when 'rate' is 0 rather than dividing by zero.  'ticks_per_s' is
 * converted to unsigned long long before the multiplication, which drops any
 * fractional part and keeps the product in 64-bit integer arithmetic. */
3588 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3593 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3596 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3597 * a transmission rate of 'rate' bytes per second. */
/* This is an integer division, so a 'buffer_hz' larger than 'rate' gives 0. */
3599 tc_buffer_per_jiffy(unsigned int rate)
3604 return rate / buffer_hz;
3607 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3608 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3609 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3610 * stores NULL into it if it is absent.
3612 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3615 * Returns 0 if successful, otherwise a positive errno value. */
/* Attributes are parsed starting after the Netlink header and the struct
 * tcmsg.  The policy makes TCA_KIND a mandatory string and TCA_OPTIONS an
 * optional nested attribute; a parse failure is logged with a rate-limited
 * warning. */
3617 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3618 struct nlattr **options)
3620 static const struct nl_policy tca_policy[] = {
3621 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3622 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3624 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3626 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3627 tca_policy, ta, ARRAY_SIZE(ta))) {
3628 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3633 *kind = nl_attr_get_string(ta[TCA_KIND]);
3637 *options = ta[TCA_OPTIONS];
3652 /* Given Netlink 'msg' that describes a class, extracts its class handle
3653 * (the tcmsg's tcm_handle) into '*handlep', its TCA_OPTIONS attribute
3654 * into '*options', and its queue statistics into '*stats'. Any of the output
3655 * arguments may be null.
3657 * Returns 0 if successful, otherwise a positive errno value. */
/* Both TCA_OPTIONS and TCA_STATS2 are mandatory nested attributes here.
 * Statistics come from TCA_STATS2: tx_bytes and tx_packets from
 * TCA_STATS_BASIC, and tx_errors from the 'drops' field of TCA_STATS_QUEUE.
 * The listing ends with a memset() that zeroes '*stats'; the path that
 * reaches it is not visible here. */
3659 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3660 struct nlattr **options, struct netdev_queue_stats *stats)
3662 static const struct nl_policy tca_policy[] = {
3663 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3664 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3666 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3668 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3669 tca_policy, ta, ARRAY_SIZE(ta))) {
3670 VLOG_WARN_RL(&rl, "failed to parse class message");
3675 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3676 *handlep = tc->tcm_handle;
3680 *options = ta[TCA_OPTIONS];
3684 const struct gnet_stats_queue *gsq;
3685 struct gnet_stats_basic gsb;
3687 static const struct nl_policy stats_policy[] = {
3688 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3689 .min_len = sizeof gsb },
3690 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3691 .min_len = sizeof *gsq },
3693 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3695 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3696 sa, ARRAY_SIZE(sa))) {
3697 VLOG_WARN_RL(&rl, "failed to parse class stats");
3701 /* Alignment issues screw up the length of struct gnet_stats_basic on
3702 * some arch/bitsize combinations. Newer versions of Linux have a
3703 * struct gnet_stats_basic_packed, but we can't depend on that. The
3704 * easiest thing to do is just to make a copy. */
3705 memset(&gsb, 0, sizeof gsb);
3706 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3707 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3708 stats->tx_bytes = gsb.bytes;
3709 stats->tx_packets = gsb.packets;
3711 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3712 stats->tx_errors = gsq->drops;
3722 memset(stats, 0, sizeof *stats);
3727 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
 * on 'netdev', passing '*replyp' to tc_transact() to receive the reply. */
/* Builds an RTM_GETTCLASS request with NLM_F_ECHO for class 'handle' under
 * 'parent' on 'netdev' and submits it with tc_transact(), which stores the
 * reply through 'replyp'.  A rate-limited warning naming the device and both
 * handles is logged for a failed query. */
3730 tc_query_class(const struct netdev *netdev,
3731 unsigned int handle, unsigned int parent,
3732 struct ofpbuf **replyp)
3734 struct ofpbuf request;
3735 struct tcmsg *tcmsg;
3738 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3742 tcmsg->tcm_handle = handle;
3743 tcmsg->tcm_parent = parent;
3745 error = tc_transact(&request, replyp);
3747 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3748 netdev_get_name(netdev),
3749 tc_get_major(handle), tc_get_minor(handle),
3750 tc_get_major(parent), tc_get_minor(parent),
3756 /* Equivalent to "tc class del dev <name> handle <handle>". */
/* Sends an RTM_DELTCLASS request with no extra flags, tcm_handle set to
 * 'handle' and tcm_parent set to 0.  No reply is requested.  A rate-limited
 * warning naming the device and handle is logged for a failed deletion. */
3758 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3760 struct ofpbuf request;
3761 struct tcmsg *tcmsg;
3764 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3768 tcmsg->tcm_handle = handle;
3769 tcmsg->tcm_parent = 0;
3771 error = tc_transact(&request, NULL);
3773 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3774 netdev_get_name(netdev),
3775 tc_get_major(handle), tc_get_minor(handle),
3781 /* Equivalent to "tc qdisc del dev <name> root". */
/* Sends an RTM_DELQDISC request for handle 1:0 with parent TC_H_ROOT.  On
 * success, if the device has a cached tc object, its tc_destroy callback is
 * invoked when one is provided, and the cached pointer is then cleared.
 * EINVAL from the kernel gets special treatment, as the comment below
 * explains. */
3783 tc_del_qdisc(struct netdev *netdev)
3785 struct netdev_dev_linux *netdev_dev =
3786 netdev_dev_linux_cast(netdev_get_dev(netdev));
3787 struct ofpbuf request;
3788 struct tcmsg *tcmsg;
3791 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
3795 tcmsg->tcm_handle = tc_make_handle(1, 0);
3796 tcmsg->tcm_parent = TC_H_ROOT;
3798 error = tc_transact(&request, NULL);
3799 if (error == EINVAL) {
3800 /* EINVAL probably means that the default qdisc was in use, in which
3801 * case we've accomplished our purpose. */
3804 if (!error && netdev_dev->tc) {
3805 if (netdev_dev->tc->ops->tc_destroy) {
3806 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3808 netdev_dev->tc = NULL;
3813 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3814 * kernel to determine what they are. Returns 0 if successful, otherwise a
3815 * positive errno value. */
/* Outline of the visible logic:
 *
 *   - An RTM_GETQDISC request with NLM_F_ECHO is sent for handle 1:0, parent
 *     0.
 *   - On success the qdisc kind is parsed and looked up with
 *     tc_lookup_linux_name(); tc_ops_other is the fallback, and an unknown
 *     kind is logged at info level.
 *   - ENOENT selects tc_ops_default.
 *   - Any other error is logged as a warning and selects tc_ops_other.
 *   - The chosen ops' tc_load callback is invoked, and the assertion checks
 *     that it attached a tc object exactly when it reported success.
 *
 * The query error takes precedence over the load error in the return value.
 *
 * NOTE(review): in this listing the long explanatory comment inside the
 * function has no visible terminator. */
3817 tc_query_qdisc(const struct netdev *netdev)
3819 struct netdev_dev_linux *netdev_dev =
3820 netdev_dev_linux_cast(netdev_get_dev(netdev));
3821 struct ofpbuf request, *qdisc;
3822 const struct tc_ops *ops;
3823 struct tcmsg *tcmsg;
3827 if (netdev_dev->tc) {
3831 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3832 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3833 * 2.6.35 without that fix backported to it.
3835 * To avoid the OOPS, we must not make a request that would attempt to dump
3836 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3837 * few others. There are a few ways that I can see to do this, but most of
3838 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3839 * technique chosen here is to assume that any non-default qdisc that we
3840 * create will have a class with handle 1:0. The built-in qdiscs only have
3841 * a class with handle 0:0.
3843 * We could check for Linux 2.6.35+ and use a more straightforward method
3845 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
3849 tcmsg->tcm_handle = tc_make_handle(1, 0);
3850 tcmsg->tcm_parent = 0;
3852 /* Figure out what tc class to instantiate. */
3853 error = tc_transact(&request, &qdisc);
3857 error = tc_parse_qdisc(qdisc, &kind, NULL);
3859 ops = &tc_ops_other;
3861 ops = tc_lookup_linux_name(kind);
3863 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3864 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3866 ops = &tc_ops_other;
3869 } else if (error == ENOENT) {
3870 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3871 * other entity that doesn't have a handle 1:0. We will assume
3872 * that it's the system default qdisc. */
3873 ops = &tc_ops_default;
3876 /* Who knows? Maybe the device got deleted. */
3877 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3878 netdev_get_name(netdev), strerror(error));
3879 ops = &tc_ops_other;
3882 /* Instantiate it. */
3883 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
3884 assert((load_error == 0) == (netdev_dev->tc != NULL));
3885 ofpbuf_delete(qdisc);
3887 return error ? error : load_error;
3890 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3891 approximate the time to transmit packets of various lengths. For an MTU of
3892 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3893 represents two possible packet lengths; for a MTU of 513 through 1024, four
3894 possible lengths; and so on.
3896 Returns, for the specified 'mtu', the number of bits that packet lengths
3897 need to be shifted right to fit within such a 256-entry table. */
/* ETH_HEADER_LEN + VLAN_HEADER_LEN is added to 'mtu' before the shift count
 * is computed, and ETH_PAYLOAD_MAX serves as a substitute MTU.  The loop
 * counts iterations for as long as the working value is at least 256.
 *
 * NOTE(review): the condition selecting the substitute MTU and the loop body
 * are not visible in this listing. */
3899 tc_calc_cell_log(unsigned int mtu)
3904 mtu = ETH_PAYLOAD_MAX;
3906 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
3908 for (cell_log = 0; mtu >= 256; cell_log++) {
3915 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
 * of 'mtu'. */
/* Zeroes '*rate', then sets 'cell_log' from tc_calc_cell_log(mtu) and 'mpu'
 * to ETH_TOTAL_MIN.  'overhead' and 'cell_align' are left at the zero written
 * by memset(); the assignments are commented out because some distribution
 * headers lack those fields.
 *
 * NOTE(review): the statement that stores 'Bps' into '*rate' is not visible
 * in this listing. */
3918 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
3920 memset(rate, 0, sizeof *rate);
3921 rate->cell_log = tc_calc_cell_log(mtu);
3922 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3923 /* rate->cell_align = 0; */ /* distro headers. */
3924 rate->mpu = ETH_TOTAL_MIN;
3928 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
3929 * attribute of the specified "type".
3931 * See tc_calc_cell_log() above for a description of "rtab"s. */
/* The attribute payload is TC_RTAB_SIZE bytes, reserved uninitialized and
 * then filled entry by entry.  Entry i describes packets of
 * (i + 1) << cell_log bytes, raised to 'rate->mpu' when smaller, and holds
 * the transmit time in ticks from tc_bytes_to_ticks(). */
3933 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
3938 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
3939 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
3940 unsigned packet_size = (i + 1) << rate->cell_log;
3941 if (packet_size < rate->mpu) {
3942 packet_size = rate->mpu;
3944 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.)
 *
 * The burst actually used is never smaller than tc_buffer_per_jiffy(Bps) plus
 * 'mtu'.  The result is that burst converted by tc_bytes_to_ticks(). */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
}
3959 /* Public utility functions. */
/* Copies each of the statistics counters listed below from '*src' to '*dst'.
 *
 * The macro takes no arguments: pointers named 'src' and 'dst' must be in
 * scope where it is used, and both pointed-to structures must have all of the
 * listed members.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to exactly
 * one statement and remains correct as the body of an unbraced 'if' or
 * loop. */
#define COPY_NETDEV_STATS                                   \
    do {                                                    \
        dst->rx_packets = src->rx_packets;                  \
        dst->tx_packets = src->tx_packets;                  \
        dst->rx_bytes = src->rx_bytes;                      \
        dst->tx_bytes = src->tx_bytes;                      \
        dst->rx_errors = src->rx_errors;                    \
        dst->tx_errors = src->tx_errors;                    \
        dst->rx_dropped = src->rx_dropped;                  \
        dst->tx_dropped = src->tx_dropped;                  \
        dst->multicast = src->multicast;                    \
        dst->collisions = src->collisions;                  \
        dst->rx_length_errors = src->rx_length_errors;      \
        dst->rx_over_errors = src->rx_over_errors;          \
        dst->rx_crc_errors = src->rx_crc_errors;            \
        dst->rx_frame_errors = src->rx_frame_errors;        \
        dst->rx_fifo_errors = src->rx_fifo_errors;          \
        dst->rx_missed_errors = src->rx_missed_errors;      \
        dst->tx_aborted_errors = src->tx_aborted_errors;    \
        dst->tx_carrier_errors = src->tx_carrier_errors;    \
        dst->tx_fifo_errors = src->tx_fifo_errors;          \
        dst->tx_heartbeat_errors = src->tx_heartbeat_errors; \
        dst->tx_window_errors = src->tx_window_errors;      \
    } while (0)
3984 /* Copies 'src' into 'dst', performing format conversion in the process. */
3986 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
3987 const struct rtnl_link_stats *src)
3992 /* Copies 'src' into 'dst', performing format conversion in the process. */
3994 netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
3995 const struct rtnl_link_stats64 *src)
4000 /* Copies 'src' into 'dst', performing format conversion in the process. */
4002 netdev_stats_to_rtnl_link_stats64(struct rtnl_link_stats64 *dst,
4003 const struct netdev_stats *src)
4006 dst->rx_compressed = 0;
4007 dst->tx_compressed = 0;
4010 /* Utility functions. */
4013 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4015 /* Policy for RTNLGRP_LINK messages.
4017 * There are *many* more fields in these messages, but currently we only
4018 * care about these fields. */
4019 static const struct nl_policy rtnlgrp_link_policy[] = {
4020 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4021 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4022 .min_len = sizeof(struct rtnl_link_stats) },
4025 struct ofpbuf request;
4026 struct ofpbuf *reply;
4027 struct ifinfomsg *ifi;
4028 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
4031 ofpbuf_init(&request, 0);
4032 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4033 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4034 ifi->ifi_family = PF_UNSPEC;
4035 ifi->ifi_index = ifindex;
4036 error = nl_sock_transact(rtnl_sock, &request, &reply);
4037 ofpbuf_uninit(&request);
4042 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4043 rtnlgrp_link_policy,
4044 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4045 ofpbuf_delete(reply);
4049 if (!attrs[IFLA_STATS]) {
4050 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4051 ofpbuf_delete(reply);
4055 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4057 ofpbuf_delete(reply);
4063 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4065 static const char fn[] = "/proc/net/dev";
4070 stream = fopen(fn, "r");
4072 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4077 while (fgets(line, sizeof line, stream)) {
4080 #define X64 "%"SCNu64
4083 X64 X64 X64 X64 X64 X64 X64 "%*u"
4084 X64 X64 X64 X64 X64 X64 X64 "%*u",
4090 &stats->rx_fifo_errors,
4091 &stats->rx_frame_errors,
4097 &stats->tx_fifo_errors,
4099 &stats->tx_carrier_errors) != 15) {
4100 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4101 } else if (!strcmp(devname, netdev_name)) {
4102 stats->rx_length_errors = UINT64_MAX;
4103 stats->rx_over_errors = UINT64_MAX;
4104 stats->rx_crc_errors = UINT64_MAX;
4105 stats->rx_missed_errors = UINT64_MAX;
4106 stats->tx_aborted_errors = UINT64_MAX;
4107 stats->tx_heartbeat_errors = UINT64_MAX;
4108 stats->tx_window_errors = UINT64_MAX;
4114 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
4120 get_flags(const struct netdev *netdev, int *flags)
4125 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
4127 *flags = ifr.ifr_flags;
4132 set_flags(struct netdev *netdev, int flags)
4136 ifr.ifr_flags = flags;
4137 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
4142 do_get_ifindex(const char *netdev_name)
4146 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4147 COVERAGE_INC(netdev_get_ifindex);
4148 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4149 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4150 netdev_name, strerror(errno));
4153 return ifr.ifr_ifindex;
4157 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4159 struct netdev_dev_linux *netdev_dev =
4160 netdev_dev_linux_cast(netdev_get_dev(netdev_));
4162 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4163 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4167 netdev_dev->cache_valid |= VALID_IFINDEX;
4168 netdev_dev->ifindex = ifindex;
4170 *ifindexp = netdev_dev->ifindex;
4175 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4180 memset(&ifr, 0, sizeof ifr);
4181 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4182 COVERAGE_INC(netdev_get_hwaddr);
4183 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4184 /* ENODEV probably means that a vif disappeared asynchronously and
4185 * hasn't been removed from the database yet, so reduce the log level
4186 * to INFO for that case. */
4187 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4188 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4189 netdev_name, strerror(errno));
4192 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4193 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4194 VLOG_WARN("%s device has unknown hardware address family %d",
4195 netdev_name, hwaddr_family);
4197 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4202 set_etheraddr(const char *netdev_name, int hwaddr_family,
4203 const uint8_t mac[ETH_ADDR_LEN])
4207 memset(&ifr, 0, sizeof ifr);
4208 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4209 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4210 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4211 COVERAGE_INC(netdev_set_hwaddr);
4212 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4213 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4214 netdev_name, strerror(errno));
4221 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4222 int cmd, const char *cmd_name)
4226 memset(&ifr, 0, sizeof ifr);
4227 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4228 ifr.ifr_data = (caddr_t) ecmd;
4231 COVERAGE_INC(netdev_ethtool);
4232 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4235 if (errno != EOPNOTSUPP) {
4236 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4237 "failed: %s", cmd_name, name, strerror(errno));
4239 /* The device doesn't support this operation. That's pretty
4240 * common, so there's no point in logging anything. */
4246 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4247 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4249 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4250 const char *flag_name, bool enable)
4252 const char *netdev_name = netdev_get_name(netdev);
4253 struct ethtool_value evalue;
4257 memset(&evalue, 0, sizeof evalue);
4258 error = netdev_linux_do_ethtool(netdev_name,
4259 (struct ethtool_cmd *)&evalue,
4260 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4265 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4266 error = netdev_linux_do_ethtool(netdev_name,
4267 (struct ethtool_cmd *)&evalue,
4268 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
4273 memset(&evalue, 0, sizeof evalue);
4274 error = netdev_linux_do_ethtool(netdev_name,
4275 (struct ethtool_cmd *)&evalue,
4276 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4281 if (new_flags != evalue.data) {
4282 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4283 "device %s failed", enable ? "enable" : "disable",
4284 flag_name, netdev_name);
4292 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4293 const char *cmd_name)
4295 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4296 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4297 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4305 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4306 int cmd, const char *cmd_name)
4311 ifr.ifr_addr.sa_family = AF_INET;
4312 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4314 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4315 *ip = sin->sin_addr;
4320 /* Returns an AF_PACKET raw socket or a negative errno value. */
4322 af_packet_sock(void)
4324 static int sock = INT_MIN;
4326 if (sock == INT_MIN) {
4327 sock = socket(AF_PACKET, SOCK_RAW, 0);
4329 set_nonblocking(sock);
4332 VLOG_ERR("failed to create packet socket: %s", strerror(errno));