2 * Copyright (c) 2009, 2010, 2011 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
21 #include <arpa/inet.h>
23 #include <linux/gen_stats.h>
24 #include <linux/if_tun.h>
26 #include <linux/types.h>
27 #include <linux/ethtool.h>
28 #include <linux/mii.h>
29 #include <linux/pkt_sched.h>
30 #include <linux/rtnetlink.h>
31 #include <linux/sockios.h>
32 #include <linux/version.h>
33 #include <sys/types.h>
34 #include <sys/ioctl.h>
35 #include <sys/socket.h>
36 #include <netpacket/packet.h>
37 #include <net/ethernet.h>
39 #include <linux/if_tunnel.h>
40 #include <net/if_arp.h>
41 #include <net/if_packet.h>
42 #include <net/route.h>
43 #include <netinet/in.h>
50 #include "dpif-linux.h"
51 #include "dynamic-string.h"
52 #include "fatal-signal.h"
55 #include "netdev-provider.h"
56 #include "netdev-vport.h"
58 #include "netlink-socket.h"
60 #include "openflow/openflow.h"
62 #include "poll-loop.h"
63 #include "rtnetlink.h"
64 #include "rtnetlink-link.h"
65 #include "socket-util.h"
70 VLOG_DEFINE_THIS_MODULE(netdev_linux);
72 COVERAGE_DEFINE(netdev_get_vlan_vid);
73 COVERAGE_DEFINE(netdev_set_policing);
74 COVERAGE_DEFINE(netdev_arp_lookup);
75 COVERAGE_DEFINE(netdev_get_ifindex);
76 COVERAGE_DEFINE(netdev_get_hwaddr);
77 COVERAGE_DEFINE(netdev_set_hwaddr);
78 COVERAGE_DEFINE(netdev_ethtool);
80 /* These were introduced in Linux 2.6.14, so they might be missing if we have
82 #ifndef ADVERTISED_Pause
83 #define ADVERTISED_Pause (1 << 13)
85 #ifndef ADVERTISED_Asym_Pause
86 #define ADVERTISED_Asym_Pause (1 << 14)
89 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
92 #define TC_RTAB_SIZE 1024
95 static struct rtnetlink_notifier netdev_linux_cache_notifier;
96 static int cache_notifier_refcount;
99 VALID_IFINDEX = 1 << 0,
100 VALID_ETHERADDR = 1 << 1,
104 VALID_CARRIER = 1 << 5,
105 VALID_IS_PSEUDO = 1 << 6, /* Represents is_internal and is_tap. */
106 VALID_POLICING = 1 << 7,
107 VALID_HAVE_VPORT_STATS = 1 << 8
115 /* Traffic control. */
117 /* An instance of a traffic control class. Always associated with a particular
120 * Each TC implementation subclasses this with whatever additional data it
123 const struct tc_ops *ops;
124 struct hmap queues; /* Contains "struct tc_queue"s.
125 * Read by generic TC layer.
126 * Written only by TC implementation. */
129 /* One traffic control queue.
131 * Each TC implementation subclasses this with whatever additional data it
134 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
135 unsigned int queue_id; /* OpenFlow queue ID. */
138 /* A particular kind of traffic control. Each implementation generally maps to
139 * one particular Linux qdisc class.
141 * The functions below return 0 if successful or a positive errno value on
142 * failure, except where otherwise noted. All of them must be provided, except
143 * where otherwise noted. */
145 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
146 * This is null for tc_ops_default and tc_ops_other, for which there are no
147 * appropriate values. */
148 const char *linux_name;
150 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
151 const char *ovs_name;
153 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
154 * queues. The queues are numbered 0 through n_queues - 1. */
155 unsigned int n_queues;
157 /* Called to install this TC class on 'netdev'. The implementation should
158 * make the Netlink calls required to set up 'netdev' with the right qdisc
159 * and configure it according to 'details'. The implementation may assume
160 * that the current qdisc is the default; that is, there is no need for it
161 * to delete the current qdisc before installing itself.
163 * The contents of 'details' should be documented as valid for 'ovs_name'
164 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
165 * (which is built as ovs-vswitchd.conf.db(8)).
167 * This function must return 0 if and only if it sets 'netdev->tc' to an
168 * initialized 'struct tc'.
170 * (This function is null for tc_ops_other, which cannot be installed. For
171 * other TC classes it should always be nonnull.) */
172 int (*tc_install)(struct netdev *netdev, const struct shash *details);
174 /* Called when the netdev code determines (through a Netlink query) that
175 * this TC class's qdisc is installed on 'netdev', but we didn't install
176 * it ourselves and so don't know any of the details.
178 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
179 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
180 * implementation should parse the other attributes of 'nlmsg' as
181 * necessary to determine its configuration. If necessary it should also
182 * use Netlink queries to determine the configuration of queues on
185 * This function must return 0 if and only if it sets 'netdev->tc' to an
186 * initialized 'struct tc'. */
187 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
189 /* Destroys the data structures allocated by the implementation as part of
190 * 'tc'. (This includes destroying 'tc->queues' by calling
193 * The implementation should not need to perform any Netlink calls. If
194 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
195 * (But it may not be desirable.)
197 * This function may be null if 'tc' is trivial. */
198 void (*tc_destroy)(struct tc *tc);
200 /* Retrieves details of 'netdev->tc' configuration into 'details'.
202 * The implementation should not need to perform any Netlink calls, because
203 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
204 * cached the configuration.
206 * The contents of 'details' should be documented as valid for 'ovs_name'
207 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
208 * (which is built as ovs-vswitchd.conf.db(8)).
210 * This function may be null if 'tc' is not configurable.
212 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
214 /* Reconfigures 'netdev->tc' according to 'details', performing any
215 * required Netlink calls to complete the reconfiguration.
217 * The contents of 'details' should be documented as valid for 'ovs_name'
218 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
219 * (which is built as ovs-vswitchd.conf.db(8)).
221 * This function may be null if 'tc' is not configurable.
223 int (*qdisc_set)(struct netdev *, const struct shash *details);
225 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
226 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
228 * The contents of 'details' should be documented as valid for 'ovs_name'
229 * in the "other_config" column in the "Queue" table in
230 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
232 * The implementation should not need to perform any Netlink calls, because
233 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
234 * cached the queue configuration.
236 * This function may be null if 'tc' does not have queues ('n_queues' is
238 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
239 struct shash *details);
241 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
242 * 'details', performing any required Netlink calls to complete the
243 * reconfiguration. The caller ensures that 'queue_id' is less than
246 * The contents of 'details' should be documented as valid for 'ovs_name'
247 * in the "other_config" column in the "Queue" table in
248 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
250 * This function may be null if 'tc' does not have queues or its queues are
251 * not configurable. */
252 int (*class_set)(struct netdev *, unsigned int queue_id,
253 const struct shash *details);
255 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
256 * tc_queue's within 'netdev->tc->queues'.
258 * This function may be null if 'tc' does not have queues or its queues
259 * cannot be deleted. */
260 int (*class_delete)(struct netdev *, struct tc_queue *queue);
262 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
263 * 'struct tc_queue's within 'netdev->tc->queues'.
265 * On success, initializes '*stats'.
267 * This function may be null if 'tc' does not have queues or if it cannot
268 * report queue statistics. */
269 int (*class_get_stats)(const struct netdev *netdev,
270 const struct tc_queue *queue,
271 struct netdev_queue_stats *stats);
273 /* Extracts queue stats from 'nlmsg', which is a response to a
274 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
276 * This function may be null if 'tc' does not have queues or if it cannot
277 * report queue statistics. */
278 int (*class_dump_stats)(const struct netdev *netdev,
279 const struct ofpbuf *nlmsg,
280 netdev_dump_queue_stats_cb *cb, void *aux);
284 tc_init(struct tc *tc, const struct tc_ops *ops)
287 hmap_init(&tc->queues);
291 tc_destroy(struct tc *tc)
293 hmap_destroy(&tc->queues);
296 static const struct tc_ops tc_ops_htb;
297 static const struct tc_ops tc_ops_hfsc;
298 static const struct tc_ops tc_ops_default;
299 static const struct tc_ops tc_ops_other;
301 static const struct tc_ops *tcs[] = {
302 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
303 &tc_ops_hfsc, /* Hierarchical fair service curve. */
304 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
305 &tc_ops_other, /* Some other qdisc. */
309 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
310 static unsigned int tc_get_major(unsigned int handle);
311 static unsigned int tc_get_minor(unsigned int handle);
313 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
314 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
315 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
317 static struct tcmsg *tc_make_request(const struct netdev *, int type,
318 unsigned int flags, struct ofpbuf *);
319 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
321 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
322 struct nlattr **options);
323 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
324 struct nlattr **options,
325 struct netdev_queue_stats *);
326 static int tc_query_class(const struct netdev *,
327 unsigned int handle, unsigned int parent,
328 struct ofpbuf **replyp);
329 static int tc_delete_class(const struct netdev *, unsigned int handle);
331 static int tc_del_qdisc(struct netdev *netdev);
332 static int tc_query_qdisc(const struct netdev *netdev);
334 static int tc_calc_cell_log(unsigned int mtu);
335 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
336 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
337 const struct tc_ratespec *rate);
338 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
340 struct netdev_dev_linux {
341 struct netdev_dev netdev_dev;
343 struct shash_node *shash_node;
344 unsigned int cache_valid;
346 /* The following are figured out "on demand" only. They are only valid
347 * when the corresponding VALID_* bit in 'cache_valid' is set. */
349 uint8_t etheraddr[ETH_ADDR_LEN];
350 struct in_addr address, netmask;
354 bool is_internal; /* Is this an openvswitch internal device? */
355 bool is_tap; /* Is this a tuntap device? */
356 uint32_t kbits_rate; /* Policing data. */
357 uint32_t kbits_burst;
358 bool have_vport_stats;
362 struct tap_state tap;
366 struct netdev_linux {
367 struct netdev netdev;
371 /* Sockets used for ioctl operations. */
372 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
373 static int af_packet_sock = -1; /* AF_PACKET, SOCK_RAW. */
375 /* A Netlink routing socket that is not subscribed to any multicast groups. */
376 static struct nl_sock *rtnl_sock;
378 struct netdev_linux_notifier {
379 struct netdev_notifier notifier;
383 static struct shash netdev_linux_notifiers =
384 SHASH_INITIALIZER(&netdev_linux_notifiers);
385 static struct rtnetlink_notifier netdev_linux_poll_notifier;
387 /* This is set pretty low because we probably won't learn anything from the
388 * additional log messages. */
389 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
391 static int netdev_linux_init(void);
393 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
394 int cmd, const char *cmd_name);
395 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
396 const char *cmd_name);
397 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
398 int cmd, const char *cmd_name);
399 static int get_flags(const struct netdev *, int *flagsp);
400 static int set_flags(struct netdev *, int flags);
401 static int do_get_ifindex(const char *netdev_name);
402 static int get_ifindex(const struct netdev *, int *ifindexp);
403 static int do_set_addr(struct netdev *netdev,
404 int ioctl_nr, const char *ioctl_name,
405 struct in_addr addr);
406 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
407 static int set_etheraddr(const char *netdev_name, int hwaddr_family,
408 const uint8_t[ETH_ADDR_LEN]);
409 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
410 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
413 is_netdev_linux_class(const struct netdev_class *netdev_class)
415 return netdev_class->init == netdev_linux_init;
418 static struct netdev_dev_linux *
419 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
421 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
422 assert(is_netdev_linux_class(netdev_class));
424 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
427 static struct netdev_linux *
428 netdev_linux_cast(const struct netdev *netdev)
430 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
431 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
432 assert(is_netdev_linux_class(netdev_class));
434 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
438 netdev_linux_init(void)
440 static int status = -1;
442 /* Create AF_INET socket. */
443 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
444 status = af_inet_sock >= 0 ? 0 : errno;
446 VLOG_ERR("failed to create inet socket: %s", strerror(status));
448 /* Create AF_PACKET socket. */
449 af_packet_sock = socket(AF_PACKET, SOCK_RAW, 0);
450 status = af_packet_sock >= 0 ? 0 : errno;
452 VLOG_ERR("failed to create packet socket: %s",
455 set_nonblocking(af_packet_sock);
458 /* Create rtnetlink socket. */
460 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
462 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
471 netdev_linux_run(void)
473 rtnetlink_link_notifier_run();
477 netdev_linux_wait(void)
479 rtnetlink_link_notifier_wait();
483 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
484 void *aux OVS_UNUSED)
486 struct netdev_dev_linux *dev;
488 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
490 const struct netdev_class *netdev_class =
491 netdev_dev_get_class(base_dev);
493 if (is_netdev_linux_class(netdev_class)) {
494 dev = netdev_dev_linux_cast(base_dev);
495 dev->cache_valid = 0;
499 struct shash device_shash;
500 struct shash_node *node;
502 shash_init(&device_shash);
503 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
504 SHASH_FOR_EACH (node, &device_shash) {
506 dev->cache_valid = 0;
508 shash_destroy(&device_shash);
512 /* Creates system and internal devices. */
514 netdev_linux_create(const struct netdev_class *class,
515 const char *name, const struct shash *args,
516 struct netdev_dev **netdev_devp)
518 struct netdev_dev_linux *netdev_dev;
521 if (!shash_is_empty(args)) {
522 VLOG_WARN("%s: arguments for %s devices should be empty",
526 if (!cache_notifier_refcount) {
527 error = rtnetlink_link_notifier_register(&netdev_linux_cache_notifier,
528 netdev_linux_cache_cb, NULL);
533 cache_notifier_refcount++;
535 netdev_dev = xzalloc(sizeof *netdev_dev);
536 netdev_dev_init(&netdev_dev->netdev_dev, name, args, class);
538 *netdev_devp = &netdev_dev->netdev_dev;
542 /* For most types of netdevs we open the device for each call of
543 * netdev_open(). However, this is not the case with tap devices,
544 * since it is only possible to open the device once. In this
545 * situation we share a single file descriptor, and consequently
546 * buffers, across all readers. Therefore once data is read it will
547 * be unavailable to other reads for tap devices. */
549 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
550 const char *name, const struct shash *args,
551 struct netdev_dev **netdev_devp)
553 struct netdev_dev_linux *netdev_dev;
554 struct tap_state *state;
555 static const char tap_dev[] = "/dev/net/tun";
559 if (!shash_is_empty(args)) {
560 VLOG_WARN("%s: arguments for TAP devices should be empty", name);
563 netdev_dev = xzalloc(sizeof *netdev_dev);
564 state = &netdev_dev->state.tap;
566 /* Open tap device. */
567 state->fd = open(tap_dev, O_RDWR);
570 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
574 /* Create tap device. */
575 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
576 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
577 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
578 VLOG_WARN("%s: creating tap device failed: %s", name,
584 /* Make non-blocking. */
585 error = set_nonblocking(state->fd);
590 netdev_dev_init(&netdev_dev->netdev_dev, name, args, &netdev_tap_class);
591 *netdev_devp = &netdev_dev->netdev_dev;
600 destroy_tap(struct netdev_dev_linux *netdev_dev)
602 struct tap_state *state = &netdev_dev->state.tap;
604 if (state->fd >= 0) {
609 /* Destroys the netdev device 'netdev_dev_'. */
611 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
613 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
614 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
616 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
617 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
620 if (class == &netdev_linux_class || class == &netdev_internal_class) {
621 cache_notifier_refcount--;
623 if (!cache_notifier_refcount) {
624 rtnetlink_link_notifier_unregister(&netdev_linux_cache_notifier);
626 } else if (class == &netdev_tap_class) {
627 destroy_tap(netdev_dev);
636 netdev_linux_open(struct netdev_dev *netdev_dev_, int ethertype,
637 struct netdev **netdevp)
639 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
640 struct netdev_linux *netdev;
641 enum netdev_flags flags;
644 /* Allocate network device. */
645 netdev = xzalloc(sizeof *netdev);
647 netdev_init(&netdev->netdev, netdev_dev_);
649 /* Verify that the device really exists, by attempting to read its flags.
650 * (The flags might be cached, in which case this won't actually do an
653 * Don't do this for "internal" netdevs, though, because those have to be
654 * created as netdev objects before they exist in the kernel, because
655 * creating them in the kernel happens by passing a netdev object to
656 * dpif_port_add(). */
657 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
658 error = netdev_get_flags(&netdev->netdev, &flags);
659 if (error == ENODEV) {
664 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
665 !netdev_dev->state.tap.opened) {
667 /* We assume that the first user of the tap device is the primary user
668 * and give them the tap FD. Subsequent users probably just expect
669 * this to be a system device so open it normally to avoid send/receive
670 * directions appearing to be reversed. */
671 netdev->fd = netdev_dev->state.tap.fd;
672 netdev_dev->state.tap.opened = true;
673 } else if (ethertype != NETDEV_ETH_TYPE_NONE) {
674 struct sockaddr_ll sll;
678 /* Create file descriptor. */
679 protocol = (ethertype == NETDEV_ETH_TYPE_ANY ? ETH_P_ALL
680 : ethertype == NETDEV_ETH_TYPE_802_2 ? ETH_P_802_2
682 netdev->fd = socket(PF_PACKET, SOCK_RAW, htons(protocol));
683 if (netdev->fd < 0) {
688 /* Set non-blocking mode. */
689 error = set_nonblocking(netdev->fd);
694 /* Get ethernet device index. */
695 error = get_ifindex(&netdev->netdev, &ifindex);
700 /* Bind to specific ethernet device. */
701 memset(&sll, 0, sizeof sll);
702 sll.sll_family = AF_PACKET;
703 sll.sll_ifindex = ifindex;
705 (struct sockaddr *) &sll, sizeof sll) < 0) {
707 VLOG_ERR("bind to %s failed: %s", netdev_dev_get_name(netdev_dev_),
712 /* Between the socket() and bind() calls above, the socket receives all
713 * packets of the requested type on all system interfaces. We do not
714 * want to receive that data, but there is no way to avoid it. So we
715 * must now drain out the receive queue. */
716 error = drain_rcvbuf(netdev->fd);
722 *netdevp = &netdev->netdev;
726 netdev_uninit(&netdev->netdev, true);
730 /* Closes and destroys 'netdev'. */
732 netdev_linux_close(struct netdev *netdev_)
734 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* "No fd" is represented by a negative value throughout this file, so
 * descriptor 0 is a real descriptor and must be closed too. */
736 if (netdev->fd >= 0 && strcmp(netdev_get_type(netdev_), "tap")) {
742 /* Initializes 'sset' with a list of the names of all known network devices. */
744 netdev_linux_enumerate(struct sset *sset)
746 struct if_nameindex *names;
748 names = if_nameindex();
752 for (i = 0; names[i].if_name != NULL; i++) {
753 sset_add(sset, names[i].if_name);
755 if_freenameindex(names);
758 VLOG_WARN("could not obtain list of network device names: %s",
765 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
767 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
769 if (netdev->fd < 0) {
770 /* Device was opened with NETDEV_ETH_TYPE_NONE. */
775 ssize_t retval = read(netdev->fd, data, size);
778 } else if (errno != EINTR) {
779 if (errno != EAGAIN) {
/* Argument order follows the format string: device name first, then
 * the error text. */
780 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
781 netdev_get_name(netdev_), strerror(errno));
788 /* Registers with the poll loop to wake up from the next call to poll_block()
789 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
791 netdev_linux_recv_wait(struct netdev *netdev_)
793 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
794 if (netdev->fd >= 0) {
795 poll_fd_wait(netdev->fd, POLLIN);
799 /* Discards all packets waiting to be received from 'netdev'. */
801 netdev_linux_drain(struct netdev *netdev_)
803 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
804 if (netdev->fd < 0) {
806 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
808 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
809 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
813 drain_fd(netdev->fd, ifr.ifr_qlen);
816 return drain_rcvbuf(netdev->fd);
820 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
821 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
822 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
823 * the packet is too big or too small to transmit on the device.
825 * The caller retains ownership of 'buffer' in all cases.
827 * The kernel maintains a packet transmission queue, so the caller is not
828 * expected to do additional queuing of packets. */
830 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
832 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
836 if (netdev->fd < 0) {
837 /* Use our AF_PACKET socket to send to this device. */
838 struct sockaddr_ll sll;
844 error = get_ifindex(netdev_, &ifindex);
849 /* We don't bother setting most fields in sockaddr_ll because the
850 * kernel ignores them for SOCK_RAW. */
851 memset(&sll, 0, sizeof sll);
852 sll.sll_family = AF_PACKET;
853 sll.sll_ifindex = ifindex;
855 iov.iov_base = (void *) data;
859 msg.msg_namelen = sizeof sll;
862 msg.msg_control = NULL;
863 msg.msg_controllen = 0;
866 retval = sendmsg(af_packet_sock, &msg, 0);
868 /* Use the netdev's own fd to send to this device. This is
869 * essential for tap devices, because packets sent to a tap device
870 * with an AF_PACKET socket will loop back to be *received* again
871 * on the tap device. */
872 retval = write(netdev->fd, data, size);
876 /* The Linux AF_PACKET implementation never blocks waiting for room
877 * for packets, instead returning ENOBUFS. Translate this into
878 * EAGAIN for the caller. */
879 if (errno == ENOBUFS) {
881 } else if (errno == EINTR) {
883 } else if (errno != EAGAIN) {
884 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
885 netdev_get_name(netdev_), strerror(errno));
888 } else if (retval != size) {
889 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
890 "%zu) on %s", retval, size, netdev_get_name(netdev_));
898 /* Registers with the poll loop to wake up from the next call to poll_block()
899 * when the packet transmission queue has sufficient room to transmit a packet
900 * with netdev_send().
902 * The kernel maintains a packet transmission queue, so the client is not
903 * expected to do additional queuing of packets. Thus, this function is
904 * unlikely to ever be used. It is included for completeness. */
906 netdev_linux_send_wait(struct netdev *netdev_)
908 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
909 if (netdev->fd < 0) {
911 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
912 poll_fd_wait(netdev->fd, POLLOUT);
914 /* TAP device always accepts packets.*/
915 poll_immediate_wake();
919 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
920 * otherwise a positive errno value. */
922 netdev_linux_set_etheraddr(struct netdev *netdev_,
923 const uint8_t mac[ETH_ADDR_LEN])
925 struct netdev_dev_linux *netdev_dev =
926 netdev_dev_linux_cast(netdev_get_dev(netdev_));
929 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
930 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
931 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
933 netdev_dev->cache_valid |= VALID_ETHERADDR;
934 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
942 /* Copies 'netdev''s MAC address into the caller-provided buffer 'mac'
943 * (ETH_ADDR_LEN bytes). */
945 netdev_linux_get_etheraddr(const struct netdev *netdev_,
946 uint8_t mac[ETH_ADDR_LEN])
948 struct netdev_dev_linux *netdev_dev =
949 netdev_dev_linux_cast(netdev_get_dev(netdev_));
950 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
951 int error = get_etheraddr(netdev_get_name(netdev_),
952 netdev_dev->etheraddr);
956 netdev_dev->cache_valid |= VALID_ETHERADDR;
958 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
962 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
963 * in bytes, not including the hardware header; thus, this is typically 1500
964 * bytes for Ethernet devices. */
966 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
968 struct netdev_dev_linux *netdev_dev =
969 netdev_dev_linux_cast(netdev_get_dev(netdev_));
970 if (!(netdev_dev->cache_valid & VALID_MTU)) {
974 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
975 SIOCGIFMTU, "SIOCGIFMTU");
979 netdev_dev->mtu = ifr.ifr_mtu;
980 netdev_dev->cache_valid |= VALID_MTU;
982 *mtup = netdev_dev->mtu;
986 /* Returns the ifindex of 'netdev', if successful, as a positive number.
987 * On failure, returns a negative errno value. */
989 netdev_linux_get_ifindex(const struct netdev *netdev)
993 error = get_ifindex(netdev, &ifindex);
994 return error ? -error : ifindex;
998 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1000 struct netdev_dev_linux *netdev_dev =
1001 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1006 if (!(netdev_dev->cache_valid & VALID_CARRIER)) {
1010 fn = xasprintf("/sys/class/net/%s/carrier",
1011 netdev_get_name(netdev_));
1012 fd = open(fn, O_RDONLY);
1015 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
1019 retval = read(fd, line, sizeof line);
1022 if (error == EINVAL) {
1023 /* This is the normal return value when we try to check carrier
1024 * if the network device is not up. */
1026 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
1029 } else if (retval == 0) {
1031 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
1035 if (line[0] != '0' && line[0] != '1') {
1037 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)",
1041 netdev_dev->carrier = line[0] != '0';
1042 netdev_dev->cache_valid |= VALID_CARRIER;
1044 *carrier = netdev_dev->carrier;
1056 netdev_linux_do_miimon(const struct netdev *netdev, int cmd,
1057 const char *cmd_name, struct mii_ioctl_data *data)
1062 memset(&ifr, 0, sizeof ifr);
1063 memcpy(&ifr.ifr_data, data, sizeof *data);
1064 error = netdev_linux_do_ioctl(netdev_get_name(netdev),
1065 &ifr, cmd, cmd_name);
1066 memcpy(data, &ifr.ifr_data, sizeof *data);
1072 netdev_linux_get_miimon(const struct netdev *netdev, bool *miimon)
1074 const char *name = netdev_get_name(netdev);
1075 struct mii_ioctl_data data;
1080 memset(&data, 0, sizeof data);
1081 error = netdev_linux_do_miimon(netdev, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1083 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1084 data.reg_num = MII_BMSR;
1085 error = netdev_linux_do_miimon(netdev, SIOCGMIIREG, "SIOCGMIIREG",
1089 *miimon = !!(data.val_out & BMSR_LSTATUS);
1091 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1094 struct ethtool_cmd ecmd;
1096 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1099 memset(&ecmd, 0, sizeof ecmd);
1100 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1103 struct ethtool_value eval;
1105 memcpy(&eval, &ecmd, sizeof eval);
1106 *miimon = !!eval.data;
1108 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1115 /* Check whether we can use RTM_GETLINK to get network device statistics.
1116 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1119 check_for_working_netlink_stats(void)
1121 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1122 * preferable, so if that works, we'll use it. */
1123 int ifindex = do_get_ifindex("lo");
1125 VLOG_WARN("failed to get ifindex for lo, "
1126 "obtaining netdev stats from proc");
1129 struct netdev_stats stats;
1130 int error = get_stats_via_netlink(ifindex, &stats);
1132 VLOG_DBG("obtaining netdev stats via rtnetlink");
1135 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1136 "via proc (you are probably running a pre-2.6.19 "
1137 "kernel)", strerror(error));
1143 /* Brings the 'is_internal' and 'is_tap' members of 'netdev_dev' up-to-date. */
1145 netdev_linux_update_is_pseudo(struct netdev_dev_linux *netdev_dev)
1147 if (!(netdev_dev->cache_valid & VALID_IS_PSEUDO)) {
1148 const char *name = netdev_dev_get_name(&netdev_dev->netdev_dev);
1149 const char *type = netdev_dev_get_type(&netdev_dev->netdev_dev);
1151 netdev_dev->is_tap = !strcmp(type, "tap");
1152 netdev_dev->is_internal = (!netdev_dev->is_tap
1153 && dpif_linux_is_internal_device(name));
1154 netdev_dev->cache_valid |= VALID_IS_PSEUDO;
1159 swap_uint64(uint64_t *a, uint64_t *b)
1166 /* Retrieves current device stats for 'netdev'.
 *
 * The vport layer is asked first (netdev_vport_get_stats()) whenever it is
 * known to work or has not been tried yet; the outcome is cached in
 * 'have_vport_stats' together with VALID_HAVE_VPORT_STATS.  When vport stats
 * are unavailable, the stats come from rtnetlink or from proc, depending on
 * a probe whose result is kept in the static 'use_netlink_stats' (-1 means
 * "not probed yet").
 *
 * For internal and tap devices whose stats did not come from the vport
 * layer, the rx/tx packet, byte, error and dropped counters are swapped and
 * the detailed rx and tx error counters are cleared to 0. */
1168 netdev_linux_get_stats(const struct netdev *netdev_,
1169 struct netdev_stats *stats)
1171 struct netdev_dev_linux *netdev_dev =
1172 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1173 static int use_netlink_stats = -1;
1176 if (netdev_dev->have_vport_stats ||
1177 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1179 error = netdev_vport_get_stats(netdev_, stats);
1180 netdev_dev->have_vport_stats = !error;
1181 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
1184 if (!netdev_dev->have_vport_stats) {
1185 if (use_netlink_stats < 0) {
1186 use_netlink_stats = check_for_working_netlink_stats();
1188 if (use_netlink_stats) {
1191 error = get_ifindex(netdev_, &ifindex);
1193 error = get_stats_via_netlink(ifindex, stats);
1196 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1200 /* If this port is an internal port then the transmit and receive stats
1201 * will appear to be swapped relative to the other ports since we are the
1202 * one sending the data, not a remote computer. For consistency, we swap
1203 * them back here. This does not apply if we are getting stats from the
1204 * vport layer because it always tracks stats from the perspective of the
1206 netdev_linux_update_is_pseudo(netdev_dev);
1207 if (!error && !netdev_dev->have_vport_stats &&
1208 (netdev_dev->is_internal || netdev_dev->is_tap)) {
1209 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1210 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1211 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1212 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1213 stats->rx_length_errors = 0;
1214 stats->rx_over_errors = 0;
1215 stats->rx_crc_errors = 0;
1216 stats->rx_frame_errors = 0;
1217 stats->rx_fifo_errors = 0;
1218 stats->rx_missed_errors = 0;
1219 stats->tx_aborted_errors = 0;
1220 stats->tx_carrier_errors = 0;
1221 stats->tx_fifo_errors = 0;
1222 stats->tx_heartbeat_errors = 0;
1223 stats->tx_window_errors = 0;
1229 /* Stores the features supported by 'netdev' into each of '*current',
1230 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1231 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
1232 * successful, otherwise a positive errno value.
 *
 * The data comes from a single ETHTOOL_GSET query.  The 'supported' and
 * 'advertising' masks are translated bit by bit into OFPPF_* values; the
 * current setting is derived from 'speed', 'duplex' and 'port' (10 Gb/s is
 * always reported as full duplex).  '*peer' is always set to 0.
 *
 * NOTE(review): the assignments below write through 'current', 'advertised',
 * 'supported' and 'peer' without a null check on the same line, although the
 * text above speaks of "non-null" arguments -- confirm that callers never
 * pass null here. */
1234 netdev_linux_get_features(const struct netdev *netdev,
1235 uint32_t *current, uint32_t *advertised,
1236 uint32_t *supported, uint32_t *peer)
1238 struct ethtool_cmd ecmd;
1241 memset(&ecmd, 0, sizeof ecmd);
1242 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1243 ETHTOOL_GSET, "ETHTOOL_GSET");
1248 /* Supported features. */
1250 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1251 *supported |= OFPPF_10MB_HD;
1253 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1254 *supported |= OFPPF_10MB_FD;
1256 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1257 *supported |= OFPPF_100MB_HD;
1259 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1260 *supported |= OFPPF_100MB_FD;
1262 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1263 *supported |= OFPPF_1GB_HD;
1265 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1266 *supported |= OFPPF_1GB_FD;
1268 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1269 *supported |= OFPPF_10GB_FD;
1271 if (ecmd.supported & SUPPORTED_TP) {
1272 *supported |= OFPPF_COPPER;
1274 if (ecmd.supported & SUPPORTED_FIBRE) {
1275 *supported |= OFPPF_FIBER;
1277 if (ecmd.supported & SUPPORTED_Autoneg) {
1278 *supported |= OFPPF_AUTONEG;
1280 if (ecmd.supported & SUPPORTED_Pause) {
1281 *supported |= OFPPF_PAUSE;
1283 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1284 *supported |= OFPPF_PAUSE_ASYM;
1287 /* Advertised features. */
1289 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1290 *advertised |= OFPPF_10MB_HD;
1292 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1293 *advertised |= OFPPF_10MB_FD;
1295 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1296 *advertised |= OFPPF_100MB_HD;
1298 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1299 *advertised |= OFPPF_100MB_FD;
1301 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1302 *advertised |= OFPPF_1GB_HD;
1304 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1305 *advertised |= OFPPF_1GB_FD;
1307 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1308 *advertised |= OFPPF_10GB_FD;
1310 if (ecmd.advertising & ADVERTISED_TP) {
1311 *advertised |= OFPPF_COPPER;
1313 if (ecmd.advertising & ADVERTISED_FIBRE) {
1314 *advertised |= OFPPF_FIBER;
1316 if (ecmd.advertising & ADVERTISED_Autoneg) {
1317 *advertised |= OFPPF_AUTONEG;
1319 if (ecmd.advertising & ADVERTISED_Pause) {
1320 *advertised |= OFPPF_PAUSE;
1322 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1323 *advertised |= OFPPF_PAUSE_ASYM;
1326 /* Current settings. */
1327 if (ecmd.speed == SPEED_10) {
1328 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1329 } else if (ecmd.speed == SPEED_100) {
1330 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1331 } else if (ecmd.speed == SPEED_1000) {
1332 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1333 } else if (ecmd.speed == SPEED_10000) {
1334 *current = OFPPF_10GB_FD;
1339 if (ecmd.port == PORT_TP) {
1340 *current |= OFPPF_COPPER;
1341 } else if (ecmd.port == PORT_FIBRE) {
1342 *current |= OFPPF_FIBER;
1346 *current |= OFPPF_AUTONEG;
1349 /* Peer advertisements. */
1350 *peer = 0; /* XXX */
1355 /* Set the features advertised by 'netdev' to 'advertise'.
 *
 * 'advertise' is a bitmap of OFPPF_* bits.  The current ethtool settings are
 * read with ETHTOOL_GSET, only the 'advertising' mask is rebuilt from
 * 'advertise' (starting from 0), and the structure is written back with
 * ETHTOOL_SSET, whose result is returned. */
1357 netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1359 struct ethtool_cmd ecmd;
1362 memset(&ecmd, 0, sizeof ecmd);
1363 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1364 ETHTOOL_GSET, "ETHTOOL_GSET");
1369 ecmd.advertising = 0;
1370 if (advertise & OFPPF_10MB_HD) {
1371 ecmd.advertising |= ADVERTISED_10baseT_Half;
1373 if (advertise & OFPPF_10MB_FD) {
1374 ecmd.advertising |= ADVERTISED_10baseT_Full;
1376 if (advertise & OFPPF_100MB_HD) {
1377 ecmd.advertising |= ADVERTISED_100baseT_Half;
1379 if (advertise & OFPPF_100MB_FD) {
1380 ecmd.advertising |= ADVERTISED_100baseT_Full;
1382 if (advertise & OFPPF_1GB_HD) {
1383 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1385 if (advertise & OFPPF_1GB_FD) {
1386 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1388 if (advertise & OFPPF_10GB_FD) {
1389 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1391 if (advertise & OFPPF_COPPER) {
1392 ecmd.advertising |= ADVERTISED_TP;
1394 if (advertise & OFPPF_FIBER) {
1395 ecmd.advertising |= ADVERTISED_FIBRE;
1397 if (advertise & OFPPF_AUTONEG) {
1398 ecmd.advertising |= ADVERTISED_Autoneg;
1400 if (advertise & OFPPF_PAUSE) {
1401 ecmd.advertising |= ADVERTISED_Pause;
1403 if (advertise & OFPPF_PAUSE_ASYM) {
1404 ecmd.advertising |= ADVERTISED_Asym_Pause;
1406 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1407 ETHTOOL_SSET, "ETHTOOL_SSET");
1410 /* If 'netdev_name' is the name of a VLAN network device (e.g. one created with
1411 * vconfig(8)), sets '*vlan_vid' to the VLAN VID associated with that device
1412 * and returns 0. Otherwise returns an errno value (specifically ENOENT if
1413 * 'netdev_name' is the name of a network device that is not a VLAN device) and
1414 * sets '*vlan_vid' to -1.
 *
 * The VID is taken from the first line of the device's file under
 * /proc/net/vlan/.  That line counts as parsed only when sscanf() reports
 * exactly one converted field; this also rejects an EOF result (for example
 * an empty first line), which a plain "!sscanf(...)" test would let through
 * as success because EOF is nonzero. */
1416 netdev_linux_get_vlan_vid(const struct netdev *netdev, int *vlan_vid)
1418 const char *netdev_name = netdev_get_name(netdev);
1419 struct ds line = DS_EMPTY_INITIALIZER;
1420 FILE *stream = NULL;
1424 COVERAGE_INC(netdev_get_vlan_vid);
1425 fn = xasprintf("/proc/net/vlan/%s", netdev_name);
1426 stream = fopen(fn, "r");
1432 if (ds_get_line(&line, stream)) {
1433 if (ferror(stream)) {
1435 VLOG_ERR_RL(&rl, "error reading \"%s\": %s", fn, strerror(errno));
1438 VLOG_ERR_RL(&rl, "unexpected end of file reading \"%s\"", fn);
1443 if (sscanf(ds_cstr(&line), "%*s VID: %d", vlan_vid) != 1) {
1445 VLOG_ERR_RL(&rl, "parse error reading \"%s\" line 1: \"%s\"",
1446 fn, ds_cstr(&line));
/* Shell command templates that netdev_linux_set_policing() formats with
 * snprintf() and runs through system().  The %s is the device name;
 * POLICE_CONFIG_CMD additionally takes the rate and the burst, both as %d. */
1464 #define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress"
1465 #define POLICE_CONFIG_CMD "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 u32 match ip src 0.0.0.0/0 police rate %dkbit burst %dk mtu 65535 drop flowid :1"
/* Implementation note for netdev_linux_remove_policing(): the function builds
 * an RTM_DELQDISC request for the "ingress" qdisc (handle ffff:0, parent
 * TC_H_INGRESS) and sends it with tc_transact().  ENOENT and EINVAL results
 * are not logged as failures.  The cached 'kbits_rate' and 'kbits_burst' are
 * reset to 0 and VALID_POLICING is set by the final assignments. */
1467 /* Remove ingress policing from 'netdev'. Returns 0 if successful, otherwise a
1468 * positive errno value.
1470 * This function is equivalent to running
1471 * /sbin/tc qdisc del dev %s handle ffff: ingress
1472 * but it is much, much faster.
1475 netdev_linux_remove_policing(struct netdev *netdev)
1477 struct netdev_dev_linux *netdev_dev =
1478 netdev_dev_linux_cast(netdev_get_dev(netdev));
1479 const char *netdev_name = netdev_get_name(netdev);
1481 struct ofpbuf request;
1482 struct tcmsg *tcmsg;
1485 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
1489 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
1490 tcmsg->tcm_parent = TC_H_INGRESS;
1491 nl_msg_put_string(&request, TCA_KIND, "ingress");
1492 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
1494 error = tc_transact(&request, NULL);
1495 if (error && error != ENOENT && error != EINVAL) {
1496 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1497 netdev_name, strerror(error));
1501 netdev_dev->kbits_rate = 0;
1502 netdev_dev->kbits_burst = 0;
1503 netdev_dev->cache_valid |= VALID_POLICING;
1507 /* Attempts to set input rate limiting (policing) policy.
 *
 * 'kbits_rate' and 'kbits_burst' are the requested rate and burst.  The
 * burst is normalized first (0 when no rate is given, 1000 when left at 0).
 * If the cache already holds the same rate and burst, the settings are
 * assumed to be in place.  Otherwise existing policing is removed and the
 * POLICE_ADD_CMD and POLICE_CONFIG_CMD shell commands are run.
 *
 * NOTE(review): the commands are built with snprintf() and executed via
 * system(), with 'netdev_name' interpolated unquoted; the uint32_t rate and
 * burst are also formatted with %d.  Worth confirming both are acceptable. */
1509 netdev_linux_set_policing(struct netdev *netdev,
1510 uint32_t kbits_rate, uint32_t kbits_burst)
1512 struct netdev_dev_linux *netdev_dev =
1513 netdev_dev_linux_cast(netdev_get_dev(netdev));
1514 const char *netdev_name = netdev_get_name(netdev);
1517 COVERAGE_INC(netdev_set_policing);
1519 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1520 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1521 : kbits_burst); /* Stick with user-specified value. */
1523 if (netdev_dev->cache_valid & VALID_POLICING
1524 && netdev_dev->kbits_rate == kbits_rate
1525 && netdev_dev->kbits_burst == kbits_burst) {
1526 /* Assume that settings haven't changed since we last set them. */
1530 netdev_linux_remove_policing(netdev);
1532 snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
1533 if (system(command) != 0) {
1534 VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
1538 snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
1539 kbits_rate, kbits_burst);
1540 if (system(command) != 0) {
1541 VLOG_WARN_RL(&rl, "%s: problem configuring policing",
1546 netdev_dev->kbits_rate = kbits_rate;
1547 netdev_dev->kbits_burst = kbits_burst;
1548 netdev_dev->cache_valid |= VALID_POLICING;
/* Adds to 'types' the 'ovs_name' of every entry in the null-terminated 'tcs'
 * array that has a 'tc_install' hook and a non-empty name.  'netdev' is
 * unused. */
1555 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1558 const struct tc_ops **opsp;
1560 for (opsp = tcs; *opsp != NULL; opsp++) {
1561 const struct tc_ops *ops = *opsp;
1562 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1563 sset_add(types, ops->ovs_name);
/* Scans the null-terminated 'tcs' array for the entry whose 'ovs_name' equals
 * 'name'.  Callers in this file test the result against null, so a failed
 * lookup presumably yields NULL (TODO confirm). */
1569 static const struct tc_ops *
1570 tc_lookup_ovs_name(const char *name)
1572 const struct tc_ops **opsp;
1574 for (opsp = tcs; *opsp != NULL; opsp++) {
1575 const struct tc_ops *ops = *opsp;
1576 if (!strcmp(name, ops->ovs_name)) {
/* Scans the null-terminated 'tcs' array for the entry whose 'linux_name'
 * equals 'name'.  Entries with a null 'linux_name' never match. */
1583 static const struct tc_ops *
1584 tc_lookup_linux_name(const char *name)
1586 const struct tc_ops **opsp;
1588 for (opsp = tcs; *opsp != NULL; opsp++) {
1589 const struct tc_ops *ops = *opsp;
1590 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Searches the bucket selected by 'hash' in the 'queues' map of the netdev's
 * current 'tc' for the queue whose 'queue_id' equals 'queue_id'. */
1597 static struct tc_queue *
1598 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1601 struct netdev_dev_linux *netdev_dev =
1602 netdev_dev_linux_cast(netdev_get_dev(netdev));
1603 struct tc_queue *queue;
1605 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1606 if (queue->queue_id == queue_id) {
/* Convenience wrapper around tc_find_queue__() that derives the hash from
 * 'queue_id' with hash_int(queue_id, 0). */
1613 static struct tc_queue *
1614 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1616 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Looks up the traffic-control implementation named 'type' with
 * tc_lookup_ovs_name() and reports its 'n_queues' in 'caps->n_queues'.
 * 'netdev' is unused. */
1620 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1622 struct netdev_qos_capabilities *caps)
1624 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1628 caps->n_queues = ops->n_queues;
/* Refreshes the qdisc state with tc_query_qdisc(), stores the OVS name of the
 * current qdisc type in '*typep' and, when the type provides a 'qdisc_get'
 * hook, returns that hook's result after it has filled in 'details'. */
1633 netdev_linux_get_qos(const struct netdev *netdev,
1634 const char **typep, struct shash *details)
1636 struct netdev_dev_linux *netdev_dev =
1637 netdev_dev_linux_cast(netdev_get_dev(netdev));
1640 error = tc_query_qdisc(netdev);
1645 *typep = netdev_dev->tc->ops->ovs_name;
1646 return (netdev_dev->tc->ops->qdisc_get
1647 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
/* Switches 'netdev' to the QoS type named 'type', configured by 'details'.
 *
 * A type that is unknown or has no 'tc_install' hook is handled separately
 * (presumably rejected -- TODO confirm).  When the requested type is already
 * the installed one, only its 'qdisc_set' hook (if any) is applied.
 * Otherwise the existing qdisc is deleted and the new one installed; the
 * assertions document that 'tc' is null after deletion and non-null exactly
 * when installation succeeds. */
1652 netdev_linux_set_qos(struct netdev *netdev,
1653 const char *type, const struct shash *details)
1655 struct netdev_dev_linux *netdev_dev =
1656 netdev_dev_linux_cast(netdev_get_dev(netdev));
1657 const struct tc_ops *new_ops;
1660 new_ops = tc_lookup_ovs_name(type);
1661 if (!new_ops || !new_ops->tc_install) {
1665 error = tc_query_qdisc(netdev);
1670 if (new_ops == netdev_dev->tc->ops) {
1671 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1673 /* Delete existing qdisc. */
1674 error = tc_del_qdisc(netdev);
1678 assert(netdev_dev->tc == NULL);
1680 /* Install new qdisc. */
1681 error = new_ops->tc_install(netdev, details);
1682 assert((error == 0) == (netdev_dev->tc != NULL));
/* Refreshes the qdisc state with tc_query_qdisc(), looks up queue 'queue_id'
 * with tc_find_queue() and, through a conditional, passes it to the current
 * type's 'class_get' hook so that it can fill in 'details'. */
1689 netdev_linux_get_queue(const struct netdev *netdev,
1690 unsigned int queue_id, struct shash *details)
1692 struct netdev_dev_linux *netdev_dev =
1693 netdev_dev_linux_cast(netdev_get_dev(netdev));
1696 error = tc_query_qdisc(netdev);
1700 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1702 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
/* Configures queue 'queue_id' from 'details' through the current type's
 * 'class_set' hook.  A 'queue_id' at or beyond the type's 'n_queues', or a
 * type without 'class_set', takes a separate branch instead of reaching the
 * hook. */
1708 netdev_linux_set_queue(struct netdev *netdev,
1709 unsigned int queue_id, const struct shash *details)
1711 struct netdev_dev_linux *netdev_dev =
1712 netdev_dev_linux_cast(netdev_get_dev(netdev));
1715 error = tc_query_qdisc(netdev);
1718 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1719 || !netdev_dev->tc->ops->class_set) {
1723 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
/* Deletes queue 'queue_id' through the current type's 'class_delete' hook.
 * A type without that hook takes a separate branch; otherwise the queue is
 * looked up with tc_find_queue() and passed to the hook via a conditional. */
1727 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1729 struct netdev_dev_linux *netdev_dev =
1730 netdev_dev_linux_cast(netdev_get_dev(netdev));
1733 error = tc_query_qdisc(netdev);
1736 } else if (!netdev_dev->tc->ops->class_delete) {
1739 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1741 ? netdev_dev->tc->ops->class_delete(netdev, queue)
/* Retrieves statistics for queue 'queue_id' into 'stats' through the current
 * type's 'class_get_stats' hook.  A type without that hook takes a separate
 * branch; otherwise the queue is looked up with tc_find_queue() and passed to
 * the hook via a conditional. */
1747 netdev_linux_get_queue_stats(const struct netdev *netdev,
1748 unsigned int queue_id,
1749 struct netdev_queue_stats *stats)
1751 struct netdev_dev_linux *netdev_dev =
1752 netdev_dev_linux_cast(netdev_get_dev(netdev));
1755 error = tc_query_qdisc(netdev);
1758 } else if (!netdev_dev->tc->ops->class_get_stats) {
1761 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1763 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
/* Starts a Netlink dump on 'rtnl_sock' using an RTM_GETTCLASS request for
 * 'netdev' with 'tcm_parent' set to 0, storing the dump state in 'dump'.
 * The request buffer is released right after nl_dump_start(). */
1769 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1771 struct ofpbuf request;
1772 struct tcmsg *tcmsg;
1774 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1778 tcmsg->tcm_parent = 0;
1779 nl_dump_start(dump, rtnl_sock, &request);
1780 ofpbuf_uninit(&request);
/* Iterates over every queue in the current qdisc's 'queues' map, asks the
 * type's 'class_get' hook for the queue's details and passes the queue ID,
 * the details and 'aux' to 'cb'.  One 'details' table is reused: it is
 * cleared before each queue and destroyed at the end.  A type without
 * 'class_get' takes a separate branch. */
1785 netdev_linux_dump_queues(const struct netdev *netdev,
1786 netdev_dump_queues_cb *cb, void *aux)
1788 struct netdev_dev_linux *netdev_dev =
1789 netdev_dev_linux_cast(netdev_get_dev(netdev));
1790 struct tc_queue *queue;
1791 struct shash details;
1795 error = tc_query_qdisc(netdev);
1798 } else if (!netdev_dev->tc->ops->class_get) {
1803 shash_init(&details);
1804 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
1805 shash_clear(&details);
1807 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
1809 (*cb)(queue->queue_id, &details, aux);
1814 shash_destroy(&details);
/* Dumps per-queue statistics: starts a class dump with start_queue_dump() and
 * feeds each reply message, together with 'cb' and 'aux', to the current
 * type's 'class_dump_stats' hook.  The return value is the error from
 * nl_dump_done() if there is one, otherwise 'last_error'.  A type without
 * 'class_dump_stats' takes a separate branch. */
1820 netdev_linux_dump_queue_stats(const struct netdev *netdev,
1821 netdev_dump_queue_stats_cb *cb, void *aux)
1823 struct netdev_dev_linux *netdev_dev =
1824 netdev_dev_linux_cast(netdev_get_dev(netdev));
1825 struct nl_dump dump;
1830 error = tc_query_qdisc(netdev);
1833 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1838 if (!start_queue_dump(netdev, &dump)) {
1841 while (nl_dump_next(&dump, &msg)) {
1842 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1848 error = nl_dump_done(&dump);
1849 return error ? error : last_error;
/* Stores the IPv4 address and netmask of 'netdev_' in '*address' and
 * '*netmask'.  The values are fetched with SIOCGIFADDR and SIOCGIFNETMASK
 * only while VALID_IN4 is clear in the cache and are served from the cache
 * afterwards.  Returns EADDRNOTAVAIL when the address is INADDR_ANY,
 * otherwise 0 on this path. */
1853 netdev_linux_get_in4(const struct netdev *netdev_,
1854 struct in_addr *address, struct in_addr *netmask)
1856 struct netdev_dev_linux *netdev_dev =
1857 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1859 if (!(netdev_dev->cache_valid & VALID_IN4)) {
1862 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
1863 SIOCGIFADDR, "SIOCGIFADDR");
1868 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
1869 SIOCGIFNETMASK, "SIOCGIFNETMASK");
1874 netdev_dev->cache_valid |= VALID_IN4;
1876 *address = netdev_dev->address;
1877 *netmask = netdev_dev->netmask;
1878 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
/* Assigns IPv4 'address' and 'netmask' to 'netdev_'.  The address is set
 * with SIOCSIFADDR; the cache (VALID_IN4, 'address', 'netmask') is updated,
 * and the netmask is applied with SIOCSIFNETMASK only when 'address' is not
 * INADDR_ANY. */
1882 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
1883 struct in_addr netmask)
1885 struct netdev_dev_linux *netdev_dev =
1886 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1889 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
1891 netdev_dev->cache_valid |= VALID_IN4;
1892 netdev_dev->address = address;
1893 netdev_dev->netmask = netmask;
1894 if (address.s_addr != INADDR_ANY) {
1895 error = do_set_addr(netdev_, SIOCSIFNETMASK,
1896 "SIOCSIFNETMASK", netmask);
/* Parses one text 'line' with a scanf-style format: sixteen two-digit hex
 * bytes go into 'in6->s6_addr', four further hex fields are skipped, and an
 * interface name of at most 16 characters is read (presumably into 'ifname',
 * which has room for 16 characters plus the terminator). */
1903 parse_if_inet6_line(const char *line,
1904 struct in6_addr *in6, char ifname[16 + 1])
1906 uint8_t *s6 = in6->s6_addr;
1907 #define X8 "%2"SCNx8
1909 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
1910 "%*x %*x %*x %*x %16s\n",
1911 &s6[0], &s6[1], &s6[2], &s6[3],
1912 &s6[4], &s6[5], &s6[6], &s6[7],
1913 &s6[8], &s6[9], &s6[10], &s6[11],
1914 &s6[12], &s6[13], &s6[14], &s6[15],
1918 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
1919 * 'in6' is non-null) and returns true. Otherwise, returns false.
 *
 * The address is looked up in /proc/net/if_inet6 only while VALID_IN6 is
 * clear in the cache: the cached value starts as in6addr_any and is replaced
 * by the address on a line whose interface name equals the device name.
 *
 * NOTE(review): '*in6' is assigned below without a null check on the same
 * line, which does not obviously match the "(if 'in6' is non-null)" wording
 * above -- confirm. */
1921 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
1923 struct netdev_dev_linux *netdev_dev =
1924 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1925 if (!(netdev_dev->cache_valid & VALID_IN6)) {
1929 netdev_dev->in6 = in6addr_any;
1931 file = fopen("/proc/net/if_inet6", "r");
1933 const char *name = netdev_get_name(netdev_);
1934 while (fgets(line, sizeof line, file)) {
1935 struct in6_addr in6_tmp;
1936 char ifname[16 + 1];
1937 if (parse_if_inet6_line(line, &in6_tmp, ifname)
1938 && !strcmp(name, ifname))
1940 netdev_dev->in6 = in6_tmp;
1946 netdev_dev->cache_valid |= VALID_IN6;
1948 *in6 = netdev_dev->in6;
/* Fills '*sa' with an AF_INET socket address for 'addr': a zeroed
 * struct sockaddr_in is prepared, '*sa' is zeroed, and the sockaddr_in is
 * copied over it. */
1953 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
1955 struct sockaddr_in sin;
1956 memset(&sin, 0, sizeof sin);
1957 sin.sin_family = AF_INET;
1958 sin.sin_addr = addr;
1961 memset(sa, 0, sizeof *sa);
1962 memcpy(sa, &sin, sizeof sin);
/* Issues address-setting ioctl 'ioctl_nr' for 'netdev' with 'addr' as the
 * IPv4 address in the request; 'ioctl_name' is the ioctl's printable name.
 * The request is sent through netdev_linux_do_ioctl(), whose result is
 * returned. */
1966 do_set_addr(struct netdev *netdev,
1967 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
1970 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
1971 make_in4_sockaddr(&ifr.ifr_addr, addr);
1973 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
1977 /* Adds 'router' as a default IP gateway.
 *
 * The route is added with SIOCADDRT on 'af_inet_sock': destination and
 * genmask are both INADDR_ANY, the gateway is 'router', and the flags are
 * RTF_UP | RTF_GATEWAY.  A failure is logged with the errno text.  'netdev'
 * is unused. */
1979 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
1981 struct in_addr any = { INADDR_ANY };
1985 memset(&rt, 0, sizeof rt);
1986 make_in4_sockaddr(&rt.rt_dst, any);
1987 make_in4_sockaddr(&rt.rt_gateway, router);
1988 make_in4_sockaddr(&rt.rt_genmask, any);
1989 rt.rt_flags = RTF_UP | RTF_GATEWAY;
1990 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
1992 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
/* Looks for a route to 'host' in /proc/net/route.
 *
 * '*netdev_name' starts out NULL.  Each line of the file is parsed into 11
 * fields; lines that fail to parse are reported and routes that are not
 * RTF_UP are skipped.  For a route whose masked destination matches 'host',
 * '*next_hop' is set either to 0 (host directly reachable) or to the
 * route's gateway, and '*netdev_name' receives an xstrdup() copy of the
 * interface name (presumably owned by the caller -- TODO confirm). */
1998 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2001 static const char fn[] = "/proc/net/route";
2006 *netdev_name = NULL;
2007 stream = fopen(fn, "r");
2008 if (stream == NULL) {
2009 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2014 while (fgets(line, sizeof line, stream)) {
2017 uint32_t dest, gateway, mask;
2018 int refcnt, metric, mtu;
2019 unsigned int flags, use, window, irtt;
2022 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2024 iface, &dest, &gateway, &flags, &refcnt,
2025 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2027 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2031 if (!(flags & RTF_UP)) {
2032 /* Skip routes that aren't up. */
2036 /* The output of 'dest', 'mask', and 'gateway' were given in
2037 * network byte order, so we don't need any endian
2038 * conversions here. */
2039 if ((dest & mask) == (host->s_addr & mask)) {
2041 /* The host is directly reachable. */
2042 next_hop->s_addr = 0;
2044 /* To reach the host, we must go through a gateway. */
2045 next_hop->s_addr = gateway;
2047 *netdev_name = xstrdup(iface);
/* Queries the driver information of 'netdev' through ethtool
 * ("ETHTOOL_GDRVINFO") and adds copies of the driver name, driver version
 * and firmware version to 'sh' under the keys "driver_name",
 * "driver_version" and "firmware_version". */
2059 netdev_linux_get_status(const struct netdev *netdev, struct shash *sh)
2061 struct ethtool_drvinfo drvinfo;
2064 memset(&drvinfo, 0, sizeof drvinfo);
2065 error = netdev_linux_do_ethtool(netdev_get_name(netdev),
2066 (struct ethtool_cmd *)&drvinfo,
2068 "ETHTOOL_GDRVINFO");
2070 shash_add(sh, "driver_name", xstrdup(drvinfo.driver));
2071 shash_add(sh, "driver_version", xstrdup(drvinfo.version));
2072 shash_add(sh, "firmware_version", xstrdup(drvinfo.fw_version));
2078 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2079 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2080 * returns 0. Otherwise, it returns a positive errno value; in particular,
2081 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'.
 *
 * The lookup is a SIOCGARP ioctl on 'af_inet_sock'.  'ip' is stored into the
 * request's 'sin_addr.s_addr' without conversion.  Failures other than ENXIO
 * are logged. */
2083 netdev_linux_arp_lookup(const struct netdev *netdev,
2084 uint32_t ip, uint8_t mac[ETH_ADDR_LEN])
2087 struct sockaddr_in sin;
2090 memset(&r, 0, sizeof r);
2091 memset(&sin, 0, sizeof sin);
2092 sin.sin_family = AF_INET;
2093 sin.sin_addr.s_addr = ip;
2095 memcpy(&r.arp_pa, &sin, sizeof sin);
2096 r.arp_ha.sa_family = ARPHRD_ETHER;
2098 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2099 COVERAGE_INC(netdev_arp_lookup);
2100 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2102 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2103 } else if (retval != ENXIO) {
2104 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2105 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
/* Translates the NETDEV_* bits in 'nd'; NETDEV_UP and NETDEV_PROMISC are the
 * bits tested.  Judging by the name and by netdev_linux_update_flags(), the
 * result is a set of interface (IFF_*) flags -- TODO confirm. */
2111 nd_to_iff_flags(enum netdev_flags nd)
2114 if (nd & NETDEV_UP) {
2117 if (nd & NETDEV_PROMISC) {
/* Translates the interface flags in 'iff' into "enum netdev_flags" bits,
 * starting from 0; IFF_PROMISC maps to NETDEV_PROMISC. */
2124 iff_to_nd_flags(int iff)
2126 enum netdev_flags nd = 0;
2130 if (iff & IFF_PROMISC) {
2131 nd |= NETDEV_PROMISC;
/* Clears the flags in 'off' and sets the flags in 'on' for 'netdev'.  The
 * current interface flags are read with get_flags() and reported, translated
 * to netdev flags, in '*old_flagsp'.  set_flags() is called only when the
 * computed flags differ from the current ones. */
2137 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2138 enum netdev_flags on, enum netdev_flags *old_flagsp)
2140 int old_flags, new_flags;
2143 error = get_flags(netdev, &old_flags);
2145 *old_flagsp = iff_to_nd_flags(old_flags);
2146 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2147 if (new_flags != old_flags) {
2148 error = set_flags(netdev, new_flags);
/* Walks the netdev_linux_notifier entries linked on 'list' and, for each,
 * takes the address of its embedded 'notifier' member.
 *
 * The address-of expression had been mis-encoded (an HTML-entity style
 * corruption of "&not..."); it is restored here. */
2155 poll_notify(struct list *list)
2157 struct netdev_linux_notifier *notifier;
2158 LIST_FOR_EACH (notifier, node, list) {
2159 struct netdev_notifier *n = &notifier->notifier;
/* rtnetlink link-change callback registered by netdev_linux_poll_add().  One
 * path looks up a single notifier list in 'netdev_linux_notifiers'; another
 * path walks every entry of that table and calls poll_notify() on each
 * list.  'aux' is unused. */
2165 netdev_linux_poll_cb(const struct rtnetlink_link_change *change,
2166 void *aux OVS_UNUSED)
2169 struct list *list = shash_find_data(&netdev_linux_notifiers,
2175 struct shash_node *node;
2176 SHASH_FOR_EACH (node, &netdev_linux_notifiers) {
2177 poll_notify(node->data);
/* Registers 'cb' (with 'aux') to be notified about changes to 'netdev' and
 * stores the new notifier in '*notifierp'.
 *
 * When 'netdev_linux_notifiers' is still empty, the rtnetlink link notifier
 * is registered first with netdev_linux_poll_cb() as its callback.
 * Notifiers are kept in one list per device name; a missing list is
 * allocated and added to the table.  The new notifier is appended to the
 * end of that list.
 *
 * The three address-of expressions on the notifier had been mis-encoded (an
 * HTML-entity style corruption of "&not..."); they are restored here. */
2183 netdev_linux_poll_add(struct netdev *netdev,
2184 void (*cb)(struct netdev_notifier *), void *aux,
2185 struct netdev_notifier **notifierp)
2187 const char *netdev_name = netdev_get_name(netdev);
2188 struct netdev_linux_notifier *notifier;
2191 if (shash_is_empty(&netdev_linux_notifiers)) {
2193 error = rtnetlink_link_notifier_register(&netdev_linux_poll_notifier,
2194 netdev_linux_poll_cb, NULL);
2200 list = shash_find_data(&netdev_linux_notifiers, netdev_name);
2202 list = xmalloc(sizeof *list);
2204 shash_add(&netdev_linux_notifiers, netdev_name, list);
2207 notifier = xmalloc(sizeof *notifier);
2208 netdev_notifier_init(&notifier->notifier, netdev, cb, aux);
2209 list_push_back(list, &notifier->node);
2210 *notifierp = &notifier->notifier;
/* Unregisters 'notifier_', which must have been created by
 * netdev_linux_poll_add().  The notifier is unlinked from its per-device
 * list; a list that becomes empty is removed from 'netdev_linux_notifiers',
 * and once that table is empty the rtnetlink link notifier is unregistered.
 *
 * The address-of expression passed to list_remove() had been mis-encoded (an
 * HTML-entity style corruption of "&not..."); it is restored here. */
2215 netdev_linux_poll_remove(struct netdev_notifier *notifier_)
2217 struct netdev_linux_notifier *notifier =
2218 CONTAINER_OF(notifier_, struct netdev_linux_notifier, notifier);
2221 /* Remove 'notifier' from its list. */
2222 list = list_remove(&notifier->node);
2223 if (list_is_empty(list)) {
2224 /* The list is now empty. Remove it from the hash and free it. */
2225 const char *netdev_name = netdev_get_name(notifier->notifier.netdev);
2226 shash_delete(&netdev_linux_notifiers,
2227 shash_find(&netdev_linux_notifiers, netdev_name));
2232 /* If that was the last notifier, unregister. */
2233 if (shash_is_empty(&netdev_linux_notifiers)) {
2234 rtnetlink_link_notifier_unregister(&netdev_linux_poll_notifier);
/* Expands to the member list shared by the netdev classes defined with this
 * macro.  NAME, CREATE, ENUMERATE and SET_STATS are the per-class parameters;
 * every other entry is one of this file's netdev_linux_* functions or NULL
 * and is therefore identical across the classes. */
2238 #define NETDEV_LINUX_CLASS(NAME, CREATE, ENUMERATE, SET_STATS) \
2242 netdev_linux_init, \
2244 netdev_linux_wait, \
2247 netdev_linux_destroy, \
2248 NULL, /* set_config */ \
2250 netdev_linux_open, \
2251 netdev_linux_close, \
2255 netdev_linux_recv, \
2256 netdev_linux_recv_wait, \
2257 netdev_linux_drain, \
2259 netdev_linux_send, \
2260 netdev_linux_send_wait, \
2262 netdev_linux_set_etheraddr, \
2263 netdev_linux_get_etheraddr, \
2264 netdev_linux_get_mtu, \
2265 netdev_linux_get_ifindex, \
2266 netdev_linux_get_carrier, \
2267 netdev_linux_get_miimon, \
2268 netdev_linux_get_stats, \
2271 netdev_linux_get_features, \
2272 netdev_linux_set_advertisements, \
2273 netdev_linux_get_vlan_vid, \
2275 netdev_linux_set_policing, \
2276 netdev_linux_get_qos_types, \
2277 netdev_linux_get_qos_capabilities, \
2278 netdev_linux_get_qos, \
2279 netdev_linux_set_qos, \
2280 netdev_linux_get_queue, \
2281 netdev_linux_set_queue, \
2282 netdev_linux_delete_queue, \
2283 netdev_linux_get_queue_stats, \
2284 netdev_linux_dump_queues, \
2285 netdev_linux_dump_queue_stats, \
2287 netdev_linux_get_in4, \
2288 netdev_linux_set_in4, \
2289 netdev_linux_get_in6, \
2290 netdev_linux_add_router, \
2291 netdev_linux_get_next_hop, \
2292 netdev_linux_get_status, \
2293 netdev_linux_arp_lookup, \
2295 netdev_linux_update_flags, \
2297 netdev_linux_poll_add, \
2298 netdev_linux_poll_remove \
/* The netdev classes built from NETDEV_LINUX_CLASS.  They differ only in the
 * macro arguments: netdev_linux_class is the only one with an enumerate
 * hook, netdev_tap_class is created with netdev_linux_create_tap(), and
 * netdev_internal_class is the only one with a set_stats hook
 * (netdev_vport_set_stats). */
2301 const struct netdev_class netdev_linux_class =
2304 netdev_linux_create,
2305 netdev_linux_enumerate,
2306 NULL); /* set_stats */
2308 const struct netdev_class netdev_tap_class =
2311 netdev_linux_create_tap,
2312 NULL, /* enumerate */
2313 NULL); /* set_stats */
2315 const struct netdev_class netdev_internal_class =
2318 netdev_linux_create,
2319 NULL, /* enumerate */
2320 netdev_vport_set_stats);
2322 /* HTB traffic control class. */
/* Largest class minor number that htb_parse_tcmsg__() accepts as a queue;
 * minor N maps to queue ID N - 1. */
2324 #define HTB_N_QUEUES 0xf000
2328 unsigned int max_rate; /* In bytes/s. */
2332 struct tc_queue tc_queue;
2333 unsigned int min_rate; /* In bytes/s. */
2334 unsigned int max_rate; /* In bytes/s. */
2335 unsigned int burst; /* In bytes. */
2336 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the struct htb that contains the 'tc' currently attached to
 * 'netdev' (via CONTAINER_OF on the 'tc' member). */
2340 htb_get__(const struct netdev *netdev)
2342 struct netdev_dev_linux *netdev_dev =
2343 netdev_dev_linux_cast(netdev_get_dev(netdev));
2344 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
/* Allocates a struct htb, initializes its 'tc' with tc_ops_htb, records
 * 'max_rate' and attaches the new 'tc' to 'netdev'.
 *
 * NOTE(review): 'max_rate' arrives as uint64_t while the rate members in
 * this file are declared unsigned int -- confirm the narrowing is
 * intended. */
2348 htb_install__(struct netdev *netdev, uint64_t max_rate)
2350 struct netdev_dev_linux *netdev_dev =
2351 netdev_dev_linux_cast(netdev_get_dev(netdev));
2354 htb = xmalloc(sizeof *htb);
2355 tc_init(&htb->tc, &tc_ops_htb);
2356 htb->max_rate = max_rate;
2358 netdev_dev->tc = &htb->tc;
2361 /* Create an HTB qdisc.
2363 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1".
 *
 * Any existing qdisc is deleted first with tc_del_qdisc().  The request is
 * RTM_NEWQDISC with NLM_F_EXCL | NLM_F_CREATE, handle 1:0 and parent
 * TC_H_ROOT; the HTB options (with 'rate2quantum' set to 10) are nested
 * under TCA_OPTIONS as TCA_HTB_INIT.  Returns the result of
 * tc_transact(). */
2365 htb_setup_qdisc__(struct netdev *netdev)
2368 struct tc_htb_glob opt;
2369 struct ofpbuf request;
2370 struct tcmsg *tcmsg;
2372 tc_del_qdisc(netdev);
2374 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2375 NLM_F_EXCL | NLM_F_CREATE, &request);
2379 tcmsg->tcm_handle = tc_make_handle(1, 0);
2380 tcmsg->tcm_parent = TC_H_ROOT;
2382 nl_msg_put_string(&request, TCA_KIND, "htb");
2384 memset(&opt, 0, sizeof opt);
2385 opt.rate2quantum = 10;
2389 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2390 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2391 nl_msg_end_nested(&request, opt_offset);
2393 return tc_transact(&request, NULL);
2396 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2397 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>".
 *
 * 'class' supplies the rates, burst and priority.  The device MTU is needed
 * to fill in the rate tables and buffer sizes; an MTU that reads as INT_MAX
 * is reported with a warning.  The request is RTM_NEWTCLASS with
 * NLM_F_CREATE and carries TCA_HTB_PARMS plus the rate and ceil tables
 * nested under TCA_OPTIONS.  A failed transaction is logged with all class
 * parameters. */
2399 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2400 unsigned int parent, struct htb_class *class)
2403 struct tc_htb_opt opt;
2404 struct ofpbuf request;
2405 struct tcmsg *tcmsg;
2409 netdev_get_mtu(netdev, &mtu);
2410 if (mtu == INT_MAX) {
2411 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2412 netdev_get_name(netdev));
2416 memset(&opt, 0, sizeof opt);
2417 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2418 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2419 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2420 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2421 opt.prio = class->priority;
2423 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2427 tcmsg->tcm_handle = handle;
2428 tcmsg->tcm_parent = parent;
2430 nl_msg_put_string(&request, TCA_KIND, "htb");
2431 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2432 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2433 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2434 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2435 nl_msg_end_nested(&request, opt_offset);
2437 error = tc_transact(&request, NULL);
2439 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2440 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2441 netdev_get_name(netdev),
2442 tc_get_major(handle), tc_get_minor(handle),
2443 tc_get_major(parent), tc_get_minor(parent),
2444 class->min_rate, class->max_rate,
2445 class->burst, class->priority, strerror(error));
/* Implementation note for htb_parse_tca_options__(): the nested attributes
 * are validated against a policy that requires TCA_HTB_PARMS to hold at
 * least a struct tc_htb_opt.  From that structure the function copies the
 * rate into 'min_rate', the ceiling into 'max_rate' and the priority into
 * 'priority', and converts the buffer (in ticks) into 'burst' bytes with
 * tc_ticks_to_bytes().  The result is written to 'class'. */
2450 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2451 * description of them into 'details'. The description complies with the
2452 * specification given in the vswitch database documentation for linux-htb
2455 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2457 static const struct nl_policy tca_htb_policy[] = {
2458 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2459 .min_len = sizeof(struct tc_htb_opt) },
2462 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2463 const struct tc_htb_opt *htb;
2465 if (!nl_parse_nested(nl_options, tca_htb_policy,
2466 attrs, ARRAY_SIZE(tca_htb_policy))) {
2467 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2471 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2472 class->min_rate = htb->rate.rate;
2473 class->max_rate = htb->ceil.rate;
2474 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2475 class->priority = htb->prio;
/* Parses the traffic-control class message 'tcmsg' with tc_parse_class().
 * Each output is optional: when 'queue_id' is non-null, a handle with major
 * 1 and a minor in 1..HTB_N_QUEUES yields queue ID minor - 1; when 'options'
 * is non-null, the nested attributes are decoded with
 * htb_parse_tca_options__().  'stats' is passed through to
 * tc_parse_class(). */
2480 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2481 struct htb_class *options,
2482 struct netdev_queue_stats *stats)
2484 struct nlattr *nl_options;
2485 unsigned int handle;
2488 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2489 if (!error && queue_id) {
2490 unsigned int major = tc_get_major(handle);
2491 unsigned int minor = tc_get_minor(handle);
2492 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2493 *queue_id = minor - 1;
2498 if (!error && options) {
2499 error = htb_parse_tca_options__(nl_options, options);
/* Derives the rates of the HTB root class from 'details' and stores them in
 * 'hc'.
 *
 * The "max-rate" value is parsed as a decimal number and divided by 8
 * (presumably bits/s to bytes/s -- TODO confirm).  When the key is absent or
 * the result is 0, the rate falls back to the link speed implied by the
 * device's current features, also divided by 8.  'min_rate' is set equal to
 * 'max_rate'.
 *
 * The first output argument of netdev_get_features() had been mis-encoded
 * (an HTML-entity style corruption of "&curren..."); it is restored here. */
2505 htb_parse_qdisc_details__(struct netdev *netdev,
2506 const struct shash *details, struct htb_class *hc)
2508 const char *max_rate_s;
2510 max_rate_s = shash_find_data(details, "max-rate");
2511 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2512 if (!hc->max_rate) {
2515 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2516 hc->max_rate = netdev_features_to_bps(current) / 8;
2518 hc->min_rate = hc->max_rate;
/* Fills 'hc' from the "min-rate", "max-rate", "burst" and "priority" keys of
 * 'details'.
 *
 * The two rates and the burst are parsed as decimal numbers and divided by
 * 8.  'min_rate' is raised to at least the MTU and then capped at the
 * qdisc's 'max_rate'; 'max_rate' is raised to at least 'min_rate' and capped
 * the same way; 'burst' is raised to at least MTU + 64.  A missing
 * "min-rate", "burst" or "priority" is treated as 0 before these
 * adjustments.  An MTU that reads as INT_MAX is reported with a warning. */
2524 htb_parse_class_details__(struct netdev *netdev,
2525 const struct shash *details, struct htb_class *hc)
2527 const struct htb *htb = htb_get__(netdev);
2528 const char *min_rate_s = shash_find_data(details, "min-rate");
2529 const char *max_rate_s = shash_find_data(details, "max-rate");
2530 const char *burst_s = shash_find_data(details, "burst");
2531 const char *priority_s = shash_find_data(details, "priority");
2534 netdev_get_mtu(netdev, &mtu);
2535 if (mtu == INT_MAX) {
2536 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2537 netdev_get_name(netdev));
2541 /* HTB requires at least an mtu sized min-rate to send any traffic even
2542 * on uncongested links. */
2543 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2544 hc->min_rate = MAX(hc->min_rate, mtu);
2545 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2548 hc->max_rate = (max_rate_s
2549 ? strtoull(max_rate_s, NULL, 10) / 8
2551 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2552 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2556 * According to hints in the documentation that I've read, it is important
2557 * that 'burst' be at least as big as the largest frame that might be
2558 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2559 * but having it a bit too small is a problem. Since netdev_get_mtu()
2560 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2561 * the MTU. We actually add 64, instead of 14, as a guard against
2562 * additional headers get tacked on somewhere that we're not aware of. */
2563 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2564 hc->burst = MAX(hc->burst, mtu + 64);
2567 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
2573 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2574 unsigned int parent, struct htb_class *options,
2575 struct netdev_queue_stats *stats)
2577 struct ofpbuf *reply;
2580 error = tc_query_class(netdev, handle, parent, &reply);
2582 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2583 ofpbuf_delete(reply);
2589 htb_tc_install(struct netdev *netdev, const struct shash *details)
2593 error = htb_setup_qdisc__(netdev);
2595 struct htb_class hc;
2597 htb_parse_qdisc_details__(netdev, details, &hc);
2598 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2599 tc_make_handle(1, 0), &hc);
2601 htb_install__(netdev, hc.max_rate);
2607 static struct htb_class *
2608 htb_class_cast__(const struct tc_queue *queue)
2610 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2614 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2615 const struct htb_class *hc)
2617 struct htb *htb = htb_get__(netdev);
2618 size_t hash = hash_int(queue_id, 0);
2619 struct tc_queue *queue;
2620 struct htb_class *hcp;
2622 queue = tc_find_queue__(netdev, queue_id, hash);
2624 hcp = htb_class_cast__(queue);
2626 hcp = xmalloc(sizeof *hcp);
2627 queue = &hcp->tc_queue;
2628 queue->queue_id = queue_id;
2629 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2632 hcp->min_rate = hc->min_rate;
2633 hcp->max_rate = hc->max_rate;
2634 hcp->burst = hc->burst;
2635 hcp->priority = hc->priority;
2639 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2642 struct nl_dump dump;
2643 struct htb_class hc;
2645 /* Get qdisc options. */
2647 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2648 htb_install__(netdev, hc.max_rate);
2651 if (!start_queue_dump(netdev, &dump)) {
2654 while (nl_dump_next(&dump, &msg)) {
2655 unsigned int queue_id;
2657 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2658 htb_update_queue__(netdev, queue_id, &hc);
2661 nl_dump_done(&dump);
2667 htb_tc_destroy(struct tc *tc)
2669 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2670 struct htb_class *hc, *next;
2672 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2673 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2681 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2683 const struct htb *htb = htb_get__(netdev);
2684 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
2689 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2691 struct htb_class hc;
2694 htb_parse_qdisc_details__(netdev, details, &hc);
2695 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2696 tc_make_handle(1, 0), &hc);
2698 htb_get__(netdev)->max_rate = hc.max_rate;
2704 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2705 const struct tc_queue *queue, struct shash *details)
2707 const struct htb_class *hc = htb_class_cast__(queue);
2709 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2710 if (hc->min_rate != hc->max_rate) {
2711 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2713 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2715 shash_add(details, "priority", xasprintf("%u", hc->priority));
2721 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2722 const struct shash *details)
2724 struct htb_class hc;
2727 error = htb_parse_class_details__(netdev, details, &hc);
2732 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2733 tc_make_handle(1, 0xfffe), &hc);
2738 htb_update_queue__(netdev, queue_id, &hc);
2743 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2745 struct htb_class *hc = htb_class_cast__(queue);
2746 struct htb *htb = htb_get__(netdev);
2749 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2751 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2758 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2759 struct netdev_queue_stats *stats)
2761 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2762 tc_make_handle(1, 0xfffe), NULL, stats);
2766 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2767 const struct ofpbuf *nlmsg,
2768 netdev_dump_queue_stats_cb *cb, void *aux)
2770 struct netdev_queue_stats stats;
2771 unsigned int handle, major, minor;
2774 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2779 major = tc_get_major(handle);
2780 minor = tc_get_minor(handle);
2781 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2782 (*cb)(minor - 1, &stats, aux);
2787 static const struct tc_ops tc_ops_htb = {
2788 "htb", /* linux_name */
2789 "linux-htb", /* ovs_name */
2790 HTB_N_QUEUES, /* n_queues */
2799 htb_class_get_stats,
2800 htb_class_dump_stats
2803 /* "linux-hfsc" traffic control class. */
2805 #define HFSC_N_QUEUES 0xf000
2813 struct tc_queue tc_queue;
2818 static struct hfsc *
2819 hfsc_get__(const struct netdev *netdev)
2821 struct netdev_dev_linux *netdev_dev;
2822 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2823 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
2826 static struct hfsc_class *
2827 hfsc_class_cast__(const struct tc_queue *queue)
2829 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
2833 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2835 struct netdev_dev_linux * netdev_dev;
2838 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2839 hfsc = xmalloc(sizeof *hfsc);
2840 tc_init(&hfsc->tc, &tc_ops_hfsc);
2841 hfsc->max_rate = max_rate;
2842 netdev_dev->tc = &hfsc->tc;
2846 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2847 const struct hfsc_class *hc)
2851 struct hfsc_class *hcp;
2852 struct tc_queue *queue;
2854 hfsc = hfsc_get__(netdev);
2855 hash = hash_int(queue_id, 0);
2857 queue = tc_find_queue__(netdev, queue_id, hash);
2859 hcp = hfsc_class_cast__(queue);
2861 hcp = xmalloc(sizeof *hcp);
2862 queue = &hcp->tc_queue;
2863 queue->queue_id = queue_id;
2864 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
2867 hcp->min_rate = hc->min_rate;
2868 hcp->max_rate = hc->max_rate;
2872 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2874 const struct tc_service_curve *rsc, *fsc, *usc;
2875 static const struct nl_policy tca_hfsc_policy[] = {
2877 .type = NL_A_UNSPEC,
2879 .min_len = sizeof(struct tc_service_curve),
2882 .type = NL_A_UNSPEC,
2884 .min_len = sizeof(struct tc_service_curve),
2887 .type = NL_A_UNSPEC,
2889 .min_len = sizeof(struct tc_service_curve),
2892 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
2894 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
2895 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
2896 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
2900 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
2901 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
2902 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
2904 if (rsc->m1 != 0 || rsc->d != 0 ||
2905 fsc->m1 != 0 || fsc->d != 0 ||
2906 usc->m1 != 0 || usc->d != 0) {
2907 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2908 "Non-linear service curves are not supported.");
2912 if (rsc->m2 != fsc->m2) {
2913 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2914 "Real-time service curves are not supported ");
2918 if (rsc->m2 > usc->m2) {
2919 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2920 "Min-rate service curve is greater than "
2921 "the max-rate service curve.");
2925 class->min_rate = fsc->m2;
2926 class->max_rate = usc->m2;
2931 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2932 struct hfsc_class *options,
2933 struct netdev_queue_stats *stats)
2936 unsigned int handle;
2937 struct nlattr *nl_options;
2939 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2945 unsigned int major, minor;
2947 major = tc_get_major(handle);
2948 minor = tc_get_minor(handle);
2949 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
2950 *queue_id = minor - 1;
2957 error = hfsc_parse_tca_options__(nl_options, options);
2964 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
2965 unsigned int parent, struct hfsc_class *options,
2966 struct netdev_queue_stats *stats)
2969 struct ofpbuf *reply;
2971 error = tc_query_class(netdev, handle, parent, &reply);
2976 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
2977 ofpbuf_delete(reply);
2982 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
2983 struct hfsc_class *class)
2986 const char *max_rate_s;
2988 max_rate_s = shash_find_data(details, "max-rate");
2989 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2994 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2995 max_rate = netdev_features_to_bps(current) / 8;
2998 class->min_rate = max_rate;
2999 class->max_rate = max_rate;
3003 hfsc_parse_class_details__(struct netdev *netdev,
3004 const struct shash *details,
3005 struct hfsc_class * class)
3007 const struct hfsc *hfsc;
3008 uint32_t min_rate, max_rate;
3009 const char *min_rate_s, *max_rate_s;
3011 hfsc = hfsc_get__(netdev);
3012 min_rate_s = shash_find_data(details, "min-rate");
3013 max_rate_s = shash_find_data(details, "max-rate");
3015 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3016 min_rate = MAX(min_rate, 1);
3017 min_rate = MIN(min_rate, hfsc->max_rate);
3019 max_rate = (max_rate_s
3020 ? strtoull(max_rate_s, NULL, 10) / 8
3022 max_rate = MAX(max_rate, min_rate);
3023 max_rate = MIN(max_rate, hfsc->max_rate);
3025 class->min_rate = min_rate;
3026 class->max_rate = max_rate;
3031 /* Create an HFSC qdisc.
3033 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3035 hfsc_setup_qdisc__(struct netdev * netdev)
3037 struct tcmsg *tcmsg;
3038 struct ofpbuf request;
3039 struct tc_hfsc_qopt opt;
3041 tc_del_qdisc(netdev);
3043 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3044 NLM_F_EXCL | NLM_F_CREATE, &request);
3050 tcmsg->tcm_handle = tc_make_handle(1, 0);
3051 tcmsg->tcm_parent = TC_H_ROOT;
3053 memset(&opt, 0, sizeof opt);
3056 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3057 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3059 return tc_transact(&request, NULL);
3062 /* Create an HFSC class.
3064 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3065 * sc rate <min_rate> ul rate <max_rate>" */
3067 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3068 unsigned int parent, struct hfsc_class *class)
3072 struct tcmsg *tcmsg;
3073 struct ofpbuf request;
3074 struct tc_service_curve min, max;
3076 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3082 tcmsg->tcm_handle = handle;
3083 tcmsg->tcm_parent = parent;
3087 min.m2 = class->min_rate;
3091 max.m2 = class->max_rate;
3093 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3094 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3095 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3096 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3097 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3098 nl_msg_end_nested(&request, opt_offset);
3100 error = tc_transact(&request, NULL);
3102 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3103 "min-rate %ubps, max-rate %ubps (%s)",
3104 netdev_get_name(netdev),
3105 tc_get_major(handle), tc_get_minor(handle),
3106 tc_get_major(parent), tc_get_minor(parent),
3107 class->min_rate, class->max_rate, strerror(error));
3114 hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3117 struct hfsc_class class;
3119 error = hfsc_setup_qdisc__(netdev);
3125 hfsc_parse_qdisc_details__(netdev, details, &class);
3126 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3127 tc_make_handle(1, 0), &class);
3133 hfsc_install__(netdev, class.max_rate);
3138 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3141 struct nl_dump dump;
3142 struct hfsc_class hc;
3145 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3146 hfsc_install__(netdev, hc.max_rate);
3148 if (!start_queue_dump(netdev, &dump)) {
3152 while (nl_dump_next(&dump, &msg)) {
3153 unsigned int queue_id;
3155 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3156 hfsc_update_queue__(netdev, queue_id, &hc);
3160 nl_dump_done(&dump);
3165 hfsc_tc_destroy(struct tc *tc)
3168 struct hfsc_class *hc, *next;
3170 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3172 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3173 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3182 hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3184 const struct hfsc *hfsc;
3185 hfsc = hfsc_get__(netdev);
3186 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
3191 hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3194 struct hfsc_class class;
3196 hfsc_parse_qdisc_details__(netdev, details, &class);
3197 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3198 tc_make_handle(1, 0), &class);
3201 hfsc_get__(netdev)->max_rate = class.max_rate;
3208 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3209 const struct tc_queue *queue, struct shash *details)
3211 const struct hfsc_class *hc;
3213 hc = hfsc_class_cast__(queue);
3214 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3215 if (hc->min_rate != hc->max_rate) {
3216 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
3222 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3223 const struct shash *details)
3226 struct hfsc_class class;
3228 error = hfsc_parse_class_details__(netdev, details, &class);
3233 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3234 tc_make_handle(1, 0xfffe), &class);
3239 hfsc_update_queue__(netdev, queue_id, &class);
3244 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3248 struct hfsc_class *hc;
3250 hc = hfsc_class_cast__(queue);
3251 hfsc = hfsc_get__(netdev);
3253 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3255 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3262 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3263 struct netdev_queue_stats *stats)
3265 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3266 tc_make_handle(1, 0xfffe), NULL, stats);
3270 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3271 const struct ofpbuf *nlmsg,
3272 netdev_dump_queue_stats_cb *cb, void *aux)
3274 struct netdev_queue_stats stats;
3275 unsigned int handle, major, minor;
3278 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3283 major = tc_get_major(handle);
3284 minor = tc_get_minor(handle);
3285 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3286 (*cb)(minor - 1, &stats, aux);
3291 static const struct tc_ops tc_ops_hfsc = {
3292 "hfsc", /* linux_name */
3293 "linux-hfsc", /* ovs_name */
3294 HFSC_N_QUEUES, /* n_queues */
3295 hfsc_tc_install, /* tc_install */
3296 hfsc_tc_load, /* tc_load */
3297 hfsc_tc_destroy, /* tc_destroy */
3298 hfsc_qdisc_get, /* qdisc_get */
3299 hfsc_qdisc_set, /* qdisc_set */
3300 hfsc_class_get, /* class_get */
3301 hfsc_class_set, /* class_set */
3302 hfsc_class_delete, /* class_delete */
3303 hfsc_class_get_stats, /* class_get_stats */
3304 hfsc_class_dump_stats /* class_dump_stats */
3307 /* "linux-default" traffic control class.
3309 * This class represents the default, unnamed Linux qdisc. It corresponds to
3310 * the "" (empty string) QoS type in the OVS database. */
3313 default_install__(struct netdev *netdev)
3315 struct netdev_dev_linux *netdev_dev =
3316 netdev_dev_linux_cast(netdev_get_dev(netdev));
3317 static struct tc *tc;
3320 tc = xmalloc(sizeof *tc);
3321 tc_init(tc, &tc_ops_default);
3323 netdev_dev->tc = tc;
3327 default_tc_install(struct netdev *netdev,
3328 const struct shash *details OVS_UNUSED)
3330 default_install__(netdev);
3335 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3337 default_install__(netdev);
3341 static const struct tc_ops tc_ops_default = {
3342 NULL, /* linux_name */
3347 NULL, /* tc_destroy */
3348 NULL, /* qdisc_get */
3349 NULL, /* qdisc_set */
3350 NULL, /* class_get */
3351 NULL, /* class_set */
3352 NULL, /* class_delete */
3353 NULL, /* class_get_stats */
3354 NULL /* class_dump_stats */
3357 /* "linux-other" traffic control class.
3362 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3364 struct netdev_dev_linux *netdev_dev =
3365 netdev_dev_linux_cast(netdev_get_dev(netdev));
3366 static struct tc *tc;
3369 tc = xmalloc(sizeof *tc);
3370 tc_init(tc, &tc_ops_other);
3372 netdev_dev->tc = tc;
3376 static const struct tc_ops tc_ops_other = {
3377 NULL, /* linux_name */
3378 "linux-other", /* ovs_name */
3380 NULL, /* tc_install */
3382 NULL, /* tc_destroy */
3383 NULL, /* qdisc_get */
3384 NULL, /* qdisc_set */
3385 NULL, /* class_get */
3386 NULL, /* class_set */
3387 NULL, /* class_delete */
3388 NULL, /* class_get_stats */
3389 NULL /* class_dump_stats */
/* Traffic control. */

/* Number of kernel "tc" ticks per second. */
static double ticks_per_s;

/* Number of kernel "jiffies" per second.  This is used for the purpose of
 * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
 * one jiffy's worth of data.
 *
 * There are two possibilities here:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
 *      approximate range of 100 to 1024.  That means that we really need to
 *      make sure that the qdisc can buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
 *      has finely granular timers and there's no need to fudge additional room
 *      for buffers.  (There's no extra effort needed to implement that: the
 *      large 'buffer_hz' is used as a divisor, so practically any number will
 *      come out as 0 in the division.  Small integer results in the case of
 *      really high dividends won't have any real effect anyhow.)
 */
static unsigned int buffer_hz;
/* Returns tc handle 'major':'minor', i.e. 'major' in the upper 16 bits and
 * 'minor' in the lower 16 bits. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    return TC_H_MAKE(major << 16, minor);
}
/* Returns the major number from 'handle', i.e. its upper 16 bits. */
static unsigned int
tc_get_major(unsigned int handle)
{
    return TC_H_MAJ(handle) >> 16;
}
/* Returns the minor number from 'handle', i.e. its lower 16 bits. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    return TC_H_MIN(handle);
}
3437 static struct tcmsg *
3438 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3439 struct ofpbuf *request)
3441 struct tcmsg *tcmsg;
3445 error = get_ifindex(netdev, &ifindex);
3450 ofpbuf_init(request, 512);
3451 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3452 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3453 tcmsg->tcm_family = AF_UNSPEC;
3454 tcmsg->tcm_ifindex = ifindex;
3455 /* Caller should fill in tcmsg->tcm_handle. */
3456 /* Caller should fill in tcmsg->tcm_parent. */
3462 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3464 int error = nl_sock_transact(rtnl_sock, request, replyp);
3465 ofpbuf_uninit(request);
3472 /* The values in psched are not individually very meaningful, but they are
3473 * important. The tables below show some values seen in the wild.
3477 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3478 * (Before that, there are hints that it was 1000000000.)
3480 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3484 * -----------------------------------
3485 * [1] 000c8000 000f4240 000f4240 00000064
3486 * [2] 000003e8 00000400 000f4240 3b9aca00
3487 * [3] 000003e8 00000400 000f4240 3b9aca00
3488 * [4] 000003e8 00000400 000f4240 00000064
3489 * [5] 000003e8 00000040 000f4240 3b9aca00
3490 * [6] 000003e8 00000040 000f4240 000000f9
3492 * a b c d ticks_per_s buffer_hz
3493 * ------- --------- ---------- ------------- ----------- -------------
3494 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3495 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3496 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3497 * [4] 1,000 1,024 1,000,000 100 976,562 100
3498 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3499 * [6] 1,000 64 1,000,000 249 15,625,000 249
3501 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3502 * [2] 2.6.26-1-686-bigmem from Debian lenny
3503 * [3] 2.6.26-2-sparc64 from Debian lenny
3504 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3505 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3506 * [6] 2.6.34 from kernel.org on KVM
3508 static const char fn[] = "/proc/net/psched";
3509 unsigned int a, b, c, d;
3515 stream = fopen(fn, "r");
3517 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3521 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3522 VLOG_WARN("%s: read failed", fn);
3526 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3530 VLOG_WARN("%s: invalid scheduler parameters", fn);
3534 ticks_per_s = (double) a * c / b;
3538 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3541 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3544 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3545 * rate of 'rate' bytes per second. */
3547 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3552 return (rate * ticks) / ticks_per_s;
3555 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3556 * rate of 'rate' bytes per second. */
3558 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3563 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3566 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3567 * a transmission rate of 'rate' bytes per second. */
3569 tc_buffer_per_jiffy(unsigned int rate)
3574 return rate / buffer_hz;
3577 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3578 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3579 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3580 * stores NULL into it if it is absent.
3582 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3585 * Returns 0 if successful, otherwise a positive errno value. */
3587 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3588 struct nlattr **options)
3590 static const struct nl_policy tca_policy[] = {
3591 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3592 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3594 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3596 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3597 tca_policy, ta, ARRAY_SIZE(ta))) {
3598 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3603 *kind = nl_attr_get_string(ta[TCA_KIND]);
3607 *options = ta[TCA_OPTIONS];
3622 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3623 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3624 * into '*options', and its queue statistics into '*stats'. Any of the output
3625 * arguments may be null.
3627 * Returns 0 if successful, otherwise a positive errno value. */
3629 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3630 struct nlattr **options, struct netdev_queue_stats *stats)
3632 static const struct nl_policy tca_policy[] = {
3633 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3634 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3636 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3638 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3639 tca_policy, ta, ARRAY_SIZE(ta))) {
3640 VLOG_WARN_RL(&rl, "failed to parse class message");
3645 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3646 *handlep = tc->tcm_handle;
3650 *options = ta[TCA_OPTIONS];
3654 const struct gnet_stats_queue *gsq;
3655 struct gnet_stats_basic gsb;
3657 static const struct nl_policy stats_policy[] = {
3658 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3659 .min_len = sizeof gsb },
3660 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3661 .min_len = sizeof *gsq },
3663 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3665 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3666 sa, ARRAY_SIZE(sa))) {
3667 VLOG_WARN_RL(&rl, "failed to parse class stats");
3671 /* Alignment issues screw up the length of struct gnet_stats_basic on
3672 * some arch/bitsize combinations. Newer versions of Linux have a
3673 * struct gnet_stats_basic_packed, but we can't depend on that. The
3674 * easiest thing to do is just to make a copy. */
3675 memset(&gsb, 0, sizeof gsb);
3676 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3677 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3678 stats->tx_bytes = gsb.bytes;
3679 stats->tx_packets = gsb.packets;
3681 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3682 stats->tx_errors = gsq->drops;
3692 memset(stats, 0, sizeof *stats);
3697 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3700 tc_query_class(const struct netdev *netdev,
3701 unsigned int handle, unsigned int parent,
3702 struct ofpbuf **replyp)
3704 struct ofpbuf request;
3705 struct tcmsg *tcmsg;
3708 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3712 tcmsg->tcm_handle = handle;
3713 tcmsg->tcm_parent = parent;
3715 error = tc_transact(&request, replyp);
3717 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3718 netdev_get_name(netdev),
3719 tc_get_major(handle), tc_get_minor(handle),
3720 tc_get_major(parent), tc_get_minor(parent),
3726 /* Equivalent to "tc class del dev <name> handle <handle>". */
3728 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3730 struct ofpbuf request;
3731 struct tcmsg *tcmsg;
3734 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3738 tcmsg->tcm_handle = handle;
3739 tcmsg->tcm_parent = 0;
3741 error = tc_transact(&request, NULL);
3743 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3744 netdev_get_name(netdev),
3745 tc_get_major(handle), tc_get_minor(handle),
3751 /* Equivalent to "tc qdisc del dev <name> root". */
3753 tc_del_qdisc(struct netdev *netdev)
3755 struct netdev_dev_linux *netdev_dev =
3756 netdev_dev_linux_cast(netdev_get_dev(netdev));
3757 struct ofpbuf request;
3758 struct tcmsg *tcmsg;
3761 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
3765 tcmsg->tcm_handle = tc_make_handle(1, 0);
3766 tcmsg->tcm_parent = TC_H_ROOT;
3768 error = tc_transact(&request, NULL);
3769 if (error == EINVAL) {
3770 /* EINVAL probably means that the default qdisc was in use, in which
3771 * case we've accomplished our purpose. */
3774 if (!error && netdev_dev->tc) {
3775 if (netdev_dev->tc->ops->tc_destroy) {
3776 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3778 netdev_dev->tc = NULL;
3783 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3784 * kernel to determine what they are. Returns 0 if successful, otherwise a
3785 * positive errno value. */
3787 tc_query_qdisc(const struct netdev *netdev)
3789 struct netdev_dev_linux *netdev_dev =
3790 netdev_dev_linux_cast(netdev_get_dev(netdev));
3791 struct ofpbuf request, *qdisc;
3792 const struct tc_ops *ops;
3793 struct tcmsg *tcmsg;
3797 if (netdev_dev->tc) {
3801 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3802 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3803 * 2.6.35 without that fix backported to it.
3805 * To avoid the OOPS, we must not make a request that would attempt to dump
3806 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3807 * few others. There are a few ways that I can see to do this, but most of
3808 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3809 * technique chosen here is to assume that any non-default qdisc that we
3810 * create will have a class with handle 1:0. The built-in qdiscs only have
3811 * a class with handle 0:0.
3813 * We could check for Linux 2.6.35+ and use a more straightforward method
3815 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
3819 tcmsg->tcm_handle = tc_make_handle(1, 0);
3820 tcmsg->tcm_parent = 0;
3822 /* Figure out what tc class to instantiate. */
3823 error = tc_transact(&request, &qdisc);
3827 error = tc_parse_qdisc(qdisc, &kind, NULL);
3829 ops = &tc_ops_other;
3831 ops = tc_lookup_linux_name(kind);
3833 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3834 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3836 ops = &tc_ops_other;
3839 } else if (error == ENOENT) {
3840 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3841 * other entity that doesn't have a handle 1:0. We will assume
3842 * that it's the system default qdisc. */
3843 ops = &tc_ops_default;
3846 /* Who knows? Maybe the device got deleted. */
3847 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3848 netdev_get_name(netdev), strerror(error));
3849 ops = &tc_ops_other;
3852 /* Instantiate it. */
3853 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
3854 assert((load_error == 0) == (netdev_dev->tc != NULL));
3855 ofpbuf_delete(qdisc);
3857 return error ? error : load_error;
3860 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3861 approximate the time to transmit packets of various lengths. For an MTU of
3862 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3863 represents two possible packet lengths; for a MTU of 513 through 1024, four
3864 possible lengths; and so on.
3866 Returns, for the specified 'mtu', the number of bits that packet lengths
3867 need to be shifted right to fit within such a 256-entry table. */
3869 tc_calc_cell_log(unsigned int mtu)
3874 mtu = ETH_PAYLOAD_MAX;
3876 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
3878 for (cell_log = 0; mtu >= 256; cell_log++) {
3885 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
3888 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
3890 memset(rate, 0, sizeof *rate);
3891 rate->cell_log = tc_calc_cell_log(mtu);
3892 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3893 /* rate->cell_align = 0; */ /* distro headers. */
3894 rate->mpu = ETH_TOTAL_MIN;
3898 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
3899 * attribute of the specified "type".
3901 * See tc_calc_cell_log() above for a description of "rtab"s. */
3903 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
/* Append the attribute to 'msg' and keep a pointer to its TC_RTAB_SIZE-byte
 * payload, which the loop below fills in entry by entry. */
3908 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
/* One iteration per table entry. */
3909 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
/* Packet size used for entry 'i': (i + 1) shifted left by 'cell_log'. */
3910 unsigned packet_size = (i + 1) << rate->cell_log;
/* Clamp the size up to the minimum packet unit 'mpu'. */
3911 if (packet_size < rate->mpu) {
3912 packet_size = rate->mpu;
/* Store tc_bytes_to_ticks() of 'packet_size' bytes at 'rate->rate'. */
3914 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
3918 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
3919 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
3920 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
 * 0 is fine.) */
3923 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
/* Lower bound on the burst: tc_buffer_per_jiffy(Bps) plus one 'mtu'. */
3925 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
/* Take the larger of the requested burst and that lower bound, and convert
 * it with tc_bytes_to_ticks() at rate 'Bps'. */
3926 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
3930 /* Utility functions. */
/* Sends an RTM_GETLINK request for interface 'ifindex' over 'rtnl_sock' and
 * copies the counters found in the reply's IFLA_STATS attribute into
 * 'stats'. */
3933 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
3935 /* Policy for RTNLGRP_LINK messages.
3937 * There are *many* more fields in these messages, but currently we only
3938 * care about these fields. */
3939 static const struct nl_policy rtnlgrp_link_policy[] = {
/* The interface name is mandatory.  The stats attribute is optional, but
 * when present it must be at least sizeof(struct rtnl_link_stats) long. */
3940 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
3941 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
3942 .min_len = sizeof(struct rtnl_link_stats) },
3945 struct ofpbuf request;
3946 struct ofpbuf *reply;
3947 struct ifinfomsg *ifi;
3948 const struct rtnl_link_stats *rtnl_stats;
3949 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
/* Build the request: a netlink header followed by a zeroed ifinfomsg that
 * selects the link by index. */
3952 ofpbuf_init(&request, 0);
3953 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
3954 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
3955 ifi->ifi_family = PF_UNSPEC;
3956 ifi->ifi_index = ifindex;
/* Run the transaction; the request buffer is released immediately
 * afterwards. */
3957 error = nl_sock_transact(rtnl_sock, &request, &reply);
3958 ofpbuf_uninit(&request);
/* Parse the attributes that follow the netlink header and the ifinfomsg.
 * The reply is freed when parsing fails. */
3963 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
3964 rtnlgrp_link_policy,
3965 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
3966 ofpbuf_delete(reply);
/* IFLA_STATS is optional in the policy, so its absence has to be checked
 * explicitly; the reply is freed in that case too. */
3970 if (!attrs[IFLA_STATS]) {
3971 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
3972 ofpbuf_delete(reply);
/* Copy every counter across, field by field. */
3976 rtnl_stats = nl_attr_get(attrs[IFLA_STATS]);
3977 stats->rx_packets = rtnl_stats->rx_packets;
3978 stats->tx_packets = rtnl_stats->tx_packets;
3979 stats->rx_bytes = rtnl_stats->rx_bytes;
3980 stats->tx_bytes = rtnl_stats->tx_bytes;
3981 stats->rx_errors = rtnl_stats->rx_errors;
3982 stats->tx_errors = rtnl_stats->tx_errors;
3983 stats->rx_dropped = rtnl_stats->rx_dropped;
3984 stats->tx_dropped = rtnl_stats->tx_dropped;
3985 stats->multicast = rtnl_stats->multicast;
3986 stats->collisions = rtnl_stats->collisions;
3987 stats->rx_length_errors = rtnl_stats->rx_length_errors;
3988 stats->rx_over_errors = rtnl_stats->rx_over_errors;
3989 stats->rx_crc_errors = rtnl_stats->rx_crc_errors;
3990 stats->rx_frame_errors = rtnl_stats->rx_frame_errors;
3991 stats->rx_fifo_errors = rtnl_stats->rx_fifo_errors;
3992 stats->rx_missed_errors = rtnl_stats->rx_missed_errors;
3993 stats->tx_aborted_errors = rtnl_stats->tx_aborted_errors;
3994 stats->tx_carrier_errors = rtnl_stats->tx_carrier_errors;
3995 stats->tx_fifo_errors = rtnl_stats->tx_fifo_errors;
3996 stats->tx_heartbeat_errors = rtnl_stats->tx_heartbeat_errors;
3997 stats->tx_window_errors = rtnl_stats->tx_window_errors;
/* The counters have been copied out, so the reply can be freed. */
3999 ofpbuf_delete(reply);
/* Reads /proc/net/dev line by line, parses each line's counters into 'stats'
 * with sscanf(), and compares the parsed device name against 'netdev_name'.
 * For the matching device, the counters listed near the end of the loop are
 * set to UINT64_MAX. */
4005 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4007 static const char fn[] = "/proc/net/dev";
4012 stream = fopen(fn, "r");
/* Logged when the file cannot be opened; 'errno' carries the reason. */
4014 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4019 while (fgets(line, sizeof line, stream)) {
/* Shorthand for one uint64_t scanf conversion; "%*u" in the format parses a
 * column and discards it. */
4022 #define X64 "%"SCNu64
4025 X64 X64 X64 X64 X64 X64 X64 "%*u"
4026 X64 X64 X64 X64 X64 X64 X64 "%*u",
4032 &stats->rx_fifo_errors,
4033 &stats->rx_frame_errors,
4039 &stats->tx_fifo_errors,
4041 &stats->tx_carrier_errors) != 15) {
/* Any conversion count other than 15 is reported as a parse error for line
 * 'ln' of the file. */
4042 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4043 } else if (!strcmp(devname, netdev_name)) {
/* This line belongs to the requested device.  NOTE(review): these seven
 * counters are presumably marked UINT64_MAX because /proc/net/dev has no
 * column for them -- confirm. */
4044 stats->rx_length_errors = UINT64_MAX;
4045 stats->rx_over_errors = UINT64_MAX;
4046 stats->rx_crc_errors = UINT64_MAX;
4047 stats->rx_missed_errors = UINT64_MAX;
4048 stats->tx_aborted_errors = UINT64_MAX;
4049 stats->tx_heartbeat_errors = UINT64_MAX;
4050 stats->tx_window_errors = UINT64_MAX;
/* NOTE(review): presumably reached only when no line matched 'netdev_name';
 * the surrounding control flow is not shown here. */
4056 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Fetches the interface flags of 'netdev' with a SIOCGIFFLAGS ioctl issued
 * through netdev_linux_do_ioctl() and stores them in '*flags'. */
4062 get_flags(const struct netdev *netdev, int *flags)
4067 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
/* Hand the flags word from the ifreq back to the caller. */
4069 *flags = ifr.ifr_flags;
/* Sets the interface flags of 'netdev' to 'flags' with a SIOCSIFFLAGS ioctl
 * issued through netdev_linux_do_ioctl(), whose result is returned. */
4074 set_flags(struct netdev *netdev, int flags)
4078 ifr.ifr_flags = flags;
4079 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
/* Looks up the interface index of 'netdev_name' with a SIOCGIFINDEX ioctl on
 * 'af_inet_sock' and returns it.  A failed ioctl is logged with a
 * rate-limited warning.  NOTE(review): the value returned on failure is not
 * shown here -- confirm against the caller's expectations. */
4084 do_get_ifindex(const char *netdev_name)
/* The ioctl identifies the device by the name placed in the ifreq. */
4088 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4089 COVERAGE_INC(netdev_get_ifindex);
4090 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4091 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4092 netdev_name, strerror(errno));
4095 return ifr.ifr_ifindex;
/* Stores the interface index of 'netdev_' in '*ifindexp', using the value
 * cached in the underlying netdev_dev_linux when its VALID_IFINDEX bit is
 * set and querying it with do_get_ifindex() otherwise. */
4099 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4101 struct netdev_dev_linux *netdev_dev =
4102 netdev_dev_linux_cast(netdev_get_dev(netdev_));
/* Only ask the kernel when no index has been cached yet. */
4104 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4105 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
/* Remember the index and mark the cache entry as valid. */
4109 netdev_dev->cache_valid |= VALID_IFINDEX;
4110 netdev_dev->ifindex = ifindex;
4112 *ifindexp = netdev_dev->ifindex;
/* Reads the hardware address of 'netdev_name' with a SIOCGIFHWADDR ioctl on
 * 'af_inet_sock' and copies its first ETH_ADDR_LEN bytes into 'ea'.  A failed
 * ioctl is logged as an error; an unexpected address family is logged as a
 * warning. */
4117 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
/* Zero the ifreq, then name the device the ioctl should act on. */
4122 memset(&ifr, 0, sizeof ifr);
4123 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4124 COVERAGE_INC(netdev_get_hwaddr);
4125 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4126 VLOG_ERR("ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4127 netdev_name, strerror(errno));
/* Only AF_UNSPEC and ARPHRD_ETHER pass without a warning. */
4130 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4131 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4132 VLOG_WARN("%s device has unknown hardware address family %d",
4133 netdev_name, hwaddr_family);
4135 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware address of 'netdev_name' to 'mac', tagged with address
 * family 'hwaddr_family', using a SIOCSIFHWADDR ioctl on 'af_inet_sock'.  A
 * failed ioctl is logged as an error. */
4140 set_etheraddr(const char *netdev_name, int hwaddr_family,
4141 const uint8_t mac[ETH_ADDR_LEN])
/* Zero the ifreq, then fill in the device name, address family and the
 * ETH_ADDR_LEN address bytes. */
4145 memset(&ifr, 0, sizeof ifr);
4146 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4147 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4148 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4149 COVERAGE_INC(netdev_set_hwaddr);
4150 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4151 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4152 netdev_name, strerror(errno));
/* Issues a SIOCETHTOOL ioctl on 'af_inet_sock' for device 'name', passing
 * 'ecmd' as the request data.  'cmd_name' is used only in the log message.
 * NOTE(review): how 'cmd' is stored into 'ecmd' is not shown here --
 * confirm. */
4159 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4160 int cmd, const char *cmd_name)
/* Zero the ifreq, name the device, and point its data field at 'ecmd'. */
4164 memset(&ifr, 0, sizeof ifr);
4165 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4166 ifr.ifr_data = (caddr_t) ecmd;
4169 COVERAGE_INC(netdev_ethtool);
4170 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
/* Failures other than EOPNOTSUPP are logged with a rate-limited warning. */
4173 if (errno != EOPNOTSUPP) {
4174 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4175 "failed: %s", cmd_name, name, strerror(errno));
4177 /* The device doesn't support this operation. That's pretty
4178 * common, so there's no point in logging anything. */
/* Copies 'name' into 'ifr->ifr_name' and issues ioctl 'cmd' on 'af_inet_sock'
 * with 'ifr' as its argument.  A failure is logged at debug level, with
 * 'cmd_name' identifying the ioctl in the message. */
4185 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4186 const char *cmd_name)
/* The caller prepares every other field of 'ifr'; only the device name is
 * filled in here. */
4188 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4189 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4190 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
/* Runs ioctl 'cmd' (named 'cmd_name' for logging) on 'netdev' through
 * netdev_linux_do_ioctl() with an AF_INET address request, and copies the
 * IPv4 address found in the returned ifreq into '*ip'. */
4198 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4199 int cmd, const char *cmd_name)
/* Ask for an IPv4 address. */
4204 ifr.ifr_addr.sa_family = AF_INET;
4205 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
/* Reinterpret the generic socket address as an IPv4 one to extract the
 * address. */
4207 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4208 *ip = sin->sin_addr;