2 * Copyright (c) 2009, 2010, 2011 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
24 #include <arpa/inet.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_sched.h>
33 #include <linux/rtnetlink.h>
34 #include <linux/sockios.h>
35 #include <linux/version.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <netpacket/packet.h>
40 #include <net/ethernet.h>
42 #include <linux/if_tunnel.h>
43 #include <net/if_arp.h>
44 #include <net/if_packet.h>
45 #include <net/route.h>
46 #include <netinet/in.h>
53 #include "dpif-linux.h"
54 #include "dynamic-string.h"
55 #include "fatal-signal.h"
58 #include "netdev-provider.h"
59 #include "netdev-vport.h"
61 #include "netlink-socket.h"
63 #include "openflow/openflow.h"
65 #include "poll-loop.h"
66 #include "rtnetlink.h"
67 #include "rtnetlink-link.h"
68 #include "socket-util.h"
74 VLOG_DEFINE_THIS_MODULE(netdev_linux);
76 COVERAGE_DEFINE(netdev_get_vlan_vid);
77 COVERAGE_DEFINE(netdev_set_policing);
78 COVERAGE_DEFINE(netdev_arp_lookup);
79 COVERAGE_DEFINE(netdev_get_ifindex);
80 COVERAGE_DEFINE(netdev_get_hwaddr);
81 COVERAGE_DEFINE(netdev_set_hwaddr);
82 COVERAGE_DEFINE(netdev_ethtool);
84 /* These were introduced in Linux 2.6.14, so they might be missing if we have
86 #ifndef ADVERTISED_Pause
87 #define ADVERTISED_Pause (1 << 13)
89 #ifndef ADVERTISED_Asym_Pause
90 #define ADVERTISED_Asym_Pause (1 << 14)
93 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
96 #define TC_RTAB_SIZE 1024
99 static struct rtnetlink_notifier netdev_linux_cache_notifier;
100 static int cache_notifier_refcount;
103 VALID_IFINDEX = 1 << 0,
104 VALID_ETHERADDR = 1 << 1,
108 VALID_CARRIER = 1 << 5,
109 VALID_IS_PSEUDO = 1 << 6, /* Represents is_internal and is_tap. */
110 VALID_POLICING = 1 << 7,
111 VALID_HAVE_VPORT_STATS = 1 << 8
119 /* Traffic control. */
121 /* An instance of a traffic control class. Always associated with a particular
124 * Each TC implementation subclasses this with whatever additional data it
127 const struct tc_ops *ops;
128 struct hmap queues; /* Contains "struct tc_queue"s.
129 * Read by generic TC layer.
130 * Written only by TC implementation. */
133 /* One traffic control queue.
135 * Each TC implementation subclasses this with whatever additional data it
138 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
139 unsigned int queue_id; /* OpenFlow queue ID. */
142 /* A particular kind of traffic control. Each implementation generally maps to
143 * one particular Linux qdisc class.
145 * The functions below return 0 if successful or a positive errno value on
146 * failure, except where otherwise noted. All of them must be provided, except
147 * where otherwise noted. */
149 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
150 * This is null for tc_ops_default and tc_ops_other, for which there are no
151 * appropriate values. */
152 const char *linux_name;
154 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
155 const char *ovs_name;
157 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
158 * queues. The queues are numbered 0 through n_queues - 1. */
159 unsigned int n_queues;
161 /* Called to install this TC class on 'netdev'. The implementation should
162 * make the Netlink calls required to set up 'netdev' with the right qdisc
163 * and configure it according to 'details'. The implementation may assume
164 * that the current qdisc is the default; that is, there is no need for it
165 * to delete the current qdisc before installing itself.
167 * The contents of 'details' should be documented as valid for 'ovs_name'
168 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
169 * (which is built as ovs-vswitchd.conf.db(8)).
171 * This function must return 0 if and only if it sets 'netdev->tc' to an
172 * initialized 'struct tc'.
174 * (This function is null for tc_ops_other, which cannot be installed. For
175 * other TC classes it should always be nonnull.) */
176 int (*tc_install)(struct netdev *netdev, const struct shash *details);
178 /* Called when the netdev code determines (through a Netlink query) that
179 * this TC class's qdisc is installed on 'netdev', but we didn't install
180 * it ourselves and so don't know any of the details.
182 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
183 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
184 * implementation should parse the other attributes of 'nlmsg' as
185 * necessary to determine its configuration. If necessary it should also
186 * use Netlink queries to determine the configuration of queues on
189 * This function must return 0 if and only if it sets 'netdev->tc' to an
190 * initialized 'struct tc'. */
191 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
193 /* Destroys the data structures allocated by the implementation as part of
194 * 'tc'. (This includes destroying 'tc->queues' by calling
197 * The implementation should not need to perform any Netlink calls. If
198 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
199 * (But it may not be desirable.)
201 * This function may be null if 'tc' is trivial. */
202 void (*tc_destroy)(struct tc *tc);
204 /* Retrieves details of 'netdev->tc' configuration into 'details'.
206 * The implementation should not need to perform any Netlink calls, because
207 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
208 * cached the configuration.
210 * The contents of 'details' should be documented as valid for 'ovs_name'
211 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
212 * (which is built as ovs-vswitchd.conf.db(8)).
214 * This function may be null if 'tc' is not configurable.
216 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
218 /* Reconfigures 'netdev->tc' according to 'details', performing any
219 * required Netlink calls to complete the reconfiguration.
221 * The contents of 'details' should be documented as valid for 'ovs_name'
222 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
223 * (which is built as ovs-vswitchd.conf.db(8)).
225 * This function may be null if 'tc' is not configurable.
227 int (*qdisc_set)(struct netdev *, const struct shash *details);
229 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
230 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
232 * The contents of 'details' should be documented as valid for 'ovs_name'
233 * in the "other_config" column in the "Queue" table in
234 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
236 * The implementation should not need to perform any Netlink calls, because
237 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
238 * cached the queue configuration.
240 * This function may be null if 'tc' does not have queues ('n_queues' is
242 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
243 struct shash *details);
245 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
246 * 'details', performing any required Netlink calls to complete the
247 * reconfiguration. The caller ensures that 'queue_id' is less than
250 * The contents of 'details' should be documented as valid for 'ovs_name'
251 * in the "other_config" column in the "Queue" table in
252 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
254 * This function may be null if 'tc' does not have queues or its queues are
255 * not configurable. */
256 int (*class_set)(struct netdev *, unsigned int queue_id,
257 const struct shash *details);
259 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
260 * tc_queue's within 'netdev->tc->queues'.
262 * This function may be null if 'tc' does not have queues or its queues
263 * cannot be deleted. */
264 int (*class_delete)(struct netdev *, struct tc_queue *queue);
266 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
267 * 'struct tc_queue's within 'netdev->tc->queues'.
269 * On success, initializes '*stats'.
271 * This function may be null if 'tc' does not have queues or if it cannot
272 * report queue statistics. */
273 int (*class_get_stats)(const struct netdev *netdev,
274 const struct tc_queue *queue,
275 struct netdev_queue_stats *stats);
277 /* Extracts queue stats from 'nlmsg', which is a response to a
278 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
280 * This function may be null if 'tc' does not have queues or if it cannot
281 * report queue statistics. */
282 int (*class_dump_stats)(const struct netdev *netdev,
283 const struct ofpbuf *nlmsg,
284 netdev_dump_queue_stats_cb *cb, void *aux);
288 tc_init(struct tc *tc, const struct tc_ops *ops)
291 hmap_init(&tc->queues);
295 tc_destroy(struct tc *tc)
297 hmap_destroy(&tc->queues);
300 static const struct tc_ops tc_ops_htb;
301 static const struct tc_ops tc_ops_hfsc;
302 static const struct tc_ops tc_ops_default;
303 static const struct tc_ops tc_ops_other;
305 static const struct tc_ops *tcs[] = {
306 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
307 &tc_ops_hfsc, /* Hierarchical fair service curve. */
308 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
309 &tc_ops_other, /* Some other qdisc. */
313 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
314 static unsigned int tc_get_major(unsigned int handle);
315 static unsigned int tc_get_minor(unsigned int handle);
317 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
318 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
319 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
321 static struct tcmsg *tc_make_request(const struct netdev *, int type,
322 unsigned int flags, struct ofpbuf *);
323 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
325 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
326 struct nlattr **options);
327 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
328 struct nlattr **options,
329 struct netdev_queue_stats *);
330 static int tc_query_class(const struct netdev *,
331 unsigned int handle, unsigned int parent,
332 struct ofpbuf **replyp);
333 static int tc_delete_class(const struct netdev *, unsigned int handle);
335 static int tc_del_qdisc(struct netdev *netdev);
336 static int tc_query_qdisc(const struct netdev *netdev);
338 static int tc_calc_cell_log(unsigned int mtu);
339 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
340 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
341 const struct tc_ratespec *rate);
342 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
344 struct netdev_dev_linux {
345 struct netdev_dev netdev_dev;
347 struct shash_node *shash_node;
348 unsigned int cache_valid;
350 bool miimon; /* Link status of last poll. */
351 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
352 struct timer miimon_timer;
354 /* The following are figured out "on demand" only. They are only valid
355 * when the corresponding VALID_* bit in 'cache_valid' is set. */
357 uint8_t etheraddr[ETH_ADDR_LEN];
358 struct in_addr address, netmask;
362 bool is_internal; /* Is this an openvswitch internal device? */
363 bool is_tap; /* Is this a tuntap device? */
364 uint32_t kbits_rate; /* Policing data. */
365 uint32_t kbits_burst;
366 bool have_vport_stats;
370 struct tap_state tap;
374 struct netdev_linux {
375 struct netdev netdev;
379 /* Sockets used for ioctl operations. */
380 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
382 /* A Netlink routing socket that is not subscribed to any multicast groups. */
383 static struct nl_sock *rtnl_sock;
385 struct netdev_linux_notifier {
386 struct netdev_notifier notifier;
390 static struct shash netdev_linux_notifiers =
391 SHASH_INITIALIZER(&netdev_linux_notifiers);
392 static struct rtnetlink_notifier netdev_linux_poll_notifier;
394 /* This is set pretty low because we probably won't learn anything from the
395 * additional log messages. */
396 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
398 static int netdev_linux_init(void);
400 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
401 int cmd, const char *cmd_name);
402 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
403 const char *cmd_name);
404 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
405 int cmd, const char *cmd_name);
406 static int get_flags(const struct netdev *, int *flagsp);
407 static int set_flags(struct netdev *, int flags);
408 static int do_get_ifindex(const char *netdev_name);
409 static int get_ifindex(const struct netdev *, int *ifindexp);
410 static int do_set_addr(struct netdev *netdev,
411 int ioctl_nr, const char *ioctl_name,
412 struct in_addr addr);
413 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
414 static int set_etheraddr(const char *netdev_name, int hwaddr_family,
415 const uint8_t[ETH_ADDR_LEN]);
416 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
417 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
418 static int af_packet_sock(void);
419 static void poll_notify(struct list *);
420 static void netdev_linux_miimon_run(void);
421 static void netdev_linux_miimon_wait(void);
424 is_netdev_linux_class(const struct netdev_class *netdev_class)
426 return netdev_class->init == netdev_linux_init;
429 static struct netdev_dev_linux *
430 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
432 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
433 assert(is_netdev_linux_class(netdev_class));
435 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
438 static struct netdev_linux *
439 netdev_linux_cast(const struct netdev *netdev)
441 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
442 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
443 assert(is_netdev_linux_class(netdev_class));
445 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
449 netdev_linux_init(void)
451 static int status = -1;
453 /* Create AF_INET socket. */
454 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
455 status = af_inet_sock >= 0 ? 0 : errno;
457 VLOG_ERR("failed to create inet socket: %s", strerror(status));
460 /* Create rtnetlink socket. */
462 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
464 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
473 netdev_linux_run(void)
475 rtnetlink_link_notifier_run();
476 netdev_linux_miimon_run();
480 netdev_linux_wait(void)
482 rtnetlink_link_notifier_wait();
483 netdev_linux_miimon_wait();
487 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
488 void *aux OVS_UNUSED)
490 struct netdev_dev_linux *dev;
492 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
494 const struct netdev_class *netdev_class =
495 netdev_dev_get_class(base_dev);
497 if (is_netdev_linux_class(netdev_class)) {
498 dev = netdev_dev_linux_cast(base_dev);
499 dev->cache_valid = 0;
503 struct shash device_shash;
504 struct shash_node *node;
506 shash_init(&device_shash);
507 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
508 SHASH_FOR_EACH (node, &device_shash) {
510 dev->cache_valid = 0;
512 shash_destroy(&device_shash);
516 /* Creates system and internal devices. */
518 netdev_linux_create(const struct netdev_class *class,
519 const char *name, const struct shash *args,
520 struct netdev_dev **netdev_devp)
522 struct netdev_dev_linux *netdev_dev;
525 if (!shash_is_empty(args)) {
526 VLOG_WARN("%s: arguments for %s devices should be empty",
530 if (!cache_notifier_refcount) {
531 error = rtnetlink_link_notifier_register(&netdev_linux_cache_notifier,
532 netdev_linux_cache_cb, NULL);
537 cache_notifier_refcount++;
539 netdev_dev = xzalloc(sizeof *netdev_dev);
540 netdev_dev_init(&netdev_dev->netdev_dev, name, args, class);
542 *netdev_devp = &netdev_dev->netdev_dev;
546 /* For most types of netdevs we open the device for each call of
547 * netdev_open(). However, this is not the case with tap devices,
548 * since it is only possible to open the device once. In this
549 * situation we share a single file descriptor, and consequently
550 * buffers, across all readers. Therefore once data is read it will
551 * be unavailable to other reads for tap devices. */
553 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
554 const char *name, const struct shash *args,
555 struct netdev_dev **netdev_devp)
557 struct netdev_dev_linux *netdev_dev;
558 struct tap_state *state;
559 static const char tap_dev[] = "/dev/net/tun";
563 if (!shash_is_empty(args)) {
564 VLOG_WARN("%s: arguments for TAP devices should be empty", name);
567 netdev_dev = xzalloc(sizeof *netdev_dev);
568 state = &netdev_dev->state.tap;
570 /* Open tap device. */
571 state->fd = open(tap_dev, O_RDWR);
574 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
578 /* Create tap device. */
579 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
580 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
581 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
582 VLOG_WARN("%s: creating tap device failed: %s", name,
588 /* Make non-blocking. */
589 error = set_nonblocking(state->fd);
594 netdev_dev_init(&netdev_dev->netdev_dev, name, args, &netdev_tap_class);
595 *netdev_devp = &netdev_dev->netdev_dev;
604 destroy_tap(struct netdev_dev_linux *netdev_dev)
606 struct tap_state *state = &netdev_dev->state.tap;
608 if (state->fd >= 0) {
613 /* Destroys the netdev device 'netdev_dev_'. */
615 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
617 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
618 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
620 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
621 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
624 if (class == &netdev_linux_class || class == &netdev_internal_class) {
625 cache_notifier_refcount--;
627 if (!cache_notifier_refcount) {
628 rtnetlink_link_notifier_unregister(&netdev_linux_cache_notifier);
630 } else if (class == &netdev_tap_class) {
631 destroy_tap(netdev_dev);
640 netdev_linux_open(struct netdev_dev *netdev_dev_, int ethertype,
641 struct netdev **netdevp)
643 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
644 struct netdev_linux *netdev;
645 enum netdev_flags flags;
648 /* Allocate network device. */
649 netdev = xzalloc(sizeof *netdev);
651 netdev_init(&netdev->netdev, netdev_dev_);
653 /* Verify that the device really exists, by attempting to read its flags.
654 * (The flags might be cached, in which case this won't actually do an
657 * Don't do this for "internal" netdevs, though, because those have to be
658 * created as netdev objects before they exist in the kernel, because
659 * creating them in the kernel happens by passing a netdev object to
660 * dpif_port_add(). */
661 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
662 error = netdev_get_flags(&netdev->netdev, &flags);
663 if (error == ENODEV) {
668 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
669 !netdev_dev->state.tap.opened) {
671 /* We assume that the first user of the tap device is the primary user
672 * and give them the tap FD. Subsequent users probably just expect
673 * this to be a system device so open it normally to avoid send/receive
674 * directions appearing to be reversed. */
675 netdev->fd = netdev_dev->state.tap.fd;
676 netdev_dev->state.tap.opened = true;
677 } else if (ethertype != NETDEV_ETH_TYPE_NONE) {
678 struct sockaddr_ll sll;
682 /* Create file descriptor. */
683 protocol = (ethertype == NETDEV_ETH_TYPE_ANY ? ETH_P_ALL
684 : ethertype == NETDEV_ETH_TYPE_802_2 ? ETH_P_802_2
686 netdev->fd = socket(PF_PACKET, SOCK_RAW,
687 (OVS_FORCE int) htons(protocol));
688 if (netdev->fd < 0) {
693 /* Set non-blocking mode. */
694 error = set_nonblocking(netdev->fd);
699 /* Get ethernet device index. */
700 error = get_ifindex(&netdev->netdev, &ifindex);
705 /* Bind to specific ethernet device. */
706 memset(&sll, 0, sizeof sll);
707 sll.sll_family = AF_PACKET;
708 sll.sll_ifindex = ifindex;
710 (struct sockaddr *) &sll, sizeof sll) < 0) {
712 VLOG_ERR("bind to %s failed: %s", netdev_dev_get_name(netdev_dev_),
717 /* Between the socket() and bind() calls above, the socket receives all
718 * packets of the requested type on all system interfaces. We do not
719 * want to receive that data, but there is no way to avoid it. So we
720 * must now drain out the receive queue. */
721 error = drain_rcvbuf(netdev->fd);
727 *netdevp = &netdev->netdev;
731 netdev_uninit(&netdev->netdev, true);
735 /* Closes and destroys 'netdev'. */
737 netdev_linux_close(struct netdev *netdev_)
739 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* Descriptor 0 is a valid file descriptor; the other checks in this file
 * treat only negative values as "no descriptor", so test with >= 0 here too.
 * Devices of type "tap" do not take this branch.
 * NOTE(review): the body of this branch is not visible in this extract;
 * presumably it closes the descriptor -- confirm. */
741 if (netdev->fd >= 0 && strcmp(netdev_get_type(netdev_), "tap")) {
747 /* Initializes 'sset' with a list of the names of all known network devices. */
749 netdev_linux_enumerate(struct sset *sset)
751 struct if_nameindex *names;
753 names = if_nameindex();
757 for (i = 0; names[i].if_name != NULL; i++) {
758 sset_add(sset, names[i].if_name);
760 if_freenameindex(names);
763 VLOG_WARN("could not obtain list of network device names: %s",
/* Reads from 'netdev_''s file descriptor into 'data', at most 'size' bytes,
 * using read().  A negative descriptor means the device was opened with
 * NETDEV_ETH_TYPE_NONE.  A read() failure with EINTR is not treated as an
 * error by the visible code; failures other than EINTR and EAGAIN are logged
 * through the rate-limited logger 'rl'. */
770 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
772 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
774 if (netdev->fd < 0) {
775 /* Device was opened with NETDEV_ETH_TYPE_NONE. */
780 ssize_t retval = read(netdev->fd, data, size);
783 } else if (errno != EINTR) {
784 if (errno != EAGAIN) {
/* Argument order must follow the format string: device name first, then
 * the error text. */
785 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
786 netdev_get_name(netdev_), strerror(errno));
793 /* Registers with the poll loop to wake up from the next call to poll_block()
794 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
796 netdev_linux_recv_wait(struct netdev *netdev_)
798 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
799 if (netdev->fd >= 0) {
800 poll_fd_wait(netdev->fd, POLLIN);
804 /* Discards all packets waiting to be received from 'netdev'. */
806 netdev_linux_drain(struct netdev *netdev_)
808 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
809 if (netdev->fd < 0) {
811 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
813 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
814 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
818 drain_fd(netdev->fd, ifr.ifr_qlen);
821 return drain_rcvbuf(netdev->fd);
825 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
826 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
827 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
828 * the packet is too big or too small to transmit on the device.
830 * The caller retains ownership of 'buffer' in all cases.
832 * The kernel maintains a packet transmission queue, so the caller is not
833 * expected to do additional queuing of packets. */
835 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
837 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
841 if (netdev->fd < 0) {
842 /* Use our AF_PACKET socket to send to this device. */
843 struct sockaddr_ll sll;
850 sock = af_packet_sock();
855 error = get_ifindex(netdev_, &ifindex);
860 /* We don't bother setting most fields in sockaddr_ll because the
861 * kernel ignores them for SOCK_RAW. */
862 memset(&sll, 0, sizeof sll);
863 sll.sll_family = AF_PACKET;
864 sll.sll_ifindex = ifindex;
866 iov.iov_base = (void *) data;
870 msg.msg_namelen = sizeof sll;
873 msg.msg_control = NULL;
874 msg.msg_controllen = 0;
877 retval = sendmsg(sock, &msg, 0);
879 /* Use the netdev's own fd to send to this device. This is
880 * essential for tap devices, because packets sent to a tap device
881 * with an AF_PACKET socket will loop back to be *received* again
882 * on the tap device. */
883 retval = write(netdev->fd, data, size);
887 /* The Linux AF_PACKET implementation never blocks waiting for room
888 * for packets, instead returning ENOBUFS. Translate this into
889 * EAGAIN for the caller. */
890 if (errno == ENOBUFS) {
892 } else if (errno == EINTR) {
894 } else if (errno != EAGAIN) {
895 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
896 netdev_get_name(netdev_), strerror(errno));
899 } else if (retval != size) {
900 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
901 "%zu) on %s", retval, size, netdev_get_name(netdev_));
909 /* Registers with the poll loop to wake up from the next call to poll_block()
910 * when the packet transmission queue has sufficient room to transmit a packet
911 * with netdev_send().
913 * The kernel maintains a packet transmission queue, so the client is not
914 * expected to do additional queuing of packets. Thus, this function is
915 * unlikely to ever be used. It is included for completeness. */
917 netdev_linux_send_wait(struct netdev *netdev_)
919 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
920 if (netdev->fd < 0) {
922 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
923 poll_fd_wait(netdev->fd, POLLOUT);
925 /* TAP device always accepts packets. */
926 poll_immediate_wake();
930 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
931 * otherwise a positive errno value. */
933 netdev_linux_set_etheraddr(struct netdev *netdev_,
934 const uint8_t mac[ETH_ADDR_LEN])
936 struct netdev_dev_linux *netdev_dev =
937 netdev_dev_linux_cast(netdev_get_dev(netdev_));
940 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
941 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
942 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
944 netdev_dev->cache_valid |= VALID_ETHERADDR;
945 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
953 /* Copies 'netdev''s MAC address into 'mac', refreshing the cached address
954 * first if the cached copy is not currently valid. */
956 netdev_linux_get_etheraddr(const struct netdev *netdev_,
957 uint8_t mac[ETH_ADDR_LEN])
959 struct netdev_dev_linux *netdev_dev =
960 netdev_dev_linux_cast(netdev_get_dev(netdev_));
961 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
962 int error = get_etheraddr(netdev_get_name(netdev_),
963 netdev_dev->etheraddr);
967 netdev_dev->cache_valid |= VALID_ETHERADDR;
969 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
973 /* Stores in '*mtup' the maximum size of transmitted (and received) packets on
974 * 'netdev', in bytes, not including the hardware header; thus, this is
975 * typically 1500 bytes for Ethernet devices. */
977 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
979 struct netdev_dev_linux *netdev_dev =
980 netdev_dev_linux_cast(netdev_get_dev(netdev_));
981 if (!(netdev_dev->cache_valid & VALID_MTU)) {
985 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
986 SIOCGIFMTU, "SIOCGIFMTU");
990 netdev_dev->mtu = ifr.ifr_mtu;
991 netdev_dev->cache_valid |= VALID_MTU;
993 *mtup = netdev_dev->mtu;
997 /* Returns the ifindex of 'netdev', if successful, as a positive number.
998 * On failure, returns a negative errno value. */
1000 netdev_linux_get_ifindex(const struct netdev *netdev)
1004 error = get_ifindex(netdev, &ifindex);
1005 return error ? -error : ifindex;
1009 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1011 struct netdev_dev_linux *netdev_dev =
1012 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1017 if (netdev_dev->miimon_interval > 0) {
1018 *carrier = netdev_dev->miimon;
1022 if (!(netdev_dev->cache_valid & VALID_CARRIER)) {
1026 fn = xasprintf("/sys/class/net/%s/carrier",
1027 netdev_get_name(netdev_));
1028 fd = open(fn, O_RDONLY);
1031 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
1035 retval = read(fd, line, sizeof line);
1038 if (error == EINVAL) {
1039 /* This is the normal return value when we try to check carrier
1040 * if the network device is not up. */
1042 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
1045 } else if (retval == 0) {
1047 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
1051 if (line[0] != '0' && line[0] != '1') {
1053 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)",
1057 netdev_dev->carrier = line[0] != '0';
1058 netdev_dev->cache_valid |= VALID_CARRIER;
1060 *carrier = netdev_dev->carrier;
1072 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1073 struct mii_ioctl_data *data)
1078 memset(&ifr, 0, sizeof ifr);
1079 memcpy(&ifr.ifr_data, data, sizeof *data);
1080 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1081 memcpy(data, &ifr.ifr_data, sizeof *data);
1087 netdev_linux_get_miimon(const char *name, bool *miimon)
1089 struct mii_ioctl_data data;
1094 memset(&data, 0, sizeof data);
1095 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1097 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1098 data.reg_num = MII_BMSR;
1099 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1103 *miimon = !!(data.val_out & BMSR_LSTATUS);
1105 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1108 struct ethtool_cmd ecmd;
1110 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1113 memset(&ecmd, 0, sizeof ecmd);
1114 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1117 struct ethtool_value eval;
1119 memcpy(&eval, &ecmd, sizeof eval);
1120 *miimon = !!eval.data;
1122 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1130 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1131 long long int interval)
1133 struct netdev_dev_linux *netdev_dev;
1135 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
1137 interval = interval > 0 ? MAX(interval, 100) : 0;
1138 if (netdev_dev->miimon_interval != interval) {
1139 netdev_dev->miimon_interval = interval;
1140 timer_set_expired(&netdev_dev->miimon_timer);
1147 netdev_linux_miimon_run(void)
1149 struct shash device_shash;
1150 struct shash_node *node;
1152 shash_init(&device_shash);
1153 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1154 SHASH_FOR_EACH (node, &device_shash) {
1155 struct netdev_dev_linux *dev = node->data;
1158 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1162 netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
1163 if (miimon != dev->miimon) {
1166 dev->miimon = miimon;
1167 list = shash_find_data(&netdev_linux_notifiers,
1168 dev->netdev_dev.name);
1174 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1177 shash_destroy(&device_shash);
1181 netdev_linux_miimon_wait(void)
1183 struct shash device_shash;
1184 struct shash_node *node;
1186 shash_init(&device_shash);
1187 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1188 SHASH_FOR_EACH (node, &device_shash) {
1189 struct netdev_dev_linux *dev = node->data;
1191 if (dev->miimon_interval > 0) {
1192 timer_wait(&dev->miimon_timer);
1195 shash_destroy(&device_shash);
1198 /* Check whether we can use RTM_GETLINK to get network device statistics.
1199 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1202 check_for_working_netlink_stats(void)
1204 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1205 * preferable, so if that works, we'll use it. */
1206 int ifindex = do_get_ifindex("lo");
1208 VLOG_WARN("failed to get ifindex for lo, "
1209 "obtaining netdev stats from proc");
1212 struct netdev_stats stats;
1213 int error = get_stats_via_netlink(ifindex, &stats);
1215 VLOG_DBG("obtaining netdev stats via rtnetlink");
1218 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1219 "via proc (you are probably running a pre-2.6.19 "
1220 "kernel)", strerror(error));
1226 /* Brings the 'is_internal' and 'is_tap' members of 'netdev_dev' up-to-date.
 *
 * 'is_tap' is true when the device type is "tap"; 'is_internal' is true for
 * non-tap devices that dpif_linux_is_internal_device() reports as internal.
 * The result is cached: nothing is recomputed once VALID_IS_PSEUDO is set in
 * 'cache_valid'. */
1228 netdev_linux_update_is_pseudo(struct netdev_dev_linux *netdev_dev)
1230 if (!(netdev_dev->cache_valid & VALID_IS_PSEUDO)) {
1231 const char *name = netdev_dev_get_name(&netdev_dev->netdev_dev);
1232 const char *type = netdev_dev_get_type(&netdev_dev->netdev_dev);
1234 netdev_dev->is_tap = !strcmp(type, "tap");
1235 netdev_dev->is_internal = (!netdev_dev->is_tap
1236 && dpif_linux_is_internal_device(name));
1237 netdev_dev->cache_valid |= VALID_IS_PSEUDO;
/* NOTE(review): the body is not visible in this listing.  The function is
 * called by netdev_linux_get_stats() on paired rx/tx counters, so it
 * presumably exchanges '*a' and '*b' -- confirm. */
1242 swap_uint64(uint64_t *a, uint64_t *b)
1249 /* Retrieves current device stats for 'netdev'.
 *
 * netdev_vport_get_stats() is tried first, unless an earlier call already
 * recorded (under VALID_HAVE_VPORT_STATS) that vport stats are unavailable.
 * Without vport stats the function falls back to get_stats_via_netlink() or
 * get_stats_via_proc(); the choice is probed once with
 * check_for_working_netlink_stats() and kept in a function-local static.
 *
 * For internal and tap devices whose stats did not come from the vport layer,
 * the rx/tx packet, byte, error and drop counters are swapped and the
 * detailed rx/tx error counters are zeroed. */
1251 netdev_linux_get_stats(const struct netdev *netdev_,
1252 struct netdev_stats *stats)
1254 struct netdev_dev_linux *netdev_dev =
1255 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1256 static int use_netlink_stats = -1;
1259 if (netdev_dev->have_vport_stats ||
1260 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1262 error = netdev_vport_get_stats(netdev_, stats);
1263 netdev_dev->have_vport_stats = !error;
1264 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
1267 if (!netdev_dev->have_vport_stats) {
1268 if (use_netlink_stats < 0) {
1269 use_netlink_stats = check_for_working_netlink_stats();
1271 if (use_netlink_stats) {
1274 error = get_ifindex(netdev_, &ifindex);
1276 error = get_stats_via_netlink(ifindex, stats);
1279 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1283 /* If this port is an internal port then the transmit and receive stats
1284 * will appear to be swapped relative to the other ports since we are the
1285 * one sending the data, not a remote computer. For consistency, we swap
1286 * them back here. This does not apply if we are getting stats from the
1287 * vport layer because it always tracks stats from the perspective of the
1289 netdev_linux_update_is_pseudo(netdev_dev);
1290 if (!error && !netdev_dev->have_vport_stats &&
1291 (netdev_dev->is_internal || netdev_dev->is_tap)) {
1292 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1293 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1294 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1295 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1296 stats->rx_length_errors = 0;
1297 stats->rx_over_errors = 0;
1298 stats->rx_crc_errors = 0;
1299 stats->rx_frame_errors = 0;
1300 stats->rx_fifo_errors = 0;
1301 stats->rx_missed_errors = 0;
1302 stats->tx_aborted_errors = 0;
1303 stats->tx_carrier_errors = 0;
1304 stats->tx_fifo_errors = 0;
1305 stats->tx_heartbeat_errors = 0;
1306 stats->tx_window_errors = 0;
1312 /* Stores the features supported by 'netdev' into each of '*current',
1313 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1314 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
1315 * successful, otherwise a positive errno value.
 *
 * All values are derived from a single ETHTOOL_GSET query: the SUPPORTED_* and
 * ADVERTISED_* masks are translated bit by bit to OFPPF_* flags, and the
 * current setting is built from the reported speed, duplex and port type.
 * Peer advertisements are not implemented and are always reported as 0. */
1317 netdev_linux_get_features(const struct netdev *netdev,
1318 uint32_t *current, uint32_t *advertised,
1319 uint32_t *supported, uint32_t *peer)
1321 struct ethtool_cmd ecmd;
1324 memset(&ecmd, 0, sizeof ecmd);
1325 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1326 ETHTOOL_GSET, "ETHTOOL_GSET");
1331 /* Supported features. */
1333 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1334 *supported |= OFPPF_10MB_HD;
1336 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1337 *supported |= OFPPF_10MB_FD;
1339 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1340 *supported |= OFPPF_100MB_HD;
1342 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1343 *supported |= OFPPF_100MB_FD;
1345 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1346 *supported |= OFPPF_1GB_HD;
1348 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1349 *supported |= OFPPF_1GB_FD;
1351 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1352 *supported |= OFPPF_10GB_FD;
1354 if (ecmd.supported & SUPPORTED_TP) {
1355 *supported |= OFPPF_COPPER;
1357 if (ecmd.supported & SUPPORTED_FIBRE) {
1358 *supported |= OFPPF_FIBER;
1360 if (ecmd.supported & SUPPORTED_Autoneg) {
1361 *supported |= OFPPF_AUTONEG;
1363 if (ecmd.supported & SUPPORTED_Pause) {
1364 *supported |= OFPPF_PAUSE;
1366 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1367 *supported |= OFPPF_PAUSE_ASYM;
1370 /* Advertised features. */
1372 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1373 *advertised |= OFPPF_10MB_HD;
1375 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1376 *advertised |= OFPPF_10MB_FD;
1378 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1379 *advertised |= OFPPF_100MB_HD;
1381 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1382 *advertised |= OFPPF_100MB_FD;
1384 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1385 *advertised |= OFPPF_1GB_HD;
1387 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1388 *advertised |= OFPPF_1GB_FD;
1390 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1391 *advertised |= OFPPF_10GB_FD;
1393 if (ecmd.advertising & ADVERTISED_TP) {
1394 *advertised |= OFPPF_COPPER;
1396 if (ecmd.advertising & ADVERTISED_FIBRE) {
1397 *advertised |= OFPPF_FIBER;
1399 if (ecmd.advertising & ADVERTISED_Autoneg) {
1400 *advertised |= OFPPF_AUTONEG;
1402 if (ecmd.advertising & ADVERTISED_Pause) {
1403 *advertised |= OFPPF_PAUSE;
1405 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1406 *advertised |= OFPPF_PAUSE_ASYM;
1409 /* Current settings. */
1410 if (ecmd.speed == SPEED_10) {
1411 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1412 } else if (ecmd.speed == SPEED_100) {
1413 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1414 } else if (ecmd.speed == SPEED_1000) {
1415 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1416 } else if (ecmd.speed == SPEED_10000) {
1417 *current = OFPPF_10GB_FD;
1422 if (ecmd.port == PORT_TP) {
1423 *current |= OFPPF_COPPER;
1424 } else if (ecmd.port == PORT_FIBRE) {
1425 *current |= OFPPF_FIBER;
1429 *current |= OFPPF_AUTONEG;
1432 /* Peer advertisements. */
1433 *peer = 0; /* XXX */
1438 /* Set the features advertised by 'netdev' to 'advertise'.
 *
 * 'advertise' is a bitmap of OFPPF_* flags.  The current ethtool settings are
 * read with ETHTOOL_GSET, the advertising mask is rebuilt from scratch using
 * the ADVERTISED_* equivalent of each OFPPF_* bit that is set, and the result
 * is written back with ETHTOOL_SSET, whose return value is returned.
 *
 * NOTE(review): handling of a failed ETHTOOL_GSET is not visible in this
 * listing. */
1440 netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1442 struct ethtool_cmd ecmd;
1445 memset(&ecmd, 0, sizeof ecmd);
1446 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1447 ETHTOOL_GSET, "ETHTOOL_GSET");
1452 ecmd.advertising = 0;
1453 if (advertise & OFPPF_10MB_HD) {
1454 ecmd.advertising |= ADVERTISED_10baseT_Half;
1456 if (advertise & OFPPF_10MB_FD) {
1457 ecmd.advertising |= ADVERTISED_10baseT_Full;
1459 if (advertise & OFPPF_100MB_HD) {
1460 ecmd.advertising |= ADVERTISED_100baseT_Half;
1462 if (advertise & OFPPF_100MB_FD) {
1463 ecmd.advertising |= ADVERTISED_100baseT_Full;
1465 if (advertise & OFPPF_1GB_HD) {
1466 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1468 if (advertise & OFPPF_1GB_FD) {
1469 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1471 if (advertise & OFPPF_10GB_FD) {
1472 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1474 if (advertise & OFPPF_COPPER) {
1475 ecmd.advertising |= ADVERTISED_TP;
1477 if (advertise & OFPPF_FIBER) {
1478 ecmd.advertising |= ADVERTISED_FIBRE;
1480 if (advertise & OFPPF_AUTONEG) {
1481 ecmd.advertising |= ADVERTISED_Autoneg;
1483 if (advertise & OFPPF_PAUSE) {
1484 ecmd.advertising |= ADVERTISED_Pause;
1486 if (advertise & OFPPF_PAUSE_ASYM) {
1487 ecmd.advertising |= ADVERTISED_Asym_Pause;
1489 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1490 ETHTOOL_SSET, "ETHTOOL_SSET");
1493 /* If 'netdev' is a VLAN network device (e.g. one created with
1494 * vconfig(8)), sets '*vlan_vid' to the VLAN VID associated with that device
1495 * and returns 0. Otherwise returns an errno value (specifically ENOENT if
1496 * 'netdev' is a network device that is not a VLAN device) and
1497 * sets '*vlan_vid' to -1.
 *
 * The VID is read from the first line of /proc/net/vlan/<name>, which is
 * expected to have the form "<word> VID: <number>". */
1499 netdev_linux_get_vlan_vid(const struct netdev *netdev, int *vlan_vid)
1501 const char *netdev_name = netdev_get_name(netdev);
1502 struct ds line = DS_EMPTY_INITIALIZER;
1503 FILE *stream = NULL;
1507 COVERAGE_INC(netdev_get_vlan_vid);
1508 fn = xasprintf("/proc/net/vlan/%s", netdev_name);
1509 stream = fopen(fn, "r");
1515 if (ds_get_line(&line, stream)) {
1516 if (ferror(stream)) {
1518 VLOG_ERR_RL(&rl, "error reading \"%s\": %s", fn, strerror(errno));
1521 VLOG_ERR_RL(&rl, "unexpected end of file reading \"%s\"", fn);
/* sscanf() returns EOF (a nonzero value) when the line ends before the first
 * conversion, so "!sscanf(...)" would accept e.g. an empty line and leave
 * '*vlan_vid' unset.  Exactly one assignment ("%d"; "%*s" is not counted)
 * must have happened for the parse to be valid. */
1526 if (sscanf(ds_cstr(&line), "%*s VID: %d", vlan_vid) != 1) {
1528 VLOG_ERR_RL(&rl, "parse error reading \"%s\" line 1: \"%s\"",
1529 fn, ds_cstr(&line));
/* Shell command templates used to install ingress policing with tc(8).
 *
 * POLICE_ADD_CMD takes the device name (%s).  POLICE_CONFIG_CMD takes the
 * device name (%s) followed by the rate in kbit/s and the burst in kbit, both
 * uint32_t; they are formatted with PRIu32 so that values above INT_MAX are
 * not printed as negative numbers. */
#define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress"
#define POLICE_CONFIG_CMD "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 u32 match ip src 0.0.0.0/0 police rate %"PRIu32"kbit burst %"PRIu32"k mtu 65535 drop flowid :1"
1550 /* Remove ingress policing from 'netdev'. Returns 0 if successful, otherwise a
1551 * positive errno value.
 *
 * The request is an RTM_DELQDISC for handle ffff: with parent TC_H_INGRESS
 * and kind "ingress".  ENOENT and EINVAL results are not logged as failures.
 * On the path shown, the cached policing rate and burst are reset to 0 and
 * marked valid (VALID_POLICING).
1553 * This function is equivalent to running
1554 * /sbin/tc qdisc del dev %s handle ffff: ingress
1555 * but it is much, much faster.
1558 netdev_linux_remove_policing(struct netdev *netdev)
1560 struct netdev_dev_linux *netdev_dev =
1561 netdev_dev_linux_cast(netdev_get_dev(netdev));
1562 const char *netdev_name = netdev_get_name(netdev);
1564 struct ofpbuf request;
1565 struct tcmsg *tcmsg;
1568 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
1572 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
1573 tcmsg->tcm_parent = TC_H_INGRESS;
1574 nl_msg_put_string(&request, TCA_KIND, "ingress");
1575 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
1577 error = tc_transact(&request, NULL);
1578 if (error && error != ENOENT && error != EINVAL) {
1579 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1580 netdev_name, strerror(error));
1584 netdev_dev->kbits_rate = 0;
1585 netdev_dev->kbits_burst = 0;
1586 netdev_dev->cache_valid |= VALID_POLICING;
1590 /* Attempts to set input rate limiting (policing) policy.
 *
 * 'kbits_rate' of 0 forces the burst to 0; a nonzero rate with a zero
 * 'kbits_burst' uses a default burst of 1000.  If the cached settings
 * (VALID_POLICING) already match, they are assumed to still be in effect.
 * Otherwise any existing policing is removed and the new policy is installed
 * by running the POLICE_ADD_CMD and POLICE_CONFIG_CMD shell commands through
 * system(); on the path shown the new values are then cached.
 *
 * NOTE(review): the device name is interpolated into a shell command line
 * without quoting; confirm that device names cannot contain shell
 * metacharacters, or replace system() with a netlink request as
 * netdev_linux_remove_policing() does. */
1592 netdev_linux_set_policing(struct netdev *netdev,
1593 uint32_t kbits_rate, uint32_t kbits_burst)
1595 struct netdev_dev_linux *netdev_dev =
1596 netdev_dev_linux_cast(netdev_get_dev(netdev));
1597 const char *netdev_name = netdev_get_name(netdev);
1600 COVERAGE_INC(netdev_set_policing);
1602 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1603 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1604 : kbits_burst); /* Stick with user-specified value. */
1606 if (netdev_dev->cache_valid & VALID_POLICING
1607 && netdev_dev->kbits_rate == kbits_rate
1608 && netdev_dev->kbits_burst == kbits_burst) {
1609 /* Assume that settings haven't changed since we last set them. */
1613 netdev_linux_remove_policing(netdev);
1615 snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
1616 if (system(command) != 0) {
1617 VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
1621 snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
1622 kbits_rate, kbits_burst);
1623 if (system(command) != 0) {
1624 VLOG_WARN_RL(&rl, "%s: problem configuring policing",
1629 netdev_dev->kbits_rate = kbits_rate;
1630 netdev_dev->kbits_burst = kbits_burst;
1631 netdev_dev->cache_valid |= VALID_POLICING;
/* Adds to 'types' the 'ovs_name' of every entry of the NULL-terminated 'tcs'
 * array that has a 'tc_install' callback and a non-empty 'ovs_name'.
 * 'netdev' is not used. */
1638 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1641 const struct tc_ops **opsp;
1643 for (opsp = tcs; *opsp != NULL; opsp++) {
1644 const struct tc_ops *ops = *opsp;
1645 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1646 sset_add(types, ops->ovs_name);
/* Searches the NULL-terminated 'tcs' array for the entry whose 'ovs_name'
 * equals 'name'.
 *
 * NOTE(review): the return statements are not visible in this listing;
 * callers test the result against NULL, so a miss presumably yields NULL. */
1652 static const struct tc_ops *
1653 tc_lookup_ovs_name(const char *name)
1655 const struct tc_ops **opsp;
1657 for (opsp = tcs; *opsp != NULL; opsp++) {
1658 const struct tc_ops *ops = *opsp;
1659 if (!strcmp(name, ops->ovs_name)) {
/* Searches the NULL-terminated 'tcs' array for the entry whose 'linux_name'
 * equals 'name'.  Entries with a null 'linux_name' never match.
 *
 * NOTE(review): the return statements are not visible in this listing. */
1666 static const struct tc_ops *
1667 tc_lookup_linux_name(const char *name)
1669 const struct tc_ops **opsp;
1671 for (opsp = tcs; *opsp != NULL; opsp++) {
1672 const struct tc_ops *ops = *opsp;
1673 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Looks in the 'queues' hmap of 'netdev''s traffic-control state, within the
 * bucket selected by 'hash', for the queue whose 'queue_id' equals
 * 'queue_id'.
 *
 * NOTE(review): the return statements are not visible in this listing. */
1680 static struct tc_queue *
1681 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1684 struct netdev_dev_linux *netdev_dev =
1685 netdev_dev_linux_cast(netdev_get_dev(netdev));
1686 struct tc_queue *queue;
1688 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1689 if (queue->queue_id == queue_id) {
/* Convenience wrapper around tc_find_queue__() that computes the hash of
 * 'queue_id' itself, as hash_int(queue_id, 0). */
1696 static struct tc_queue *
1697 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1699 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Looks up the QoS type 'type' with tc_lookup_ovs_name() and reports its
 * 'n_queues' in 'caps->n_queues'.  'netdev' is not used.
 *
 * NOTE(review): handling of an unknown 'type' is not visible in this
 * listing. */
1703 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1705 struct netdev_qos_capabilities *caps)
1707 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1711 caps->n_queues = ops->n_queues;
/* Refreshes 'netdev''s qdisc state with tc_query_qdisc(), stores the OVS name
 * of the installed traffic-control class in '*typep', and, if that class has
 * a 'qdisc_get' callback, uses it to fill in 'details'.
 *
 * NOTE(review): the error path and the value returned when 'qdisc_get' is
 * null are not visible in this listing. */
1716 netdev_linux_get_qos(const struct netdev *netdev,
1717 const char **typep, struct shash *details)
1719 struct netdev_dev_linux *netdev_dev =
1720 netdev_dev_linux_cast(netdev_get_dev(netdev));
1723 error = tc_query_qdisc(netdev);
1728 *typep = netdev_dev->tc->ops->ovs_name;
1729 return (netdev_dev->tc->ops->qdisc_get
1730 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
/* Configures QoS of type 'type' with settings 'details' on 'netdev'.
 *
 * 'type' must name a traffic-control class that has a 'tc_install' callback.
 * If that class is already the installed one, it is reconfigured in place
 * through 'qdisc_set' (or nothing is done, returning 0, when there is no such
 * callback).  Otherwise the existing qdisc is deleted and the new one
 * installed.  The assertions check that deletion leaves 'tc' null and that
 * 'tc' is non-null exactly when installation succeeded.
 *
 * NOTE(review): the error returns are not visible in this listing. */
1735 netdev_linux_set_qos(struct netdev *netdev,
1736 const char *type, const struct shash *details)
1738 struct netdev_dev_linux *netdev_dev =
1739 netdev_dev_linux_cast(netdev_get_dev(netdev));
1740 const struct tc_ops *new_ops;
1743 new_ops = tc_lookup_ovs_name(type);
1744 if (!new_ops || !new_ops->tc_install) {
1748 error = tc_query_qdisc(netdev);
1753 if (new_ops == netdev_dev->tc->ops) {
1754 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1756 /* Delete existing qdisc. */
1757 error = tc_del_qdisc(netdev);
1761 assert(netdev_dev->tc == NULL);
1763 /* Install new qdisc. */
1764 error = new_ops->tc_install(netdev, details);
1765 assert((error == 0) == (netdev_dev->tc != NULL));
/* Refreshes the qdisc state, looks up queue 'queue_id' with tc_find_queue()
 * and, on the branch shown, retrieves its configuration into 'details'
 * through the installed class's 'class_get' callback.
 *
 * NOTE(review): the conditions and the alternative return values are not
 * visible in this listing. */
1772 netdev_linux_get_queue(const struct netdev *netdev,
1773 unsigned int queue_id, struct shash *details)
1775 struct netdev_dev_linux *netdev_dev =
1776 netdev_dev_linux_cast(netdev_get_dev(netdev));
1779 error = tc_query_qdisc(netdev);
1783 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1785 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
/* Refreshes the qdisc state and configures queue 'queue_id' with 'details'
 * through the installed class's 'class_set' callback.  The request takes a
 * separate branch when 'queue_id' is not below the class's 'n_queues' or when
 * the class has no 'class_set' callback.
 *
 * NOTE(review): the error codes returned on those branches are not visible in
 * this listing. */
1791 netdev_linux_set_queue(struct netdev *netdev,
1792 unsigned int queue_id, const struct shash *details)
1794 struct netdev_dev_linux *netdev_dev =
1795 netdev_dev_linux_cast(netdev_get_dev(netdev));
1798 error = tc_query_qdisc(netdev);
1801 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1802 || !netdev_dev->tc->ops->class_set) {
1806 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
/* Refreshes the qdisc state and, if the installed class has a 'class_delete'
 * callback, looks up queue 'queue_id' and, on the branch shown, deletes it
 * through that callback.
 *
 * NOTE(review): the error returns are not visible in this listing. */
1810 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1812 struct netdev_dev_linux *netdev_dev =
1813 netdev_dev_linux_cast(netdev_get_dev(netdev));
1816 error = tc_query_qdisc(netdev);
1819 } else if (!netdev_dev->tc->ops->class_delete) {
1822 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1824 ? netdev_dev->tc->ops->class_delete(netdev, queue)
/* Refreshes the qdisc state and, if the installed class has a
 * 'class_get_stats' callback, looks up queue 'queue_id' and, on the branch
 * shown, retrieves its statistics into 'stats' through that callback.
 *
 * NOTE(review): the error returns are not visible in this listing. */
1830 netdev_linux_get_queue_stats(const struct netdev *netdev,
1831 unsigned int queue_id,
1832 struct netdev_queue_stats *stats)
1834 struct netdev_dev_linux *netdev_dev =
1835 netdev_dev_linux_cast(netdev_get_dev(netdev));
1838 error = tc_query_qdisc(netdev);
1841 } else if (!netdev_dev->tc->ops->class_get_stats) {
1844 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1846 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
/* Starts a Netlink dump of 'netdev''s traffic classes into 'dump': builds an
 * RTM_GETTCLASS request with 'tcm_parent' 0, starts the dump on 'rtnl_sock',
 * and releases the request buffer.
 *
 * NOTE(review): the return value is not visible in this listing; the caller
 * treats a zero result as failure. */
1852 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1854 struct ofpbuf request;
1855 struct tcmsg *tcmsg;
1857 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1861 tcmsg->tcm_parent = 0;
1862 nl_dump_start(dump, rtnl_sock, &request);
1863 ofpbuf_uninit(&request);
/* Iterates over every queue in 'netdev''s traffic-control state.  For each
 * one, 'details' is cleared and refilled through the installed class's
 * 'class_get' callback and, on the path shown, 'cb' is invoked with the queue
 * ID, the details and 'aux'.  A separate branch is taken when the class has
 * no 'class_get' callback.
 *
 * NOTE(review): error accumulation and the return value are not visible in
 * this listing. */
1868 netdev_linux_dump_queues(const struct netdev *netdev,
1869 netdev_dump_queues_cb *cb, void *aux)
1871 struct netdev_dev_linux *netdev_dev =
1872 netdev_dev_linux_cast(netdev_get_dev(netdev));
1873 struct tc_queue *queue;
1874 struct shash details;
1878 error = tc_query_qdisc(netdev);
1881 } else if (!netdev_dev->tc->ops->class_get) {
1886 shash_init(&details);
1887 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
1888 shash_clear(&details);
1890 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
1892 (*cb)(queue->queue_id, &details, aux);
1897 shash_destroy(&details);
/* Dumps the kernel's traffic classes for 'netdev' (see start_queue_dump())
 * and hands each reply message to the installed class's 'class_dump_stats'
 * callback together with 'cb' and 'aux'.  A separate branch is taken when
 * the class has no such callback.
 *
 * Returns the error from nl_dump_done() if there is one, otherwise
 * 'last_error'.
 *
 * NOTE(review): how 'last_error' is maintained is not visible in this
 * listing. */
1903 netdev_linux_dump_queue_stats(const struct netdev *netdev,
1904 netdev_dump_queue_stats_cb *cb, void *aux)
1906 struct netdev_dev_linux *netdev_dev =
1907 netdev_dev_linux_cast(netdev_get_dev(netdev));
1908 struct nl_dump dump;
1913 error = tc_query_qdisc(netdev);
1916 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1921 if (!start_queue_dump(netdev, &dump)) {
1924 while (nl_dump_next(&dump, &msg)) {
1925 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1931 error = nl_dump_done(&dump);
1932 return error ? error : last_error;
/* Stores 'netdev_''s IPv4 address in '*address' and its netmask in
 * '*netmask'.  The values are fetched with SIOCGIFADDR and SIOCGIFNETMASK
 * the first time and cached under VALID_IN4 afterwards.
 *
 * Returns EADDRNOTAVAIL if the address is INADDR_ANY, otherwise 0.
 *
 * NOTE(review): the returns taken when an ioctl fails are not visible in this
 * listing. */
1936 netdev_linux_get_in4(const struct netdev *netdev_,
1937 struct in_addr *address, struct in_addr *netmask)
1939 struct netdev_dev_linux *netdev_dev =
1940 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1942 if (!(netdev_dev->cache_valid & VALID_IN4)) {
1945 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
1946 SIOCGIFADDR, "SIOCGIFADDR");
1951 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
1952 SIOCGIFNETMASK, "SIOCGIFNETMASK");
1957 netdev_dev->cache_valid |= VALID_IN4;
1959 *address = netdev_dev->address;
1960 *netmask = netdev_dev->netmask;
1961 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
/* Assigns IPv4 'address' to 'netdev_' with SIOCSIFADDR.  On the path shown,
 * the cached address and netmask are updated and marked valid (VALID_IN4),
 * and, unless 'address' is INADDR_ANY, 'netmask' is applied with
 * SIOCSIFNETMASK.
 *
 * NOTE(review): the condition guarding the cache update and the return value
 * are not visible in this listing. */
1965 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
1966 struct in_addr netmask)
1968 struct netdev_dev_linux *netdev_dev =
1969 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1972 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
1974 netdev_dev->cache_valid |= VALID_IN4;
1975 netdev_dev->address = address;
1976 netdev_dev->netmask = netmask;
1977 if (address.s_addr != INADDR_ANY) {
1978 error = do_set_addr(netdev_, SIOCSIFNETMASK,
1979 "SIOCSIFNETMASK", netmask);
/* Parses one line in the format used by /proc/net/if_inet6.
 *
 * The format string shown expects 16 two-digit hexadecimal bytes, which are
 * stored into 'in6->s6_addr', then four hexadecimal fields that are skipped,
 * then an interface name of at most 16 characters.
 *
 * NOTE(review): the scanf call itself, the destination of the interface name
 * and the returned value are not visible in this listing. */
1986 parse_if_inet6_line(const char *line,
1987 struct in6_addr *in6, char ifname[16 + 1])
1989 uint8_t *s6 = in6->s6_addr;
1990 #define X8 "%2"SCNx8
1992 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
1993 "%*x %*x %*x %*x %16s\n",
1994 &s6[0], &s6[1], &s6[2], &s6[3],
1995 &s6[4], &s6[5], &s6[6], &s6[7],
1996 &s6[8], &s6[9], &s6[10], &s6[11],
1997 &s6[12], &s6[13], &s6[14], &s6[15],
2001 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2002 * 'in6' is non-null) and returns true. Otherwise, returns false.
 *
 * The address is looked up by scanning /proc/net/if_inet6 for a line whose
 * interface name equals the device name; it defaults to 'in6addr_any' when
 * the file cannot be opened or no line matches.  The result is cached under
 * VALID_IN6.
 *
 * NOTE(review): in this listing the store to '*in6' appears unguarded and
 * the return statement is not visible; confirm the null handling and the
 * return convention stated above. */
2004 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2006 struct netdev_dev_linux *netdev_dev =
2007 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2008 if (!(netdev_dev->cache_valid & VALID_IN6)) {
2012 netdev_dev->in6 = in6addr_any;
2014 file = fopen("/proc/net/if_inet6", "r");
2016 const char *name = netdev_get_name(netdev_);
2017 while (fgets(line, sizeof line, file)) {
2018 struct in6_addr in6_tmp;
2019 char ifname[16 + 1];
2020 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2021 && !strcmp(name, ifname))
2023 netdev_dev->in6 = in6_tmp;
2029 netdev_dev->cache_valid |= VALID_IN6;
2031 *in6 = netdev_dev->in6;
/* Fills '*sa' with an AF_INET socket address holding 'addr': a zeroed
 * 'struct sockaddr_in' is built first, then '*sa' is zeroed and the
 * sockaddr_in is copied over it. */
2036 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2038 struct sockaddr_in sin;
2039 memset(&sin, 0, sizeof sin);
2040 sin.sin_family = AF_INET;
2041 sin.sin_addr = addr;
2044 memset(sa, 0, sizeof *sa);
2045 memcpy(sa, &sin, sizeof sin);
/* Issues address-setting ioctl 'ioctl_nr' for 'netdev' with IPv4 address
 * 'addr': the device name is copied into 'ifr.ifr_name', 'addr' is stored in
 * 'ifr.ifr_addr', and the request is passed to netdev_linux_do_ioctl(),
 * whose result is returned.  'ioctl_name' is the ioctl's name as a string. */
2049 do_set_addr(struct netdev *netdev,
2050 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2053 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2054 make_in4_sockaddr(&ifr.ifr_addr, addr);
2056 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2060 /* Adds 'router' as a default IP gateway.
 *
 * The route is added with the SIOCADDRT ioctl on 'af_inet_sock', using
 * INADDR_ANY as both destination and netmask and the flags
 * RTF_UP | RTF_GATEWAY.  A failure is logged as a warning.  'netdev' is not
 * used. */
2062 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2064 struct in_addr any = { INADDR_ANY };
2068 memset(&rt, 0, sizeof rt);
2069 make_in4_sockaddr(&rt.rt_dst, any);
2070 make_in4_sockaddr(&rt.rt_gateway, router);
2071 make_in4_sockaddr(&rt.rt_genmask, any);
2072 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2073 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2075 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
/* Looks up the route to 'host' by scanning /proc/net/route.
 *
 * '*netdev_name' starts out as NULL.  Lines that cannot be parsed are logged
 * and routes that are not up are skipped.  For a route whose destination
 * matches 'host' under the route's mask, '*next_hop' is set to 0 when the
 * host is directly reachable or to the gateway address otherwise, and
 * '*netdev_name' receives an xstrdup()'d copy of the interface name.
 *
 * NOTE(review): the test that distinguishes direct from gatewayed routes, the
 * loop exit and the return value are not visible in this listing. */
2081 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2084 static const char fn[] = "/proc/net/route";
2089 *netdev_name = NULL;
2090 stream = fopen(fn, "r");
2091 if (stream == NULL) {
2092 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2097 while (fgets(line, sizeof line, stream)) {
2100 ovs_be32 dest, gateway, mask;
2101 int refcnt, metric, mtu;
2102 unsigned int flags, use, window, irtt;
2105 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2107 iface, &dest, &gateway, &flags, &refcnt,
2108 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2110 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2114 if (!(flags & RTF_UP)) {
2115 /* Skip routes that aren't up. */
2119 /* The output of 'dest', 'mask', and 'gateway' were given in
2120 * network byte order, so we don't need any endian
2121 * conversions here. */
2122 if ((dest & mask) == (host->s_addr & mask)) {
2124 /* The host is directly reachable. */
2125 next_hop->s_addr = 0;
2127 /* To reach the host, we must go through a gateway. */
2128 next_hop->s_addr = gateway;
2130 *netdev_name = xstrdup(iface);
/* Queries 'netdev''s driver information with the ETHTOOL_GDRVINFO ethtool
 * command and adds three entries to 'sh': "driver_name", "driver_version"
 * and "firmware_version".  Each value is a copy made with xstrdup().
 *
 * NOTE(review): the condition guarding the additions and the return value
 * are not visible in this listing. */
2142 netdev_linux_get_status(const struct netdev *netdev, struct shash *sh)
2144 struct ethtool_drvinfo drvinfo;
2147 memset(&drvinfo, 0, sizeof drvinfo);
2148 error = netdev_linux_do_ethtool(netdev_get_name(netdev),
2149 (struct ethtool_cmd *)&drvinfo,
2151 "ETHTOOL_GDRVINFO");
2153 shash_add(sh, "driver_name", xstrdup(drvinfo.driver));
2154 shash_add(sh, "driver_version", xstrdup(drvinfo.version));
2155 shash_add(sh, "firmware_version", xstrdup(drvinfo.fw_version));
2161 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2162 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2163 * returns 0. Otherwise, it returns a positive errno value; in particular,
2164 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'.
 *
 * The lookup is a SIOCGARP ioctl on 'af_inet_sock'.  Failures other than
 * ENXIO are logged as warnings. */
2166 netdev_linux_arp_lookup(const struct netdev *netdev,
2167 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2170 struct sockaddr_in sin;
2173 memset(&r, 0, sizeof r);
2174 memset(&sin, 0, sizeof sin);
2175 sin.sin_family = AF_INET;
2176 sin.sin_addr.s_addr = ip;
2178 memcpy(&r.arp_pa, &sin, sizeof sin);
2179 r.arp_ha.sa_family = ARPHRD_ETHER;
2181 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2182 COVERAGE_INC(netdev_arp_lookup);
2183 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2185 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2186 } else if (retval != ENXIO) {
2187 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2188 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
/* Translates the netdev flags in 'nd' for use as interface flags (see
 * netdev_linux_update_flags()).  NETDEV_UP and NETDEV_PROMISC are tested.
 *
 * NOTE(review): the resulting assignments and the return statement are not
 * visible in this listing. */
2194 nd_to_iff_flags(enum netdev_flags nd)
2197 if (nd & NETDEV_UP) {
2200 if (nd & NETDEV_PROMISC) {
/* Translates the interface flags in 'iff' to netdev flags.  IFF_PROMISC maps
 * to NETDEV_PROMISC.
 *
 * NOTE(review): other mappings and the return statement are not visible in
 * this listing. */
2207 iff_to_nd_flags(int iff)
2209 enum netdev_flags nd = 0;
2213 if (iff & IFF_PROMISC) {
2214 nd |= NETDEV_PROMISC;
/* Turns off the flags in 'off' and turns on the flags in 'on' for 'netdev'.
 * The flags in effect before the change are reported in '*old_flagsp' as
 * netdev flags.  set_flags() is called only when the computed flags differ
 * from the current ones.
 *
 * NOTE(review): the error handling around get_flags() and the return value
 * are not visible in this listing. */
2220 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2221 enum netdev_flags on, enum netdev_flags *old_flagsp)
2223 int old_flags, new_flags;
2226 error = get_flags(netdev, &old_flags);
2228 *old_flagsp = iff_to_nd_flags(old_flags);
2229 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2230 if (new_flags != old_flags) {
2231 error = set_flags(netdev, new_flags);
/* Visits every notifier on 'list', obtaining the generic
 * 'struct netdev_notifier' embedded in each 'struct netdev_linux_notifier'.
 *
 * NOTE(review): the statement that uses 'n' is not visible in this listing.
 * The address-of expression below had been corrupted into "¬ifier" by an
 * HTML-entity decoding of "&not"; it is restored here. */
2238 poll_notify(struct list *list)
2240 struct netdev_linux_notifier *notifier;
2241 LIST_FOR_EACH (notifier, node, list) {
2242 struct netdev_notifier *n = &notifier->notifier;
/* Callback registered with rtnetlink_link_notifier_register() (see
 * netdev_linux_poll_add()).  One path looks up a single notifier list in
 * 'netdev_linux_notifiers'; another calls poll_notify() on every registered
 * list.  'aux' is not used.
 *
 * NOTE(review): the conditions that select between the two paths, and the
 * key used for the single lookup, are not visible in this listing. */
2248 netdev_linux_poll_cb(const struct rtnetlink_link_change *change,
2249 void *aux OVS_UNUSED)
2252 struct list *list = shash_find_data(&netdev_linux_notifiers,
2258 struct shash_node *node;
2259 SHASH_FOR_EACH (node, &netdev_linux_notifiers) {
2260 poll_notify(node->data);
/* Registers 'cb' (with 'aux') as a change notifier for 'netdev' and stores
 * the new notifier in '*notifierp'.
 *
 * When no notifier is registered yet, the rtnetlink link notifier is
 * registered first with netdev_linux_poll_cb() as its callback.  Notifiers
 * are kept in per-device lists stored in 'netdev_linux_notifiers' under the
 * device name; the list is created on first use.
 *
 * NOTE(review): error returns and list initialization are not visible in
 * this listing.  The three address-of expressions at the end had been
 * corrupted into "¬ifier" by an HTML-entity decoding of "&not"; they are
 * restored here. */
2266 netdev_linux_poll_add(struct netdev *netdev,
2267 void (*cb)(struct netdev_notifier *), void *aux,
2268 struct netdev_notifier **notifierp)
2270 const char *netdev_name = netdev_get_name(netdev);
2271 struct netdev_linux_notifier *notifier;
2274 if (shash_is_empty(&netdev_linux_notifiers)) {
2276 error = rtnetlink_link_notifier_register(&netdev_linux_poll_notifier,
2277 netdev_linux_poll_cb, NULL);
2283 list = shash_find_data(&netdev_linux_notifiers, netdev_name);
2285 list = xmalloc(sizeof *list);
2287 shash_add(&netdev_linux_notifiers, netdev_name, list);
2290 notifier = xmalloc(sizeof *notifier);
2291 netdev_notifier_init(&notifier->notifier, netdev, cb, aux);
2292 list_push_back(list, &notifier->node);
2293 *notifierp = &notifier->notifier;
/* Unregisters 'notifier_', which must have been created by
 * netdev_linux_poll_add().  The notifier is unlinked from its per-device
 * list; if that leaves the list empty, the list's entry is deleted from
 * 'netdev_linux_notifiers', and if that in turn leaves no notifiers at all,
 * the rtnetlink link notifier is unregistered.
 *
 * NOTE(review): the statements that free the list and the notifier are not
 * visible in this listing.  The address-of expression passed to
 * list_remove() had been corrupted into "¬ifier" by an HTML-entity decoding
 * of "&not"; it is restored here. */
2298 netdev_linux_poll_remove(struct netdev_notifier *notifier_)
2300 struct netdev_linux_notifier *notifier =
2301 CONTAINER_OF(notifier_, struct netdev_linux_notifier, notifier);
2304 /* Remove 'notifier' from its list. */
2305 list = list_remove(&notifier->node);
2306 if (list_is_empty(list)) {
2307 /* The list is now empty. Remove it from the hash and free it. */
2308 const char *netdev_name = netdev_get_name(notifier->notifier.netdev);
2309 shash_delete(&netdev_linux_notifiers,
2310 shash_find(&netdev_linux_notifiers, netdev_name));
2315 /* If that was the last notifier, unregister. */
2316 if (shash_is_empty(&netdev_linux_notifiers)) {
2317 rtnetlink_link_notifier_unregister(&netdev_linux_poll_notifier);
/* Expands to the initializer of a netdev class that uses the netdev_linux_*
 * implementations listed below.  The classes defined after this macro differ
 * only in the NAME, CREATE, ENUMERATE and SET_STATS arguments.
 *
 * NOTE(review): several entries of the initializer, including the positions
 * where the macro arguments are used, are not visible in this listing. */
2321 #define NETDEV_LINUX_CLASS(NAME, CREATE, ENUMERATE, SET_STATS) \
2325 netdev_linux_init, \
2327 netdev_linux_wait, \
2330 netdev_linux_destroy, \
2331 NULL, /* set_config */ \
2333 netdev_linux_open, \
2334 netdev_linux_close, \
2338 netdev_linux_recv, \
2339 netdev_linux_recv_wait, \
2340 netdev_linux_drain, \
2342 netdev_linux_send, \
2343 netdev_linux_send_wait, \
2345 netdev_linux_set_etheraddr, \
2346 netdev_linux_get_etheraddr, \
2347 netdev_linux_get_mtu, \
2348 netdev_linux_get_ifindex, \
2349 netdev_linux_get_carrier, \
2350 netdev_linux_set_miimon_interval, \
2351 netdev_linux_get_stats, \
2354 netdev_linux_get_features, \
2355 netdev_linux_set_advertisements, \
2356 netdev_linux_get_vlan_vid, \
2358 netdev_linux_set_policing, \
2359 netdev_linux_get_qos_types, \
2360 netdev_linux_get_qos_capabilities, \
2361 netdev_linux_get_qos, \
2362 netdev_linux_set_qos, \
2363 netdev_linux_get_queue, \
2364 netdev_linux_set_queue, \
2365 netdev_linux_delete_queue, \
2366 netdev_linux_get_queue_stats, \
2367 netdev_linux_dump_queues, \
2368 netdev_linux_dump_queue_stats, \
2370 netdev_linux_get_in4, \
2371 netdev_linux_set_in4, \
2372 netdev_linux_get_in6, \
2373 netdev_linux_add_router, \
2374 netdev_linux_get_next_hop, \
2375 netdev_linux_get_status, \
2376 netdev_linux_arp_lookup, \
2378 netdev_linux_update_flags, \
2380 netdev_linux_poll_add, \
2381 netdev_linux_poll_remove \
/* Netdev class created with netdev_linux_create(), enumerable through
 * netdev_linux_enumerate(), and without a set_stats implementation.
 * (The class name argument is not visible in this listing.) */
2384 const struct netdev_class netdev_linux_class =
2387 netdev_linux_create,
2388 netdev_linux_enumerate,
2389 NULL); /* set_stats */
/* Netdev class for tap devices: created with netdev_linux_create_tap(), with
 * no enumerate and no set_stats implementation.
 * (The class name argument is not visible in this listing.) */
2391 const struct netdev_class netdev_tap_class =
2394 netdev_linux_create_tap,
2395 NULL, /* enumerate */
2396 NULL); /* set_stats */
/* Netdev class for internal devices: created with netdev_linux_create(), not
 * enumerable, and using netdev_vport_set_stats() as its set_stats
 * implementation.  (The class name argument is not visible in this
 * listing.) */
2398 const struct netdev_class netdev_internal_class =
2401 netdev_linux_create,
2402 NULL, /* enumerate */
2403 netdev_vport_set_stats);
2405 /* HTB traffic control class. */
/* HTB_N_QUEUES bounds the class minor numbers that htb_parse_tcmsg__()
 * accepts as queues: minors 1 through HTB_N_QUEUES map to queue IDs
 * 0 through HTB_N_QUEUES - 1.
 *
 * NOTE(review): the struct headers enclosing the fields below are not
 * visible in this listing. */
2407 #define HTB_N_QUEUES 0xf000
2411 unsigned int max_rate; /* In bytes/s. */
2415 struct tc_queue tc_queue;
2416 unsigned int min_rate; /* In bytes/s. */
2417 unsigned int max_rate; /* In bytes/s. */
2418 unsigned int burst; /* In bytes. */
2419 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the 'struct htb' that embeds 'netdev''s traffic-control state as
 * its 'tc' member.  The result is only meaningful when the installed
 * traffic-control state really is an HTB one; nothing here checks that. */
2423 htb_get__(const struct netdev *netdev)
2425 struct netdev_dev_linux *netdev_dev =
2426 netdev_dev_linux_cast(netdev_get_dev(netdev));
2427 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
/* Allocates a 'struct htb', initializes its traffic-control state for
 * 'tc_ops_htb', records 'max_rate', and installs it as 'netdev''s
 * traffic-control state.
 *
 * NOTE(review): 'max_rate' is a uint64_t here, while the 'max_rate' fields
 * shown in the HTB declarations are 'unsigned int'; if the destination field
 * is one of those, large values are truncated -- confirm. */
2431 htb_install__(struct netdev *netdev, uint64_t max_rate)
2433 struct netdev_dev_linux *netdev_dev =
2434 netdev_dev_linux_cast(netdev_get_dev(netdev));
2437 htb = xmalloc(sizeof *htb);
2438 tc_init(&htb->tc, &tc_ops_htb);
2439 htb->max_rate = max_rate;
2441 netdev_dev->tc = &htb->tc;
2444 /* Create an HTB qdisc.
 *
 * Any existing qdisc on 'netdev' is deleted first.  The request is an
 * RTM_NEWQDISC (NLM_F_EXCL | NLM_F_CREATE) for handle 1: under TC_H_ROOT with
 * kind "htb" and a TCA_HTB_INIT option whose 'rate2quantum' is 10.  Returns
 * the result of tc_transact().
 *
2446 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2448 htb_setup_qdisc__(struct netdev *netdev)
2451 struct tc_htb_glob opt;
2452 struct ofpbuf request;
2453 struct tcmsg *tcmsg;
2455 tc_del_qdisc(netdev);
2457 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2458 NLM_F_EXCL | NLM_F_CREATE, &request);
2462 tcmsg->tcm_handle = tc_make_handle(1, 0);
2463 tcmsg->tcm_parent = TC_H_ROOT;
2465 nl_msg_put_string(&request, TCA_KIND, "htb");
2467 memset(&opt, 0, sizeof opt);
2468 opt.rate2quantum = 10;
2472 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2473 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2474 nl_msg_end_nested(&request, opt_offset);
2476 return tc_transact(&request, NULL);
2479 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2480 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>".
 *
 * The device MTU is needed to fill in the rate tables and buffer sizes; a
 * device whose MTU is reported as INT_MAX is rejected with a warning.  A
 * failed Netlink transaction is logged with the class parameters.
 *
 * NOTE(review): the returned values are not visible in this listing. */
2482 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2483 unsigned int parent, struct htb_class *class)
2486 struct tc_htb_opt opt;
2487 struct ofpbuf request;
2488 struct tcmsg *tcmsg;
2492 netdev_get_mtu(netdev, &mtu);
2493 if (mtu == INT_MAX) {
2494 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2495 netdev_get_name(netdev));
2499 memset(&opt, 0, sizeof opt);
2500 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2501 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2502 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2503 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2504 opt.prio = class->priority;
2506 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2510 tcmsg->tcm_handle = handle;
2511 tcmsg->tcm_parent = parent;
2513 nl_msg_put_string(&request, TCA_KIND, "htb");
2514 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2515 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2516 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2517 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2518 nl_msg_end_nested(&request, opt_offset);
2520 error = tc_transact(&request, NULL);
2522 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2523 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2524 netdev_get_name(netdev),
2525 tc_get_major(handle), tc_get_minor(handle),
2526 tc_get_major(parent), tc_get_minor(parent),
2527 class->min_rate, class->max_rate,
2528 class->burst, class->priority, strerror(error));
2533 /* Parses Netlink attributes in 'nl_options' for HTB parameters and stores a
2534 * description of them into '*class'. The description complies with the
2535 * specification given in the vswitch database documentation for linux-htb
2538 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2540 static const struct nl_policy tca_htb_policy[] = {
2541 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2542 .min_len = sizeof(struct tc_htb_opt) },
2545 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2546 const struct tc_htb_opt *htb;
2548 if (!nl_parse_nested(nl_options, tca_htb_policy,
2549 attrs, ARRAY_SIZE(tca_htb_policy))) {
2550 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2554 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2555 class->min_rate = htb->rate.rate;
2556 class->max_rate = htb->ceil.rate;
2557 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2558 class->priority = htb->prio;
/* Parses the class described by 'tcmsg'.  When 'queue_id' is nonnull, a class
 * handle of the form 1:N with 1 <= N <= HTB_N_QUEUES is reported as queue
 * N - 1.  When 'options' is nonnull, the class's HTB options are parsed into
 * it.  'stats' is handed straight to tc_parse_class(). */
2563 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2564 struct htb_class *options,
2565 struct netdev_queue_stats *stats)
2567 struct nlattr *nl_options;
2568 unsigned int handle;
2571 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2572 if (!error && queue_id) {
2573 unsigned int major = tc_get_major(handle);
2574 unsigned int minor = tc_get_minor(handle);
2575 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
/* Minor numbers are 1-based; queue IDs are 0-based. */
2576 *queue_id = minor - 1;
2581 if (!error && options) {
2582 error = htb_parse_tca_options__(nl_options, options);
2588 htb_parse_qdisc_details__(struct netdev *netdev,
2589 const struct shash *details, struct htb_class *hc)
2591 const char *max_rate_s;
2593 max_rate_s = shash_find_data(details, "max-rate");
2594 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2595 if (!hc->max_rate) {
2598 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2599 hc->max_rate = netdev_features_to_bps(current) / 8;
2601 hc->min_rate = hc->max_rate;
/* Parses the per-queue 'details' ("min-rate", "max-rate", "burst" and
 * "priority") for 'netdev' into 'hc'.  The rate and burst strings are divided
 * by 8 before being stored.  A warning is logged when netdev_get_mtu() leaves
 * 'mtu' at INT_MAX. */
2607 htb_parse_class_details__(struct netdev *netdev,
2608 const struct shash *details, struct htb_class *hc)
2610 const struct htb *htb = htb_get__(netdev);
2611 const char *min_rate_s = shash_find_data(details, "min-rate");
2612 const char *max_rate_s = shash_find_data(details, "max-rate");
2613 const char *burst_s = shash_find_data(details, "burst");
2614 const char *priority_s = shash_find_data(details, "priority");
2617 netdev_get_mtu(netdev, &mtu);
2618 if (mtu == INT_MAX) {
2619 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2620 netdev_get_name(netdev));
2624 /* HTB requires at least an mtu sized min-rate to send any traffic even
2625 * on uncongested links. */
2626 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2627 hc->min_rate = MAX(hc->min_rate, mtu);
2628 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
/* max_rate ends up within [hc->min_rate, htb->max_rate]. */
2631 hc->max_rate = (max_rate_s
2632 ? strtoull(max_rate_s, NULL, 10) / 8
2634 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2635 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2639 * According to hints in the documentation that I've read, it is important
2640 * that 'burst' be at least as big as the largest frame that might be
2641 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2642 * but having it a bit too small is a problem. Since netdev_get_mtu()
2643 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2644 * the MTU. We actually add 64, instead of 14, as a guard against
2645 * additional headers get tacked on somewhere that we're not aware of. */
2646 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2647 hc->burst = MAX(hc->burst, mtu + 64);
2650 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Asks the kernel about class 'handle' (with parent 'parent') on 'netdev' and
 * parses the reply into 'options' and 'stats' with htb_parse_tcmsg__().  The
 * reply buffer is deleted here. */
2656 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2657 unsigned int parent, struct htb_class *options,
2658 struct netdev_queue_stats *stats)
2660 struct ofpbuf *reply;
2663 error = tc_query_class(netdev, handle, parent, &reply);
2665 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2666 ofpbuf_delete(reply);
/* Installs HTB on 'netdev': sets up the qdisc, creates class 1:fffe under
 * 1:0 from the qdisc-level 'details', and records the resulting max_rate
 * through htb_install__(). */
2672 htb_tc_install(struct netdev *netdev, const struct shash *details)
2676 error = htb_setup_qdisc__(netdev);
2678 struct htb_class hc;
2680 htb_parse_qdisc_details__(netdev, details, &hc);
2681 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2682 tc_make_handle(1, 0), &hc);
2684 htb_install__(netdev, hc.max_rate);
2690 static struct htb_class *
2691 htb_class_cast__(const struct tc_queue *queue)
2693 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Records 'hc' as the settings of queue 'queue_id' in 'netdev''s HTB state.
 * The settings are stored either in the htb_class wrapping the queue that
 * tc_find_queue__() returned, or in a newly allocated htb_class that is
 * inserted into the queue hmap under the same hash. */
2697 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2698 const struct htb_class *hc)
2700 struct htb *htb = htb_get__(netdev);
2701 size_t hash = hash_int(queue_id, 0);
2702 struct tc_queue *queue;
2703 struct htb_class *hcp;
2705 queue = tc_find_queue__(netdev, queue_id, hash);
2707 hcp = htb_class_cast__(queue);
2709 hcp = xmalloc(sizeof *hcp);
2710 queue = &hcp->tc_queue;
2711 queue->queue_id = queue_id;
2712 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
/* Copy the four HTB parameters field by field. */
2715 hcp->min_rate = hc->min_rate;
2716 hcp->max_rate = hc->max_rate;
2717 hcp->burst = hc->burst;
2718 hcp->priority = hc->priority;
/* Loads the HTB configuration that already exists in the kernel for 'netdev':
 * queries class 1:fffe for the qdisc max_rate, installs the HTB state, then
 * dumps the classes and records every one that parses as a queue.  'nlmsg' is
 * unused. */
2722 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2725 struct nl_dump dump;
2726 struct htb_class hc;
2728 /* Get qdisc options. */
2730 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2731 htb_install__(netdev, hc.max_rate);
2734 if (!start_queue_dump(netdev, &dump)) {
2737 while (nl_dump_next(&dump, &msg)) {
2738 unsigned int queue_id;
/* Messages that fail to parse are skipped rather than aborting the load. */
2740 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2741 htb_update_queue__(netdev, queue_id, &hc);
2744 nl_dump_done(&dump);
/* Tears down the HTB state that embeds 'tc'.  Every queue is unlinked from the
 * queue hmap; the _SAFE iterator is needed because entries are removed while
 * iterating. */
2750 htb_tc_destroy(struct tc *tc)
2752 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2753 struct htb_class *hc, *next;
2755 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2756 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Adds "max-rate" to 'details': the qdisc's stored max_rate multiplied by 8,
 * formatted as a decimal string. */
2764 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2766 const struct htb *htb = htb_get__(netdev);
2767 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
/* Reconfigures the qdisc-level settings of 'netdev' from 'details' by
 * re-sending class 1:fffe (parent 1:0) and caching the new max_rate in the HTB
 * state. */
2772 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2774 struct htb_class hc;
2777 htb_parse_qdisc_details__(netdev, details, &hc);
2778 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2779 tc_make_handle(1, 0), &hc);
2781 htb_get__(netdev)->max_rate = hc.max_rate;
/* Describes 'queue' in 'details'.  Stored rates and burst are multiplied by 8
 * on output.  "max-rate" is only reported when it differs from "min-rate".
 * 'netdev' is unused. */
2787 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2788 const struct tc_queue *queue, struct shash *details)
2790 const struct htb_class *hc = htb_class_cast__(queue);
2792 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2793 if (hc->min_rate != hc->max_rate) {
2794 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2796 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2798 shash_add(details, "priority", xasprintf("%u", hc->priority));
/* Configures queue 'queue_id' on 'netdev' from 'details': parses them, sends
 * class 1:(queue_id + 1) with parent 1:fffe to the kernel, and then mirrors the
 * settings into the local queue table. */
2804 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2805 const struct shash *details)
2807 struct htb_class hc;
2810 error = htb_parse_class_details__(netdev, details, &hc);
2815 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2816 tc_make_handle(1, 0xfffe), &hc);
2821 htb_update_queue__(netdev, queue_id, &hc);
/* Deletes the kernel class 1:(queue_id + 1) that backs 'queue' and unlinks the
 * queue from 'netdev''s HTB queue table. */
2826 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2828 struct htb_class *hc = htb_class_cast__(queue);
2829 struct htb *htb = htb_get__(netdev);
2832 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2834 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches statistics for 'queue' into 'stats' by querying class
 * 1:(queue_id + 1) under parent 1:fffe.  No class options are requested. */
2841 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2842 struct netdev_queue_stats *stats)
2844 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2845 tc_make_handle(1, 0xfffe), NULL, stats);
/* Parses the class statistics in 'nlmsg' and, when the class handle is 1:N
 * with 1 <= N <= HTB_N_QUEUES, invokes 'cb' with queue ID N - 1, the parsed
 * statistics and 'aux'.  'netdev' is unused. */
2849 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2850 const struct ofpbuf *nlmsg,
2851 netdev_dump_queue_stats_cb *cb, void *aux)
2853 struct netdev_queue_stats stats;
2854 unsigned int handle, major, minor;
2857 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2862 major = tc_get_major(handle);
2863 minor = tc_get_minor(handle);
2864 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2865 (*cb)(minor - 1, &stats, aux);
/* Operations table for the "linux-htb" QoS type. */
2870 static const struct tc_ops tc_ops_htb = {
2871 "htb", /* linux_name */
2872 "linux-htb", /* ovs_name */
2873 HTB_N_QUEUES, /* n_queues */
2882 htb_class_get_stats, /* class_get_stats */
2883 htb_class_dump_stats /* class_dump_stats */
2886 /* "linux-hfsc" traffic control class. */
2888 #define HFSC_N_QUEUES 0xf000
2896 struct tc_queue tc_queue;
2901 static struct hfsc *
2902 hfsc_get__(const struct netdev *netdev)
2904 struct netdev_dev_linux *netdev_dev;
2905 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2906 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
2909 static struct hfsc_class *
2910 hfsc_class_cast__(const struct tc_queue *queue)
2912 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates a new struct hfsc holding 'max_rate', initializes its embedded tc
 * with tc_ops_hfsc, and attaches it to 'netdev''s device as its tc state. */
2916 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2918 struct netdev_dev_linux * netdev_dev;
2921 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2922 hfsc = xmalloc(sizeof *hfsc);
2923 tc_init(&hfsc->tc, &tc_ops_hfsc);
2924 hfsc->max_rate = max_rate;
2925 netdev_dev->tc = &hfsc->tc;
/* Records 'hc' as the settings of queue 'queue_id' in 'netdev''s HFSC state.
 * The rates are stored either in the hfsc_class wrapping the queue that
 * tc_find_queue__() returned, or in a newly allocated hfsc_class that is
 * inserted into the queue hmap under the same hash. */
2929 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2930 const struct hfsc_class *hc)
2934 struct hfsc_class *hcp;
2935 struct tc_queue *queue;
2937 hfsc = hfsc_get__(netdev);
2938 hash = hash_int(queue_id, 0);
2940 queue = tc_find_queue__(netdev, queue_id, hash);
2942 hcp = hfsc_class_cast__(queue);
2944 hcp = xmalloc(sizeof *hcp);
2945 queue = &hcp->tc_queue;
2946 queue->queue_id = queue_id;
2947 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
2950 hcp->min_rate = hc->min_rate;
2951 hcp->max_rate = hc->max_rate;
/* Parses the nested HFSC attributes in 'nl_options' into 'class'.
 *
 * Three service curves (RSC, FSC and USC) are read.  A warning is logged when
 * any curve has a nonzero 'm1' or 'd', when the RSC and FSC 'm2' values differ,
 * or when the RSC 'm2' exceeds the USC 'm2'.  Otherwise min_rate is taken from
 * the FSC 'm2' and max_rate from the USC 'm2'. */
2955 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2957 const struct tc_service_curve *rsc, *fsc, *usc;
2958 static const struct nl_policy tca_hfsc_policy[] = {
2960 .type = NL_A_UNSPEC,
2962 .min_len = sizeof(struct tc_service_curve),
2965 .type = NL_A_UNSPEC,
2967 .min_len = sizeof(struct tc_service_curve),
2970 .type = NL_A_UNSPEC,
2972 .min_len = sizeof(struct tc_service_curve),
2975 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
2977 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
2978 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
2979 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
2983 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
2984 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
2985 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
/* Only purely linear curves (m1 == 0 and d == 0) are understood. */
2987 if (rsc->m1 != 0 || rsc->d != 0 ||
2988 fsc->m1 != 0 || fsc->d != 0 ||
2989 usc->m1 != 0 || usc->d != 0) {
2990 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2991 "Non-linear service curves are not supported.");
/* hfsc_setup_class__() sends the same curve as RSC and FSC, so a class whose
 * two curves differ was not configured by this code. */
2995 if (rsc->m2 != fsc->m2) {
2996 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2997 "Real-time service curves are not supported ");
3001 if (rsc->m2 > usc->m2) {
3002 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3003 "Min-rate service curve is greater than "
3004 "the max-rate service curve.");
3008 class->min_rate = fsc->m2;
3009 class->max_rate = usc->m2;
/* Parses the class described by 'tcmsg' with tc_parse_class().  A class handle
 * of the form 1:N with 1 <= N <= HFSC_N_QUEUES is reported through 'queue_id'
 * as queue N - 1, and the class's HFSC options are parsed into 'options'.
 * 'stats' is handed straight to tc_parse_class(). */
3014 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3015 struct hfsc_class *options,
3016 struct netdev_queue_stats *stats)
3019 unsigned int handle;
3020 struct nlattr *nl_options;
3022 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3028 unsigned int major, minor;
3030 major = tc_get_major(handle);
3031 minor = tc_get_minor(handle);
3032 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
/* Minor numbers are 1-based; queue IDs are 0-based. */
3033 *queue_id = minor - 1;
3040 error = hfsc_parse_tca_options__(nl_options, options);
/* Asks the kernel about class 'handle' (with parent 'parent') on 'netdev' and
 * parses the reply into 'options' and 'stats' with hfsc_parse_tcmsg__().  The
 * reply buffer is deleted here. */
3047 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3048 unsigned int parent, struct hfsc_class *options,
3049 struct netdev_queue_stats *stats)
3052 struct ofpbuf *reply;
3054 error = tc_query_class(netdev, handle, parent, &reply);
3059 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3060 ofpbuf_delete(reply);
3065 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
3066 struct hfsc_class *class)
3069 const char *max_rate_s;
3071 max_rate_s = shash_find_data(details, "max-rate");
3072 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3077 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
3078 max_rate = netdev_features_to_bps(current) / 8;
3081 class->min_rate = max_rate;
3082 class->max_rate = max_rate;
/* Parses the per-queue 'details' ("min-rate" and "max-rate") for 'netdev' into
 * 'class'.  The rate strings are divided by 8 before use.  min_rate is clamped
 * to [1, qdisc max_rate] and max_rate to [min_rate, qdisc max_rate]. */
3086 hfsc_parse_class_details__(struct netdev *netdev,
3087 const struct shash *details,
3088 struct hfsc_class * class)
3090 const struct hfsc *hfsc;
3091 uint32_t min_rate, max_rate;
3092 const char *min_rate_s, *max_rate_s;
3094 hfsc = hfsc_get__(netdev);
3095 min_rate_s = shash_find_data(details, "min-rate");
3096 max_rate_s = shash_find_data(details, "max-rate");
3098 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3099 min_rate = MAX(min_rate, 1);
3100 min_rate = MIN(min_rate, hfsc->max_rate);
3102 max_rate = (max_rate_s
3103 ? strtoull(max_rate_s, NULL, 10) / 8
3105 max_rate = MAX(max_rate, min_rate);
3106 max_rate = MIN(max_rate, hfsc->max_rate);
3108 class->min_rate = min_rate;
3109 class->max_rate = max_rate;
3114 /* Create an HFSC qdisc.
3116 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3118 hfsc_setup_qdisc__(struct netdev * netdev)
3120 struct tcmsg *tcmsg;
3121 struct ofpbuf request;
3122 struct tc_hfsc_qopt opt;
/* Remove whatever root qdisc is installed before adding the new one. */
3124 tc_del_qdisc(netdev);
/* NLM_F_EXCL | NLM_F_CREATE: create the qdisc, and fail rather than replace
 * one that already exists. */
3126 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3127 NLM_F_EXCL | NLM_F_CREATE, &request);
3133 tcmsg->tcm_handle = tc_make_handle(1, 0);
3134 tcmsg->tcm_parent = TC_H_ROOT;
3136 memset(&opt, 0, sizeof opt);
3139 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3140 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3142 return tc_transact(&request, NULL);
3145 /* Create an HFSC class.
3147 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3148 * sc rate <min_rate> ul rate <max_rate>" */
3150 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3151 unsigned int parent, struct hfsc_class *class)
3155 struct tcmsg *tcmsg;
3156 struct ofpbuf request;
3157 struct tc_service_curve min, max;
3159 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3165 tcmsg->tcm_handle = handle;
3166 tcmsg->tcm_parent = parent;
/* Only the 'm2' slope of each curve carries a rate. */
3170 min.m2 = class->min_rate;
3174 max.m2 = class->max_rate;
/* The 'min' curve is sent as both the RSC and the FSC attribute; the 'max'
 * curve is sent as the USC attribute. */
3176 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3177 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3178 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3179 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3180 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3181 nl_msg_end_nested(&request, opt_offset);
3183 error = tc_transact(&request, NULL);
3185 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3186 "min-rate %ubps, max-rate %ubps (%s)",
3187 netdev_get_name(netdev),
3188 tc_get_major(handle), tc_get_minor(handle),
3189 tc_get_major(parent), tc_get_minor(parent),
3190 class->min_rate, class->max_rate, strerror(error));
/* Installs HFSC on 'netdev': sets up the qdisc, creates class 1:fffe under
 * 1:0 from the qdisc-level 'details', and records the resulting max_rate
 * through hfsc_install__(). */
3197 hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3200 struct hfsc_class class;
3202 error = hfsc_setup_qdisc__(netdev);
3208 hfsc_parse_qdisc_details__(netdev, details, &class);
3209 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3210 tc_make_handle(1, 0), &class);
3216 hfsc_install__(netdev, class.max_rate);
/* Loads the HFSC configuration that already exists in the kernel for
 * 'netdev': queries class 1:fffe for the qdisc max_rate, installs the HFSC
 * state, then dumps the classes and records every one that parses as a queue.
 * 'nlmsg' is unused. */
3221 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3224 struct nl_dump dump;
3225 struct hfsc_class hc;
3228 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3229 hfsc_install__(netdev, hc.max_rate);
3231 if (!start_queue_dump(netdev, &dump)) {
3235 while (nl_dump_next(&dump, &msg)) {
3236 unsigned int queue_id;
/* Messages that fail to parse are skipped rather than aborting the load. */
3238 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3239 hfsc_update_queue__(netdev, queue_id, &hc);
3243 nl_dump_done(&dump);
/* Tears down the HFSC state that embeds 'tc'.  Every queue is unlinked from
 * the queue hmap; the _SAFE iterator is needed because entries are removed
 * while iterating. */
3248 hfsc_tc_destroy(struct tc *tc)
3251 struct hfsc_class *hc, *next;
3253 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3255 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3256 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Adds "max-rate" to 'details': the qdisc's stored max_rate multiplied by 8,
 * formatted as a decimal string. */
3265 hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3267 const struct hfsc *hfsc;
3268 hfsc = hfsc_get__(netdev);
3269 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
/* Reconfigures the qdisc-level settings of 'netdev' from 'details' by
 * re-sending class 1:fffe (parent 1:0) and caching the new max_rate in the
 * HFSC state. */
3274 hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3277 struct hfsc_class class;
3279 hfsc_parse_qdisc_details__(netdev, details, &class);
3280 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3281 tc_make_handle(1, 0), &class);
3284 hfsc_get__(netdev)->max_rate = class.max_rate;
/* Describes 'queue' in 'details'.  Stored rates are multiplied by 8 on output.
 * "max-rate" is only reported when it differs from "min-rate".  'netdev' is
 * unused. */
3291 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3292 const struct tc_queue *queue, struct shash *details)
3294 const struct hfsc_class *hc;
3296 hc = hfsc_class_cast__(queue);
3297 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3298 if (hc->min_rate != hc->max_rate) {
3299 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
/* Configures queue 'queue_id' on 'netdev' from 'details': parses them, sends
 * class 1:(queue_id + 1) with parent 1:fffe to the kernel, and then mirrors the
 * settings into the local queue table. */
3305 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3306 const struct shash *details)
3309 struct hfsc_class class;
3311 error = hfsc_parse_class_details__(netdev, details, &class);
3316 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3317 tc_make_handle(1, 0xfffe), &class);
3322 hfsc_update_queue__(netdev, queue_id, &class);
/* Deletes the kernel class 1:(queue_id + 1) that backs 'queue' and unlinks the
 * queue from 'netdev''s HFSC queue table. */
3327 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3331 struct hfsc_class *hc;
3333 hc = hfsc_class_cast__(queue);
3334 hfsc = hfsc_get__(netdev);
3336 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3338 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches statistics for 'queue' into 'stats' by querying class
 * 1:(queue_id + 1) under parent 1:fffe.  No class options are requested. */
3345 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3346 struct netdev_queue_stats *stats)
3348 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3349 tc_make_handle(1, 0xfffe), NULL, stats);
/* Parses the class statistics in 'nlmsg' and, when the class handle is 1:N
 * with 1 <= N <= HFSC_N_QUEUES, invokes 'cb' with queue ID N - 1, the parsed
 * statistics and 'aux'.  'netdev' is unused. */
3353 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3354 const struct ofpbuf *nlmsg,
3355 netdev_dump_queue_stats_cb *cb, void *aux)
3357 struct netdev_queue_stats stats;
3358 unsigned int handle, major, minor;
3361 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3366 major = tc_get_major(handle);
3367 minor = tc_get_minor(handle);
3368 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3369 (*cb)(minor - 1, &stats, aux);
/* Operations table for the "linux-hfsc" QoS type; every hook is provided. */
3374 static const struct tc_ops tc_ops_hfsc = {
3375 "hfsc", /* linux_name */
3376 "linux-hfsc", /* ovs_name */
3377 HFSC_N_QUEUES, /* n_queues */
3378 hfsc_tc_install, /* tc_install */
3379 hfsc_tc_load, /* tc_load */
3380 hfsc_tc_destroy, /* tc_destroy */
3381 hfsc_qdisc_get, /* qdisc_get */
3382 hfsc_qdisc_set, /* qdisc_set */
3383 hfsc_class_get, /* class_get */
3384 hfsc_class_set, /* class_set */
3385 hfsc_class_delete, /* class_delete */
3386 hfsc_class_get_stats, /* class_get_stats */
3387 hfsc_class_dump_stats /* class_dump_stats */
3390 /* "linux-default" traffic control class.
3392 * This class represents the default, unnamed Linux qdisc. It corresponds to
3393 * the "" (empty string) QoS type in the OVS database. */
/* Attaches the tc state for the default qdisc to 'netdev''s device.  'tc' has
 * static storage duration, so its value persists across calls.
 * NOTE(review): presumably the allocation below runs only once and the same
 * object is shared by every device -- confirm against the guard around it. */
3396 default_install__(struct netdev *netdev)
3398 struct netdev_dev_linux *netdev_dev =
3399 netdev_dev_linux_cast(netdev_get_dev(netdev));
3400 static struct tc *tc;
3403 tc = xmalloc(sizeof *tc);
3404 tc_init(tc, &tc_ops_default);
3406 netdev_dev->tc = tc;
/* tc_install hook for the default qdisc: there are no 'details' to apply, so
 * this only attaches the default tc state to 'netdev'. */
3410 default_tc_install(struct netdev *netdev,
3411 const struct shash *details OVS_UNUSED)
3413 default_install__(netdev);
/* tc_load hook for the default qdisc: 'nlmsg' is ignored and the default tc
 * state is attached to 'netdev'. */
3418 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3420 default_install__(netdev);
/* Operations table for the default qdisc.  It has no Linux qdisc name, and
 * every hook from tc_destroy through class_dump_stats is absent. */
3424 static const struct tc_ops tc_ops_default = {
3425 NULL, /* linux_name */
3430 NULL, /* tc_destroy */
3431 NULL, /* qdisc_get */
3432 NULL, /* qdisc_set */
3433 NULL, /* class_get */
3434 NULL, /* class_set */
3435 NULL, /* class_delete */
3436 NULL, /* class_get_stats */
3437 NULL /* class_dump_stats */
3440 /* "linux-other" traffic control class.
3445 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
/* Attaches the "linux-other" tc state to 'netdev''s device; 'nlmsg' is
 * ignored.  'tc' has static storage duration, so its value persists across
 * calls.
 * NOTE(review): presumably the allocation below runs only once and the same
 * object is shared by every device -- confirm against the guard around it. */
3447 struct netdev_dev_linux *netdev_dev =
3448 netdev_dev_linux_cast(netdev_get_dev(netdev));
3449 static struct tc *tc;
3452 tc = xmalloc(sizeof *tc);
3453 tc_init(tc, &tc_ops_other);
3455 netdev_dev->tc = tc;
/* Operations table for the "linux-other" QoS type.  It has no Linux qdisc
 * name and no tc_install hook, and every hook from tc_destroy through
 * class_dump_stats is absent. */
3459 static const struct tc_ops tc_ops_other = {
3460 NULL, /* linux_name */
3461 "linux-other", /* ovs_name */
3463 NULL, /* tc_install */
3465 NULL, /* tc_destroy */
3466 NULL, /* qdisc_get */
3467 NULL, /* qdisc_set */
3468 NULL, /* class_get */
3469 NULL, /* class_set */
3470 NULL, /* class_delete */
3471 NULL, /* class_get_stats */
3472 NULL /* class_dump_stats */
3475 /* Traffic control. */
3477 /* Number of kernel "tc" ticks per second. */
3478 static double ticks_per_s;
3480 /* Number of kernel "jiffies" per second. This is used for the purpose of
3481 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3482 * one jiffy's worth of data.
3484 * There are two possibilities here:
3486 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3487 * approximate range of 100 to 1024. That means that we really need to
3488 * make sure that the qdisc can buffer that much data.
3490 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3491 * has finely granular timers and there's no need to fudge additional room
3492 * for buffers. (There's no extra effort needed to implement that: the
3493 * large 'buffer_hz' is used as a divisor, so practically any number will
3494 * come out as 0 in the division. Small integer results in the case of
3495 * really high dividends won't have any real effect anyhow.)
3497 static unsigned int buffer_hz;
/* Builds the tc handle 'major':'minor': 'major' is placed in the upper 16 bits
 * and 'minor' in the lower 16 bits. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int major_bits = major << 16;

    return TC_H_MAKE(major_bits, minor);
}
/* Extracts the major number (the upper 16 bits) from 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
/* Extracts the minor number from 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor = TC_H_MIN(handle);

    return minor;
}
/* Starts a traffic-control Netlink request of the given 'type' for 'netdev'
 * in 'request'.  The buffer is initialized here, an nlmsghdr carrying
 * NLM_F_REQUEST | 'flags' is added, and a zeroed struct tcmsg is appended with
 * its family and ifindex filled in.  Returns a pointer to that tcmsg so that
 * the caller can set the handle and parent. */
3520 static struct tcmsg *
3521 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3522 struct ofpbuf *request)
3524 struct tcmsg *tcmsg;
3528 error = get_ifindex(netdev, &ifindex);
3533 ofpbuf_init(request, 512);
3534 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3535 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3536 tcmsg->tcm_family = AF_UNSPEC;
3537 tcmsg->tcm_ifindex = ifindex;
3538 /* Caller should fill in tcmsg->tcm_handle. */
3539 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on rtnl_sock, passing 'replyp' through to
 * nl_sock_transact().  'request' is uninitialized here regardless of the
 * outcome, so callers must not touch it afterwards. */
3545 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3547 int error = nl_sock_transact(rtnl_sock, request, replyp);
3548 ofpbuf_uninit(request);
3555 /* The values in psched are not individually very meaningful, but they are
3556 * important. The tables below show some values seen in the wild.
3560 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3561 * (Before that, there are hints that it was 1000000000.)
3563 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3567 * -----------------------------------
3568 * [1] 000c8000 000f4240 000f4240 00000064
3569 * [2] 000003e8 00000400 000f4240 3b9aca00
3570 * [3] 000003e8 00000400 000f4240 3b9aca00
3571 * [4] 000003e8 00000400 000f4240 00000064
3572 * [5] 000003e8 00000040 000f4240 3b9aca00
3573 * [6] 000003e8 00000040 000f4240 000000f9
3575 * a b c d ticks_per_s buffer_hz
3576 * ------- --------- ---------- ------------- ----------- -------------
3577 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3578 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3579 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3580 * [4] 1,000 1,024 1,000,000 100 976,562 100
3581 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3582 * [6] 1,000 64 1,000,000 249 15,625,000 249
3584 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3585 * [2] 2.6.26-1-686-bigmem from Debian lenny
3586 * [3] 2.6.26-2-sparc64 from Debian lenny
3587 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3588 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3589 * [6] 2.6.34 from kernel.org on KVM
3591 static const char fn[] = "/proc/net/psched";
3592 unsigned int a, b, c, d;
3598 stream = fopen(fn, "r");
3600 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3604 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3605 VLOG_WARN("%s: read failed", fn);
3609 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3613 VLOG_WARN("%s: invalid scheduler parameters", fn);
3617 ticks_per_s = (double) a * c / b;
3621 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3624 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3627 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3628 * rate of 'rate' bytes per second. */
3630 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3635 return (rate * ticks) / ticks_per_s;
3638 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3639 * rate of 'rate' bytes per second. */
3641 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
/* A 'rate' of 0 yields 0 instead of dividing by zero.  ticks_per_s is
 * converted to an integer before the multiplication, so any fractional part
 * is dropped. */
3646 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3649 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3650 * a transmission rate of 'rate' bytes per second. */
3652 tc_buffer_per_jiffy(unsigned int rate)
/* NOTE(review): this divides by buffer_hz -- confirm that it can never be 0
 * by the time this line runs. */
3657 return rate / buffer_hz;
3660 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3661 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3662 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3663 * stores NULL into it if it is absent.
3665 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3668 * Returns 0 if successful, otherwise a positive errno value. */
3670 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3671 struct nlattr **options)
/* TCA_KIND is mandatory; TCA_OPTIONS may be missing. */
3673 static const struct nl_policy tca_policy[] = {
3674 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3675 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3677 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
/* Attributes start after the Netlink header and the struct tcmsg. */
3679 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3680 tca_policy, ta, ARRAY_SIZE(ta))) {
3681 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3686 *kind = nl_attr_get_string(ta[TCA_KIND]);
3690 *options = ta[TCA_OPTIONS];
3705 /* Given Netlink 'msg' that describes a class, extracts its handle (class ID)
3706 * into '*handlep', its TCA_OPTIONS attribute
3707 * into '*options', and its queue statistics into '*stats'. Any of the output
3708 * arguments may be null.
3710 * Returns 0 if successful, otherwise a positive errno value. */
3712 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3713 struct nlattr **options, struct netdev_queue_stats *stats)
/* Both TCA_OPTIONS and TCA_STATS2 must be present in a class message. */
3715 static const struct nl_policy tca_policy[] = {
3716 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3717 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3719 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
/* Attributes start after the Netlink header and the struct tcmsg. */
3721 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3722 tca_policy, ta, ARRAY_SIZE(ta))) {
3723 VLOG_WARN_RL(&rl, "failed to parse class message");
3728 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3729 *handlep = tc->tcm_handle;
3733 *options = ta[TCA_OPTIONS];
3737 const struct gnet_stats_queue *gsq;
3738 struct gnet_stats_basic gsb;
3740 static const struct nl_policy stats_policy[] = {
3741 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3742 .min_len = sizeof gsb },
3743 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3744 .min_len = sizeof *gsq },
3746 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3748 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3749 sa, ARRAY_SIZE(sa))) {
3750 VLOG_WARN_RL(&rl, "failed to parse class stats");
3754 /* Alignment issues screw up the length of struct gnet_stats_basic on
3755 * some arch/bitsize combinations. Newer versions of Linux have a
3756 * struct gnet_stats_basic_packed, but we can't depend on that. The
3757 * easiest thing to do is just to make a copy. */
3758 memset(&gsb, 0, sizeof gsb);
3759 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3760 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3761 stats->tx_bytes = gsb.bytes;
3762 stats->tx_packets = gsb.packets;
/* Queue drops are reported as transmit errors. */
3764 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3765 stats->tx_errors = gsq->drops;
3775 memset(stats, 0, sizeof *stats);
3780 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3783 tc_query_class(const struct netdev *netdev,
3784 unsigned int handle, unsigned int parent,
3785 struct ofpbuf **replyp)
3787 struct ofpbuf request;
3788 struct tcmsg *tcmsg;
/* Build an RTM_GETTCLASS request (with NLM_F_ECHO) for the class identified
 * by 'handle' and 'parent'; the reply is returned through 'replyp'. */
3791 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3795 tcmsg->tcm_handle = handle;
3796 tcmsg->tcm_parent = parent;
3798 error = tc_transact(&request, replyp);
3800 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3801 netdev_get_name(netdev),
3802 tc_get_major(handle), tc_get_minor(handle),
3803 tc_get_major(parent), tc_get_minor(parent),
3809 /* Equivalent to "tc class del dev <name> handle <handle>". */
3811 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3813 struct ofpbuf request;
3814 struct tcmsg *tcmsg;
/* RTM_DELTCLASS with no extra flags; the class is identified by its handle
 * alone and the parent is left at 0. */
3817 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3821 tcmsg->tcm_handle = handle;
3822 tcmsg->tcm_parent = 0;
3824 error = tc_transact(&request, NULL);
3826 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3827 netdev_get_name(netdev),
3828 tc_get_major(handle), tc_get_minor(handle),
3834 /* Equivalent to "tc qdisc del dev <name> root". */
3836 tc_del_qdisc(struct netdev *netdev)
3838 struct netdev_dev_linux *netdev_dev =
3839 netdev_dev_linux_cast(netdev_get_dev(netdev));
3840 struct ofpbuf request;
3841 struct tcmsg *tcmsg;
3844 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
3848 tcmsg->tcm_handle = tc_make_handle(1, 0);
3849 tcmsg->tcm_parent = TC_H_ROOT;
3851 error = tc_transact(&request, NULL);
3852 if (error == EINVAL) {
3853 /* EINVAL probably means that the default qdisc was in use, in which
3854 * case we've accomplished our purpose. */
/* With the kernel qdisc gone, drop the cached tc state too: run its
 * tc_destroy hook if it has one, then forget the pointer. */
3857 if (!error && netdev_dev->tc) {
3858 if (netdev_dev->tc->ops->tc_destroy) {
3859 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3861 netdev_dev->tc = NULL;
3866 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3867 * kernel to determine what they are. Returns 0 if successful, otherwise a
3868 * positive errno value. */
3870 tc_query_qdisc(const struct netdev *netdev)
3872 struct netdev_dev_linux *netdev_dev =
3873 netdev_dev_linux_cast(netdev_get_dev(netdev));
3874 struct ofpbuf request, *qdisc;
3875 const struct tc_ops *ops;
3876 struct tcmsg *tcmsg;
3880 if (netdev_dev->tc) {
3884 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3885 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3886 * 2.6.35 without that fix backported to it.
3888 * To avoid the OOPS, we must not make a request that would attempt to dump
3889 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3890 * few others. There are a few ways that I can see to do this, but most of
3891 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3892 * technique chosen here is to assume that any non-default qdisc that we
3893 * create will have a class with handle 1:0. The built-in qdiscs only have
3894 * a class with handle 0:0.
3896 * We could check for Linux 2.6.35+ and use a more straightforward method
3898 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
3902 tcmsg->tcm_handle = tc_make_handle(1, 0);
3903 tcmsg->tcm_parent = 0;
3905 /* Figure out what tc class to instantiate. */
3906 error = tc_transact(&request, &qdisc);
3910 error = tc_parse_qdisc(qdisc, &kind, NULL);
3912 ops = &tc_ops_other;
3914 ops = tc_lookup_linux_name(kind);
3916 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3917 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3919 ops = &tc_ops_other;
3922 } else if (error == ENOENT) {
3923 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3924 * other entity that doesn't have a handle 1:0. We will assume
3925 * that it's the system default qdisc. */
3926 ops = &tc_ops_default;
3929 /* Who knows? Maybe the device got deleted. */
3930 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3931 netdev_get_name(netdev), strerror(error));
3932 ops = &tc_ops_other;
3935 /* Instantiate it. */
/* The cast drops const because tc_load takes a non-const netdev.  The
 * assertion checks that tc_load attached tc state exactly when it reported
 * success. */
3936 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
3937 assert((load_error == 0) == (netdev_dev->tc != NULL));
3938 ofpbuf_delete(qdisc);
/* A query error takes precedence over a load error. */
3940 return error ? error : load_error;
3943 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3944 approximate the time to transmit packets of various lengths. For an MTU of
3945 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3946 represents two possible packet lengths; for a MTU of 513 through 1024, four
3947 possible lengths; and so on.
3949 Returns, for the specified 'mtu', the number of bits that packet lengths
3950 need to be shifted right to fit within such a 256-entry table. */
3952 tc_calc_cell_log(unsigned int mtu)
3957 mtu = ETH_PAYLOAD_MAX;
3959 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
3961 for (cell_log = 0; mtu >= 256; cell_log++) {
3968 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
3971 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
3973 memset(rate, 0, sizeof *rate);
3974 rate->cell_log = tc_calc_cell_log(mtu);
3975 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3976 /* rate->cell_align = 0; */ /* distro headers. */
3977 rate->mpu = ETH_TOTAL_MIN;
3981 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
3982 * attribute of the specified "type".
3984 * See tc_calc_cell_log() above for a description of "rtab"s. */
3986 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
3991 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
3992 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
3993 unsigned packet_size = (i + 1) << rate->cell_log;
3994 if (packet_size < rate->mpu) {
3995 packet_size = rate->mpu;
3997 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* Computes the value to use for 'buffer' or 'cbuffer' in HTB options, given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and the burst size
 * 'burst_bytes' requested by the user (0 is fine if none was requested).
 *
 * The burst actually used is never smaller than tc_buffer_per_jiffy(Bps) plus
 * 'mtu'.  The result is that burst converted by tc_bytes_to_ticks(). */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int floor_bytes = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t effective_burst = burst_bytes;

    if (!(effective_burst > floor_bytes)) {
        effective_burst = floor_bytes;
    }
    return tc_bytes_to_ticks(Bps, effective_burst);
}
4012 /* Public utility functions. */
/* Copies each statistics counter named below from '*src' to '*dst'.
 *
 * The expansion refers to variables literally named 'src' and 'dst', so both
 * must be in scope wherever the macro is used, and both pointed-to types must
 * have every member listed here.
 *
 * The assignments are wrapped in do { } while (0) so that the macro expands
 * to exactly one statement.  It is still written as "COPY_NETDEV_STATS;", but
 * it now also behaves as expected as the body of an unbraced 'if' or loop. */
#define COPY_NETDEV_STATS                                   \
    do {                                                    \
        dst->rx_packets = src->rx_packets;                  \
        dst->tx_packets = src->tx_packets;                  \
        dst->rx_bytes = src->rx_bytes;                      \
        dst->tx_bytes = src->tx_bytes;                      \
        dst->rx_errors = src->rx_errors;                    \
        dst->tx_errors = src->tx_errors;                    \
        dst->rx_dropped = src->rx_dropped;                  \
        dst->tx_dropped = src->tx_dropped;                  \
        dst->multicast = src->multicast;                    \
        dst->collisions = src->collisions;                  \
        dst->rx_length_errors = src->rx_length_errors;      \
        dst->rx_over_errors = src->rx_over_errors;          \
        dst->rx_crc_errors = src->rx_crc_errors;            \
        dst->rx_frame_errors = src->rx_frame_errors;        \
        dst->rx_fifo_errors = src->rx_fifo_errors;          \
        dst->rx_missed_errors = src->rx_missed_errors;      \
        dst->tx_aborted_errors = src->tx_aborted_errors;    \
        dst->tx_carrier_errors = src->tx_carrier_errors;    \
        dst->tx_fifo_errors = src->tx_fifo_errors;          \
        dst->tx_heartbeat_errors = src->tx_heartbeat_errors; \
        dst->tx_window_errors = src->tx_window_errors;      \
    } while (0)
/* Fills in the counters of '*dst' from the rtnetlink link statistics in
 * '*src'.
 *
 * Exactly the members named in COPY_NETDEV_STATS are copied, one by one; the
 * member-wise assignment is what converts between the two structure
 * layouts. */
void
netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
                                  const struct rtnl_link_stats *src)
{
    COPY_NETDEV_STATS;
}
/* Fills in the counters of '*dst' from the 64-bit rtnetlink link statistics
 * in '*src'.
 *
 * Exactly the members named in COPY_NETDEV_STATS are copied, one by one; the
 * member-wise assignment is what converts between the two structure
 * layouts. */
void
netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
                                    const struct rtnl_link_stats64 *src)
{
    COPY_NETDEV_STATS;
}
4053 /* Copies 'src' into 'dst', performing format conversion in the process. */
4055 netdev_stats_to_rtnl_link_stats64(struct rtnl_link_stats64 *dst,
4056 const struct netdev_stats *src)
4059 dst->rx_compressed = 0;
4060 dst->tx_compressed = 0;
4063 /* Utility functions. */
4066 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4068 /* Policy for RTNLGRP_LINK messages.
4070 * There are *many* more fields in these messages, but currently we only
4071 * care about these fields. */
4072 static const struct nl_policy rtnlgrp_link_policy[] = {
4073 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4074 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4075 .min_len = sizeof(struct rtnl_link_stats) },
4078 struct ofpbuf request;
4079 struct ofpbuf *reply;
4080 struct ifinfomsg *ifi;
4081 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
4084 ofpbuf_init(&request, 0);
4085 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4086 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4087 ifi->ifi_family = PF_UNSPEC;
4088 ifi->ifi_index = ifindex;
4089 error = nl_sock_transact(rtnl_sock, &request, &reply);
4090 ofpbuf_uninit(&request);
4095 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4096 rtnlgrp_link_policy,
4097 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4098 ofpbuf_delete(reply);
4102 if (!attrs[IFLA_STATS]) {
4103 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4104 ofpbuf_delete(reply);
4108 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4110 ofpbuf_delete(reply);
4116 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4118 static const char fn[] = "/proc/net/dev";
4123 stream = fopen(fn, "r");
4125 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4130 while (fgets(line, sizeof line, stream)) {
4133 #define X64 "%"SCNu64
4136 X64 X64 X64 X64 X64 X64 X64 "%*u"
4137 X64 X64 X64 X64 X64 X64 X64 "%*u",
4143 &stats->rx_fifo_errors,
4144 &stats->rx_frame_errors,
4150 &stats->tx_fifo_errors,
4152 &stats->tx_carrier_errors) != 15) {
4153 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4154 } else if (!strcmp(devname, netdev_name)) {
4155 stats->rx_length_errors = UINT64_MAX;
4156 stats->rx_over_errors = UINT64_MAX;
4157 stats->rx_crc_errors = UINT64_MAX;
4158 stats->rx_missed_errors = UINT64_MAX;
4159 stats->tx_aborted_errors = UINT64_MAX;
4160 stats->tx_heartbeat_errors = UINT64_MAX;
4161 stats->tx_window_errors = UINT64_MAX;
4167 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
4173 get_flags(const struct netdev *netdev, int *flags)
4178 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
4180 *flags = ifr.ifr_flags;
4185 set_flags(struct netdev *netdev, int flags)
4189 ifr.ifr_flags = flags;
4190 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
4195 do_get_ifindex(const char *netdev_name)
4199 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4200 COVERAGE_INC(netdev_get_ifindex);
4201 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4202 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4203 netdev_name, strerror(errno));
4206 return ifr.ifr_ifindex;
4210 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4212 struct netdev_dev_linux *netdev_dev =
4213 netdev_dev_linux_cast(netdev_get_dev(netdev_));
4215 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4216 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4220 netdev_dev->cache_valid |= VALID_IFINDEX;
4221 netdev_dev->ifindex = ifindex;
4223 *ifindexp = netdev_dev->ifindex;
4228 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4233 memset(&ifr, 0, sizeof ifr);
4234 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4235 COVERAGE_INC(netdev_get_hwaddr);
4236 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4237 VLOG_ERR("ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4238 netdev_name, strerror(errno));
4241 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4242 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4243 VLOG_WARN("%s device has unknown hardware address family %d",
4244 netdev_name, hwaddr_family);
4246 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4251 set_etheraddr(const char *netdev_name, int hwaddr_family,
4252 const uint8_t mac[ETH_ADDR_LEN])
4256 memset(&ifr, 0, sizeof ifr);
4257 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4258 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4259 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4260 COVERAGE_INC(netdev_set_hwaddr);
4261 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4262 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4263 netdev_name, strerror(errno));
4270 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4271 int cmd, const char *cmd_name)
4275 memset(&ifr, 0, sizeof ifr);
4276 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4277 ifr.ifr_data = (caddr_t) ecmd;
4280 COVERAGE_INC(netdev_ethtool);
4281 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4284 if (errno != EOPNOTSUPP) {
4285 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4286 "failed: %s", cmd_name, name, strerror(errno));
4288 /* The device doesn't support this operation. That's pretty
4289 * common, so there's no point in logging anything. */
4296 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4297 const char *cmd_name)
4299 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4300 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4301 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4309 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4310 int cmd, const char *cmd_name)
4315 ifr.ifr_addr.sa_family = AF_INET;
4316 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4318 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4319 *ip = sin->sin_addr;
4324 /* Returns an AF_PACKET raw socket or a negative errno value. */
4326 af_packet_sock(void)
4328 static int sock = INT_MIN;
4330 if (sock == INT_MIN) {
4331 sock = socket(AF_PACKET, SOCK_RAW, 0);
4333 set_nonblocking(sock);
4336 VLOG_ERR("failed to create packet socket: %s", strerror(errno));