2 * Copyright (c) 2009, 2010, 2011 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
24 #include <arpa/inet.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_sched.h>
33 #include <linux/rtnetlink.h>
34 #include <linux/sockios.h>
35 #include <linux/version.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <netpacket/packet.h>
40 #include <net/ethernet.h>
42 #include <linux/if_tunnel.h>
43 #include <net/if_arp.h>
44 #include <net/if_packet.h>
45 #include <net/route.h>
46 #include <netinet/in.h>
53 #include "dpif-linux.h"
54 #include "dynamic-string.h"
55 #include "fatal-signal.h"
58 #include "netdev-provider.h"
59 #include "netdev-vport.h"
61 #include "netlink-socket.h"
63 #include "openflow/openflow.h"
65 #include "poll-loop.h"
66 #include "rtnetlink.h"
67 #include "rtnetlink-link.h"
68 #include "socket-util.h"
74 VLOG_DEFINE_THIS_MODULE(netdev_linux);
76 COVERAGE_DEFINE(netdev_get_vlan_vid);
77 COVERAGE_DEFINE(netdev_set_policing);
78 COVERAGE_DEFINE(netdev_arp_lookup);
79 COVERAGE_DEFINE(netdev_get_ifindex);
80 COVERAGE_DEFINE(netdev_get_hwaddr);
81 COVERAGE_DEFINE(netdev_set_hwaddr);
82 COVERAGE_DEFINE(netdev_ethtool);
84 /* These were introduced in Linux 2.6.14, so they might be missing if we have
86 #ifndef ADVERTISED_Pause
87 #define ADVERTISED_Pause (1 << 13)
89 #ifndef ADVERTISED_Asym_Pause
90 #define ADVERTISED_Asym_Pause (1 << 14)
93 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
96 #define TC_RTAB_SIZE 1024
99 static struct rtnetlink_notifier netdev_linux_cache_notifier;
100 static int cache_notifier_refcount;
103 VALID_IFINDEX = 1 << 0,
104 VALID_ETHERADDR = 1 << 1,
108 VALID_CARRIER = 1 << 5,
109 VALID_IS_PSEUDO = 1 << 6, /* Represents is_internal and is_tap. */
110 VALID_POLICING = 1 << 7,
111 VALID_HAVE_VPORT_STATS = 1 << 8
119 /* Traffic control. */
121 /* An instance of a traffic control class. Always associated with a particular
124 * Each TC implementation subclasses this with whatever additional data it
127 const struct tc_ops *ops;
128 struct hmap queues; /* Contains "struct tc_queue"s.
129 * Read by generic TC layer.
130 * Written only by TC implementation. */
133 /* One traffic control queue.
135 * Each TC implementation subclasses this with whatever additional data it
138 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
139 unsigned int queue_id; /* OpenFlow queue ID. */
142 /* A particular kind of traffic control. Each implementation generally maps to
143 * one particular Linux qdisc class.
145 * The functions below return 0 if successful or a positive errno value on
146 * failure, except where otherwise noted. All of them must be provided, except
147 * where otherwise noted. */
149 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
150 * This is null for tc_ops_default and tc_ops_other, for which there are no
151 * appropriate values. */
152 const char *linux_name;
154 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
155 const char *ovs_name;
157 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
158 * queues. The queues are numbered 0 through n_queues - 1. */
159 unsigned int n_queues;
161 /* Called to install this TC class on 'netdev'. The implementation should
162 * make the Netlink calls required to set up 'netdev' with the right qdisc
163 * and configure it according to 'details'. The implementation may assume
164 * that the current qdisc is the default; that is, there is no need for it
165 * to delete the current qdisc before installing itself.
167 * The contents of 'details' should be documented as valid for 'ovs_name'
168 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
169 * (which is built as ovs-vswitchd.conf.db(8)).
171 * This function must return 0 if and only if it sets 'netdev->tc' to an
172 * initialized 'struct tc'.
174 * (This function is null for tc_ops_other, which cannot be installed. For
175 * other TC classes it should always be nonnull.) */
176 int (*tc_install)(struct netdev *netdev, const struct shash *details);
178 /* Called when the netdev code determines (through a Netlink query) that
179 * this TC class's qdisc is installed on 'netdev', but we didn't install
180 * it ourselves and so don't know any of the details.
182 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
183 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
184 * implementation should parse the other attributes of 'nlmsg' as
185 * necessary to determine its configuration. If necessary it should also
186 * use Netlink queries to determine the configuration of queues on
189 * This function must return 0 if and only if it sets 'netdev->tc' to an
190 * initialized 'struct tc'. */
191 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
193 /* Destroys the data structures allocated by the implementation as part of
194 * 'tc'. (This includes destroying 'tc->queues' by calling
197 * The implementation should not need to perform any Netlink calls. If
198 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
199 * (But it may not be desirable.)
201 * This function may be null if 'tc' is trivial. */
202 void (*tc_destroy)(struct tc *tc);
204 /* Retrieves details of 'netdev->tc' configuration into 'details'.
206 * The implementation should not need to perform any Netlink calls, because
207 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
208 * cached the configuration.
210 * The contents of 'details' should be documented as valid for 'ovs_name'
211 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
212 * (which is built as ovs-vswitchd.conf.db(8)).
214 * This function may be null if 'tc' is not configurable.
216 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
218 /* Reconfigures 'netdev->tc' according to 'details', performing any
219 * required Netlink calls to complete the reconfiguration.
221 * The contents of 'details' should be documented as valid for 'ovs_name'
222 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
223 * (which is built as ovs-vswitchd.conf.db(8)).
225 * This function may be null if 'tc' is not configurable.
227 int (*qdisc_set)(struct netdev *, const struct shash *details);
229 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
230 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
232 * The contents of 'details' should be documented as valid for 'ovs_name'
233 * in the "other_config" column in the "Queue" table in
234 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
236 * The implementation should not need to perform any Netlink calls, because
237 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
238 * cached the queue configuration.
240 * This function may be null if 'tc' does not have queues ('n_queues' is
242 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
243 struct shash *details);
245 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
246 * 'details', performing any required Netlink calls to complete the
247 * reconfiguration. The caller ensures that 'queue_id' is less than
250 * The contents of 'details' should be documented as valid for 'ovs_name'
251 * in the "other_config" column in the "Queue" table in
252 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
254 * This function may be null if 'tc' does not have queues or its queues are
255 * not configurable. */
256 int (*class_set)(struct netdev *, unsigned int queue_id,
257 const struct shash *details);
259 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
260 * tc_queue's within 'netdev->tc->queues'.
262 * This function may be null if 'tc' does not have queues or its queues
263 * cannot be deleted. */
264 int (*class_delete)(struct netdev *, struct tc_queue *queue);
266 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
267 * 'struct tc_queue's within 'netdev->tc->queues'.
269 * On success, initializes '*stats'.
271 * This function may be null if 'tc' does not have queues or if it cannot
272 * report queue statistics. */
273 int (*class_get_stats)(const struct netdev *netdev,
274 const struct tc_queue *queue,
275 struct netdev_queue_stats *stats);
277 /* Extracts queue stats from 'nlmsg', which is a response to a
278 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
280 * This function may be null if 'tc' does not have queues or if it cannot
281 * report queue statistics. */
282 int (*class_dump_stats)(const struct netdev *netdev,
283 const struct ofpbuf *nlmsg,
284 netdev_dump_queue_stats_cb *cb, void *aux);
288 tc_init(struct tc *tc, const struct tc_ops *ops)
291 hmap_init(&tc->queues);
295 tc_destroy(struct tc *tc)
297 hmap_destroy(&tc->queues);
300 static const struct tc_ops tc_ops_htb;
301 static const struct tc_ops tc_ops_hfsc;
302 static const struct tc_ops tc_ops_default;
303 static const struct tc_ops tc_ops_other;
305 static const struct tc_ops *tcs[] = {
306 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
307 &tc_ops_hfsc, /* Hierarchical fair service curve. */
308 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
309 &tc_ops_other, /* Some other qdisc. */
313 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
314 static unsigned int tc_get_major(unsigned int handle);
315 static unsigned int tc_get_minor(unsigned int handle);
317 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
318 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
319 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
321 static struct tcmsg *tc_make_request(const struct netdev *, int type,
322 unsigned int flags, struct ofpbuf *);
323 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
325 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
326 struct nlattr **options);
327 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
328 struct nlattr **options,
329 struct netdev_queue_stats *);
330 static int tc_query_class(const struct netdev *,
331 unsigned int handle, unsigned int parent,
332 struct ofpbuf **replyp);
333 static int tc_delete_class(const struct netdev *, unsigned int handle);
335 static int tc_del_qdisc(struct netdev *netdev);
336 static int tc_query_qdisc(const struct netdev *netdev);
338 static int tc_calc_cell_log(unsigned int mtu);
339 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
340 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
341 const struct tc_ratespec *rate);
342 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
344 struct netdev_dev_linux {
345 struct netdev_dev netdev_dev;
347 struct shash_node *shash_node;
348 unsigned int cache_valid;
349 unsigned int change_seq;
351 bool miimon; /* Link status of last poll. */
352 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
353 struct timer miimon_timer;
355 /* The following are figured out "on demand" only. They are only valid
356 * when the corresponding VALID_* bit in 'cache_valid' is set. */
358 uint8_t etheraddr[ETH_ADDR_LEN];
359 struct in_addr address, netmask;
363 bool is_internal; /* Is this an openvswitch internal device? */
364 bool is_tap; /* Is this a tuntap device? */
365 uint32_t kbits_rate; /* Policing data. */
366 uint32_t kbits_burst;
367 bool have_vport_stats;
371 struct tap_state tap;
375 struct netdev_linux {
376 struct netdev netdev;
380 /* Sockets used for ioctl operations. */
381 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
383 /* A Netlink routing socket that is not subscribed to any multicast groups. */
384 static struct nl_sock *rtnl_sock;
386 /* This is set pretty low because we probably won't learn anything from the
387 * additional log messages. */
388 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
390 static int netdev_linux_init(void);
392 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
393 int cmd, const char *cmd_name);
394 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
395 const char *cmd_name);
396 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
397 int cmd, const char *cmd_name);
398 static int get_flags(const struct netdev *, int *flagsp);
399 static int set_flags(struct netdev *, int flags);
400 static int do_get_ifindex(const char *netdev_name);
401 static int get_ifindex(const struct netdev *, int *ifindexp);
402 static int do_set_addr(struct netdev *netdev,
403 int ioctl_nr, const char *ioctl_name,
404 struct in_addr addr);
405 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
406 static int set_etheraddr(const char *netdev_name, int hwaddr_family,
407 const uint8_t[ETH_ADDR_LEN]);
408 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
409 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
410 static int af_packet_sock(void);
411 static void netdev_linux_miimon_run(void);
412 static void netdev_linux_miimon_wait(void);
415 is_netdev_linux_class(const struct netdev_class *netdev_class)
417 return netdev_class->init == netdev_linux_init;
420 static struct netdev_dev_linux *
421 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
423 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
424 assert(is_netdev_linux_class(netdev_class));
426 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
429 static struct netdev_linux *
430 netdev_linux_cast(const struct netdev *netdev)
432 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
433 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
434 assert(is_netdev_linux_class(netdev_class));
436 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
440 netdev_linux_init(void)
442 static int status = -1;
444 /* Create AF_INET socket. */
445 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
446 status = af_inet_sock >= 0 ? 0 : errno;
448 VLOG_ERR("failed to create inet socket: %s", strerror(status));
451 /* Create rtnetlink socket. */
453 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
455 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
464 netdev_linux_run(void)
466 rtnetlink_link_notifier_run();
467 netdev_linux_miimon_run();
471 netdev_linux_wait(void)
473 rtnetlink_link_notifier_wait();
474 netdev_linux_miimon_wait();
478 netdev_dev_linux_changed(struct netdev_dev_linux *dev)
481 if (!dev->change_seq) {
484 dev->cache_valid = 0;
488 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
489 void *aux OVS_UNUSED)
491 struct netdev_dev_linux *dev;
493 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
495 const struct netdev_class *netdev_class =
496 netdev_dev_get_class(base_dev);
498 if (is_netdev_linux_class(netdev_class)) {
499 dev = netdev_dev_linux_cast(base_dev);
500 netdev_dev_linux_changed(dev);
504 struct shash device_shash;
505 struct shash_node *node;
507 shash_init(&device_shash);
508 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
509 SHASH_FOR_EACH (node, &device_shash) {
511 netdev_dev_linux_changed(dev);
513 shash_destroy(&device_shash);
517 /* Creates system and internal devices. */
519 netdev_linux_create(const struct netdev_class *class,
520 const char *name, const struct shash *args,
521 struct netdev_dev **netdev_devp)
523 struct netdev_dev_linux *netdev_dev;
526 if (!shash_is_empty(args)) {
527 VLOG_WARN("%s: arguments for %s devices should be empty",
531 if (!cache_notifier_refcount) {
532 error = rtnetlink_link_notifier_register(&netdev_linux_cache_notifier,
533 netdev_linux_cache_cb, NULL);
538 cache_notifier_refcount++;
540 netdev_dev = xzalloc(sizeof *netdev_dev);
541 netdev_dev->change_seq = 1;
542 netdev_dev_init(&netdev_dev->netdev_dev, name, args, class);
544 *netdev_devp = &netdev_dev->netdev_dev;
548 /* For most types of netdevs we open the device for each call of
549 * netdev_open(). However, this is not the case with tap devices,
550 * since it is only possible to open the device once. In this
551 * situation we share a single file descriptor, and consequently
552 * buffers, across all readers. Therefore once data is read it will
553 * be unavailable to other reads for tap devices. */
555 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
556 const char *name, const struct shash *args,
557 struct netdev_dev **netdev_devp)
559 struct netdev_dev_linux *netdev_dev;
560 struct tap_state *state;
561 static const char tap_dev[] = "/dev/net/tun";
565 if (!shash_is_empty(args)) {
566 VLOG_WARN("%s: arguments for TAP devices should be empty", name);
569 netdev_dev = xzalloc(sizeof *netdev_dev);
570 state = &netdev_dev->state.tap;
572 /* Open tap device. */
573 state->fd = open(tap_dev, O_RDWR);
576 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
580 /* Create tap device. */
581 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
582 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
583 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
584 VLOG_WARN("%s: creating tap device failed: %s", name,
590 /* Make non-blocking. */
591 error = set_nonblocking(state->fd);
596 netdev_dev_init(&netdev_dev->netdev_dev, name, args, &netdev_tap_class);
597 *netdev_devp = &netdev_dev->netdev_dev;
606 destroy_tap(struct netdev_dev_linux *netdev_dev)
608 struct tap_state *state = &netdev_dev->state.tap;
610 if (state->fd >= 0) {
615 /* Destroys the netdev device 'netdev_dev_'. */
617 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
619 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
620 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
622 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
623 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
626 if (class == &netdev_linux_class || class == &netdev_internal_class) {
627 cache_notifier_refcount--;
629 if (!cache_notifier_refcount) {
630 rtnetlink_link_notifier_unregister(&netdev_linux_cache_notifier);
632 } else if (class == &netdev_tap_class) {
633 destroy_tap(netdev_dev);
642 netdev_linux_open(struct netdev_dev *netdev_dev_, int ethertype,
643 struct netdev **netdevp)
645 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
646 struct netdev_linux *netdev;
647 enum netdev_flags flags;
650 /* Allocate network device. */
651 netdev = xzalloc(sizeof *netdev);
653 netdev_init(&netdev->netdev, netdev_dev_);
655 /* Verify that the device really exists, by attempting to read its flags.
656 * (The flags might be cached, in which case this won't actually do an
659 * Don't do this for "internal" netdevs, though, because those have to be
660 * created as netdev objects before they exist in the kernel, because
661 * creating them in the kernel happens by passing a netdev object to
662 * dpif_port_add(). */
663 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
664 error = netdev_get_flags(&netdev->netdev, &flags);
665 if (error == ENODEV) {
670 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
671 !netdev_dev->state.tap.opened) {
673 /* We assume that the first user of the tap device is the primary user
674 * and give them the tap FD. Subsequent users probably just expect
675 * this to be a system device so open it normally to avoid send/receive
676 * directions appearing to be reversed. */
677 netdev->fd = netdev_dev->state.tap.fd;
678 netdev_dev->state.tap.opened = true;
679 } else if (ethertype != NETDEV_ETH_TYPE_NONE) {
680 struct sockaddr_ll sll;
684 /* Create file descriptor. */
685 protocol = (ethertype == NETDEV_ETH_TYPE_ANY ? ETH_P_ALL
686 : ethertype == NETDEV_ETH_TYPE_802_2 ? ETH_P_802_2
688 netdev->fd = socket(PF_PACKET, SOCK_RAW,
689 (OVS_FORCE int) htons(protocol));
690 if (netdev->fd < 0) {
695 /* Set non-blocking mode. */
696 error = set_nonblocking(netdev->fd);
701 /* Get ethernet device index. */
702 error = get_ifindex(&netdev->netdev, &ifindex);
707 /* Bind to specific ethernet device. */
708 memset(&sll, 0, sizeof sll);
709 sll.sll_family = AF_PACKET;
710 sll.sll_ifindex = ifindex;
712 (struct sockaddr *) &sll, sizeof sll) < 0) {
714 VLOG_ERR("bind to %s failed: %s", netdev_dev_get_name(netdev_dev_),
719 /* Between the socket() and bind() calls above, the socket receives all
720 * packets of the requested type on all system interfaces. We do not
721 * want to receive that data, but there is no way to avoid it. So we
722 * must now drain out the receive queue. */
723 error = drain_rcvbuf(netdev->fd);
729 *netdevp = &netdev->netdev;
733 netdev_uninit(&netdev->netdev, true);
737 /* Closes and destroys 'netdev'. */
739 netdev_linux_close(struct netdev *netdev_)
741 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
743 if (netdev->fd > 0 && strcmp(netdev_get_type(netdev_), "tap")) {
749 /* Initializes 'sset' with a list of the names of all known network devices. */
751 netdev_linux_enumerate(struct sset *sset)
753 struct if_nameindex *names;
755 names = if_nameindex();
759 for (i = 0; names[i].if_name != NULL; i++) {
760 sset_add(sset, names[i].if_name);
762 if_freenameindex(names);
765 VLOG_WARN("could not obtain list of network device names: %s",
772 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
774 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
776 if (netdev->fd < 0) {
777 /* Device was opened with NETDEV_ETH_TYPE_NONE. */
/* Read at most 'size' bytes from the netdev's own fd into 'data'. */
782 ssize_t retval = read(netdev->fd, data, size);
785 } else if (errno != EINTR) {
/* EAGAIN is not logged.  For any other error, pass the device name first
 * and the error text second so that they line up with the "on %s: %s"
 * format string (same order as the warning in netdev_linux_send()). */
786 if (errno != EAGAIN) {
787 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
788 netdev_get_name(netdev_), strerror(errno));
795 /* Registers with the poll loop to wake up from the next call to poll_block()
796 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
798 netdev_linux_recv_wait(struct netdev *netdev_)
800 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
801 if (netdev->fd >= 0) {
802 poll_fd_wait(netdev->fd, POLLIN);
806 /* Discards all packets waiting to be received from 'netdev'. */
808 netdev_linux_drain(struct netdev *netdev_)
810 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
811 if (netdev->fd < 0) {
813 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
815 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
816 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
820 drain_fd(netdev->fd, ifr.ifr_qlen);
823 return drain_rcvbuf(netdev->fd);
827 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
828 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
829 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
830 * the packet is too big or too small to transmit on the device.
832 * The caller retains ownership of 'buffer' in all cases.
834 * The kernel maintains a packet transmission queue, so the caller is not
835 * expected to do additional queuing of packets. */
837 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
839 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
843 if (netdev->fd < 0) {
844 /* Use our AF_PACKET socket to send to this device. */
845 struct sockaddr_ll sll;
852 sock = af_packet_sock();
857 error = get_ifindex(netdev_, &ifindex);
862 /* We don't bother setting most fields in sockaddr_ll because the
863 * kernel ignores them for SOCK_RAW. */
864 memset(&sll, 0, sizeof sll);
865 sll.sll_family = AF_PACKET;
866 sll.sll_ifindex = ifindex;
868 iov.iov_base = (void *) data;
872 msg.msg_namelen = sizeof sll;
875 msg.msg_control = NULL;
876 msg.msg_controllen = 0;
879 retval = sendmsg(sock, &msg, 0);
881 /* Use the netdev's own fd to send to this device. This is
882 * essential for tap devices, because packets sent to a tap device
883 * with an AF_PACKET socket will loop back to be *received* again
884 * on the tap device. */
885 retval = write(netdev->fd, data, size);
889 /* The Linux AF_PACKET implementation never blocks waiting for room
890 * for packets, instead returning ENOBUFS. Translate this into
891 * EAGAIN for the caller. */
892 if (errno == ENOBUFS) {
894 } else if (errno == EINTR) {
896 } else if (errno != EAGAIN) {
897 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
898 netdev_get_name(netdev_), strerror(errno));
901 } else if (retval != size) {
902 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
903 "%zu) on %s", retval, size, netdev_get_name(netdev_));
911 /* Registers with the poll loop to wake up from the next call to poll_block()
912 * when the packet transmission queue has sufficient room to transmit a packet
913 * with netdev_send().
915 * The kernel maintains a packet transmission queue, so the client is not
916 * expected to do additional queuing of packets. Thus, this function is
917 * unlikely to ever be used. It is included for completeness. */
919 netdev_linux_send_wait(struct netdev *netdev_)
921 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
922 if (netdev->fd < 0) {
924 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
925 poll_fd_wait(netdev->fd, POLLOUT);
927 /* TAP device always accepts packets.*/
928 poll_immediate_wake();
932 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
933 * otherwise a positive errno value. */
935 netdev_linux_set_etheraddr(struct netdev *netdev_,
936 const uint8_t mac[ETH_ADDR_LEN])
938 struct netdev_dev_linux *netdev_dev =
939 netdev_dev_linux_cast(netdev_get_dev(netdev_));
942 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
943 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
944 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
946 netdev_dev->cache_valid |= VALID_ETHERADDR;
947 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
955 /* Copies 'netdev''s MAC address into the caller-supplied buffer 'mac', which
956 * must have room for ETH_ADDR_LEN bytes. */
958 netdev_linux_get_etheraddr(const struct netdev *netdev_,
959 uint8_t mac[ETH_ADDR_LEN])
961 struct netdev_dev_linux *netdev_dev =
962 netdev_dev_linux_cast(netdev_get_dev(netdev_));
963 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
964 int error = get_etheraddr(netdev_get_name(netdev_),
965 netdev_dev->etheraddr);
969 netdev_dev->cache_valid |= VALID_ETHERADDR;
971 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
975 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
976 * in bytes, not including the hardware header; thus, this is typically 1500
977 * bytes for Ethernet devices. */
979 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
981 struct netdev_dev_linux *netdev_dev =
982 netdev_dev_linux_cast(netdev_get_dev(netdev_));
983 if (!(netdev_dev->cache_valid & VALID_MTU)) {
987 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
988 SIOCGIFMTU, "SIOCGIFMTU");
992 netdev_dev->mtu = ifr.ifr_mtu;
993 netdev_dev->cache_valid |= VALID_MTU;
995 *mtup = netdev_dev->mtu;
999 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1000 * On failure, returns a negative errno value. */
1002 netdev_linux_get_ifindex(const struct netdev *netdev)
1006 error = get_ifindex(netdev, &ifindex);
1007 return error ? -error : ifindex;
1011 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1013 struct netdev_dev_linux *netdev_dev =
1014 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1019 if (netdev_dev->miimon_interval > 0) {
1020 *carrier = netdev_dev->miimon;
1024 if (!(netdev_dev->cache_valid & VALID_CARRIER)) {
1028 fn = xasprintf("/sys/class/net/%s/carrier",
1029 netdev_get_name(netdev_));
1030 fd = open(fn, O_RDONLY);
1033 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
1037 retval = read(fd, line, sizeof line);
1040 if (error == EINVAL) {
1041 /* This is the normal return value when we try to check carrier
1042 * if the network device is not up. */
1044 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
1047 } else if (retval == 0) {
1049 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
1053 if (line[0] != '0' && line[0] != '1') {
1055 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)",
1059 netdev_dev->carrier = line[0] != '0';
1060 netdev_dev->cache_valid |= VALID_CARRIER;
1062 *carrier = netdev_dev->carrier;
1074 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1075 struct mii_ioctl_data *data)
1080 memset(&ifr, 0, sizeof ifr);
1081 memcpy(&ifr.ifr_data, data, sizeof *data);
1082 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1083 memcpy(data, &ifr.ifr_data, sizeof *data);
1089 netdev_linux_get_miimon(const char *name, bool *miimon)
1091 struct mii_ioctl_data data;
1096 memset(&data, 0, sizeof data);
1097 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1099 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1100 data.reg_num = MII_BMSR;
1101 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1105 *miimon = !!(data.val_out & BMSR_LSTATUS);
1107 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1110 struct ethtool_cmd ecmd;
1112 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1115 memset(&ecmd, 0, sizeof ecmd);
1116 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1119 struct ethtool_value eval;
1121 memcpy(&eval, &ecmd, sizeof eval);
1122 *miimon = !!eval.data;
1124 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1132 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1133 long long int interval)
1135 struct netdev_dev_linux *netdev_dev;
1137 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
1139 interval = interval > 0 ? MAX(interval, 100) : 0;
1140 if (netdev_dev->miimon_interval != interval) {
1141 netdev_dev->miimon_interval = interval;
1142 timer_set_expired(&netdev_dev->miimon_timer);
1149 netdev_linux_miimon_run(void)
1151 struct shash device_shash;
1152 struct shash_node *node;
1154 shash_init(&device_shash);
1155 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1156 SHASH_FOR_EACH (node, &device_shash) {
1157 struct netdev_dev_linux *dev = node->data;
1160 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1164 netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
1165 if (miimon != dev->miimon) {
1166 dev->miimon = miimon;
1167 netdev_dev_linux_changed(dev);
1170 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1173 shash_destroy(&device_shash);
1177 netdev_linux_miimon_wait(void)
1179 struct shash device_shash;
1180 struct shash_node *node;
1182 shash_init(&device_shash);
1183 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1184 SHASH_FOR_EACH (node, &device_shash) {
1185 struct netdev_dev_linux *dev = node->data;
1187 if (dev->miimon_interval > 0) {
1188 timer_wait(&dev->miimon_timer);
1191 shash_destroy(&device_shash);
1194 /* Check whether we can use RTM_GETLINK to get network device statistics.
1195 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1198 check_for_working_netlink_stats(void)
1200 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1201 * preferable, so if that works, we'll use it. */
1202 int ifindex = do_get_ifindex("lo");
1204 VLOG_WARN("failed to get ifindex for lo, "
1205 "obtaining netdev stats from proc");
1208 struct netdev_stats stats;
1209 int error = get_stats_via_netlink(ifindex, &stats);
1211 VLOG_DBG("obtaining netdev stats via rtnetlink");
1214 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1215 "via proc (you are probably running a pre-2.6.19 "
1216 "kernel)", strerror(error));
1222 /* Brings the 'is_internal' and 'is_tap' members of 'netdev_dev' up-to-date. */
/* The result is cached: the body only runs while VALID_IS_PSEUDO is clear in
 * cache_valid, and it sets that bit when done. */
1224 netdev_linux_update_is_pseudo(struct netdev_dev_linux *netdev_dev)
1226 if (!(netdev_dev->cache_valid & VALID_IS_PSEUDO)) {
1227 const char *name = netdev_dev_get_name(&netdev_dev->netdev_dev);
1228 const char *type = netdev_dev_get_type(&netdev_dev->netdev_dev);
/* A device counts as tap when its type string equals tap.  Only non-tap
 * devices are checked with dpif_linux_is_internal_device(). */
1230 netdev_dev->is_tap = !strcmp(type, "tap");
1231 netdev_dev->is_internal = (!netdev_dev->is_tap
1232 && dpif_linux_is_internal_device(name));
1233 netdev_dev->cache_valid |= VALID_IS_PSEUDO;
/* NOTE(review): judging by the name and by the calls made from
 * netdev_linux_get_stats(), this presumably exchanges the values stored at a
 * and b -- confirm against the function body. */
1238 swap_uint64(uint64_t *a, uint64_t *b)
1245 /* Retrieves current device stats for 'netdev'.
 *
 * The vport layer is tried first, either because it worked before
 * (have_vport_stats) or because it has not been probed yet
 * (VALID_HAVE_VPORT_STATS clear).  Without vport stats the function uses
 * get_stats_via_netlink() or get_stats_via_proc(); which one is decided once
 * per process by check_for_working_netlink_stats() and remembered in the
 * static use_netlink_stats.
 *
 * For internal and tap devices whose stats did not come from the vport layer,
 * the rx and tx packet, byte, error and drop counters are swapped and the
 * detailed rx and tx error counters are zeroed. */
1247 netdev_linux_get_stats(const struct netdev *netdev_,
1248 struct netdev_stats *stats)
1250 struct netdev_dev_linux *netdev_dev =
1251 netdev_dev_linux_cast(netdev_get_dev(netdev_));
/* -1 means not probed yet; see the use_netlink_stats < 0 test below. */
1252 static int use_netlink_stats = -1;
1255 if (netdev_dev->have_vport_stats ||
1256 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1258 error = netdev_vport_get_stats(netdev_, stats);
1259 netdev_dev->have_vport_stats = !error;
1260 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
1263 if (!netdev_dev->have_vport_stats) {
1264 if (use_netlink_stats < 0) {
1265 use_netlink_stats = check_for_working_netlink_stats();
1267 if (use_netlink_stats) {
1270 error = get_ifindex(netdev_, &ifindex);
1272 error = get_stats_via_netlink(ifindex, stats);
1275 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1279 /* If this port is an internal port then the transmit and receive stats
1280 * will appear to be swapped relative to the other ports since we are the
1281 * one sending the data, not a remote computer. For consistency, we swap
1282 * them back here. This does not apply if we are getting stats from the
1283 * vport layer because it always tracks stats from the perspective of the
1285 netdev_linux_update_is_pseudo(netdev_dev);
1286 if (!error && !netdev_dev->have_vport_stats &&
1287 (netdev_dev->is_internal || netdev_dev->is_tap)) {
1288 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1289 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1290 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1291 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1292 stats->rx_length_errors = 0;
1293 stats->rx_over_errors = 0;
1294 stats->rx_crc_errors = 0;
1295 stats->rx_frame_errors = 0;
1296 stats->rx_fifo_errors = 0;
1297 stats->rx_missed_errors = 0;
1298 stats->tx_aborted_errors = 0;
1299 stats->tx_carrier_errors = 0;
1300 stats->tx_fifo_errors = 0;
1301 stats->tx_heartbeat_errors = 0;
1302 stats->tx_window_errors = 0;
1308 /* Stores the features supported by 'netdev' into each of '*current',
1309 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1310 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
1311 * successful, otherwise a positive errno value.
 *
 * The data comes from one ETHTOOL_GSET query: ecmd.supported and
 * ecmd.advertising are translated bit by bit, and the current settings are
 * derived from ecmd.speed, ecmd.duplex and ecmd.port.
 *
 * NOTE(review): the statements in this function write through all four
 * pointers unconditionally, so the non-null wording above appears to rely on
 * the caller always passing valid pointers -- confirm. */
1313 netdev_linux_get_features(const struct netdev *netdev,
1314 uint32_t *current, uint32_t *advertised,
1315 uint32_t *supported, uint32_t *peer)
1317 struct ethtool_cmd ecmd;
1320 memset(&ecmd, 0, sizeof ecmd);
1321 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1322 ETHTOOL_GSET, "ETHTOOL_GSET");
1327 /* Supported features. */
1329 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1330 *supported |= OFPPF_10MB_HD;
1332 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1333 *supported |= OFPPF_10MB_FD;
1335 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1336 *supported |= OFPPF_100MB_HD;
1338 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1339 *supported |= OFPPF_100MB_FD;
1341 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1342 *supported |= OFPPF_1GB_HD;
1344 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1345 *supported |= OFPPF_1GB_FD;
1347 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1348 *supported |= OFPPF_10GB_FD;
1350 if (ecmd.supported & SUPPORTED_TP) {
1351 *supported |= OFPPF_COPPER;
1353 if (ecmd.supported & SUPPORTED_FIBRE) {
1354 *supported |= OFPPF_FIBER;
1356 if (ecmd.supported & SUPPORTED_Autoneg) {
1357 *supported |= OFPPF_AUTONEG;
1359 if (ecmd.supported & SUPPORTED_Pause) {
1360 *supported |= OFPPF_PAUSE;
1362 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1363 *supported |= OFPPF_PAUSE_ASYM;
1366 /* Advertised features. */
1368 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1369 *advertised |= OFPPF_10MB_HD;
1371 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1372 *advertised |= OFPPF_10MB_FD;
1374 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1375 *advertised |= OFPPF_100MB_HD;
1377 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1378 *advertised |= OFPPF_100MB_FD;
1380 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1381 *advertised |= OFPPF_1GB_HD;
1383 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1384 *advertised |= OFPPF_1GB_FD;
1386 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1387 *advertised |= OFPPF_10GB_FD;
1389 if (ecmd.advertising & ADVERTISED_TP) {
1390 *advertised |= OFPPF_COPPER;
1392 if (ecmd.advertising & ADVERTISED_FIBRE) {
1393 *advertised |= OFPPF_FIBER;
1395 if (ecmd.advertising & ADVERTISED_Autoneg) {
1396 *advertised |= OFPPF_AUTONEG;
1398 if (ecmd.advertising & ADVERTISED_Pause) {
1399 *advertised |= OFPPF_PAUSE;
1401 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1402 *advertised |= OFPPF_PAUSE_ASYM;
1405 /* Current settings. */
/* A nonzero ecmd.duplex selects the full-duplex bit for 10M, 100M and 1G. */
1406 if (ecmd.speed == SPEED_10) {
1407 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1408 } else if (ecmd.speed == SPEED_100) {
1409 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1410 } else if (ecmd.speed == SPEED_1000) {
1411 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
/* 10G is reported as full duplex without consulting ecmd.duplex. */
1412 } else if (ecmd.speed == SPEED_10000) {
1413 *current = OFPPF_10GB_FD;
1418 if (ecmd.port == PORT_TP) {
1419 *current |= OFPPF_COPPER;
1420 } else if (ecmd.port == PORT_FIBRE) {
1421 *current |= OFPPF_FIBER;
1425 *current |= OFPPF_AUTONEG;
1428 /* Peer advertisements. */
1429 *peer = 0; /* XXX */
1434 /* Set the features advertised by 'netdev' to 'advertise'.
 *
 * This is a read-modify-write: the current settings are fetched with
 * ETHTOOL_GSET, ecmd.advertising is rebuilt from the OFPPF_* bits set in
 * advertise, and the structure is written back with ETHTOOL_SSET.  The result
 * of that final ethtool call is the return value on this path. */
1436 netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1438 struct ethtool_cmd ecmd;
1441 memset(&ecmd, 0, sizeof ecmd);
1442 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1443 ETHTOOL_GSET, "ETHTOOL_GSET");
/* Start from an empty mask so bits absent from advertise are cleared. */
1448 ecmd.advertising = 0;
1449 if (advertise & OFPPF_10MB_HD) {
1450 ecmd.advertising |= ADVERTISED_10baseT_Half;
1452 if (advertise & OFPPF_10MB_FD) {
1453 ecmd.advertising |= ADVERTISED_10baseT_Full;
1455 if (advertise & OFPPF_100MB_HD) {
1456 ecmd.advertising |= ADVERTISED_100baseT_Half;
1458 if (advertise & OFPPF_100MB_FD) {
1459 ecmd.advertising |= ADVERTISED_100baseT_Full;
1461 if (advertise & OFPPF_1GB_HD) {
1462 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1464 if (advertise & OFPPF_1GB_FD) {
1465 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1467 if (advertise & OFPPF_10GB_FD) {
1468 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1470 if (advertise & OFPPF_COPPER) {
1471 ecmd.advertising |= ADVERTISED_TP;
1473 if (advertise & OFPPF_FIBER) {
1474 ecmd.advertising |= ADVERTISED_FIBRE;
1476 if (advertise & OFPPF_AUTONEG) {
1477 ecmd.advertising |= ADVERTISED_Autoneg;
1479 if (advertise & OFPPF_PAUSE) {
1480 ecmd.advertising |= ADVERTISED_Pause;
1482 if (advertise & OFPPF_PAUSE_ASYM) {
1483 ecmd.advertising |= ADVERTISED_Asym_Pause;
1485 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1486 ETHTOOL_SSET, "ETHTOOL_SSET");
1489 /* If 'netdev_name' is the name of a VLAN network device (e.g. one created with
1490 * vconfig(8)), sets '*vlan_vid' to the VLAN VID associated with that device
1491 * and returns 0. Otherwise returns an errno value (specifically ENOENT if
1492 * 'netdev_name' is the name of a network device that is not a VLAN device) and
1493 * sets '*vlan_vid' to -1. */
1495 netdev_linux_get_vlan_vid(const struct netdev *netdev, int *vlan_vid)
1497 const char *netdev_name = netdev_get_name(netdev);
1498 struct ds line = DS_EMPTY_INITIALIZER;
1499 FILE *stream = NULL;
1503 COVERAGE_INC(netdev_get_vlan_vid);
/* The VID is read from the first line of the per-device file under
 * /proc/net/vlan. */
1504 fn = xasprintf("/proc/net/vlan/%s", netdev_name);
1505 stream = fopen(fn, "r");
/* A nonzero result from ds_get_line() is handled as a read error or as an
 * unexpected end of file, depending on ferror(). */
1511 if (ds_get_line(&line, stream)) {
1512 if (ferror(stream)) {
1514 VLOG_ERR_RL(&rl, "error reading \"%s\": %s", fn, strerror(errno));
1517 VLOG_ERR_RL(&rl, "unexpected end of file reading \"%s\"", fn);
/* NOTE(review): sscanf() returns EOF, a negative value, when the input ends
 * before the first conversion.  The negated test below is false in that case,
 * so such a line is not reported as a parse error.  Comparing the result
 * against 1 would catch it. */
1522 if (!sscanf(ds_cstr(&line), "%*s VID: %d", vlan_vid)) {
1524 VLOG_ERR_RL(&rl, "parse error reading \"%s\" line 1: \"%s\"",
1525 fn, ds_cstr(&line));
/* Shell command templates that netdev_linux_set_policing() formats with
 * snprintf() and runs with system().  The %s is the device name.  In
 * POLICE_CONFIG_CMD the first %d is the policing rate (printed before kbit)
 * and the second %d is the burst (printed before k). */
1543 #define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress"
1544 #define POLICE_CONFIG_CMD "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 u32 match ip src 0.0.0.0/0 police rate %dkbit burst %dk mtu 65535 drop flowid :1"
/* Implementation summary for netdev_linux_remove_policing(): an RTM_DELQDISC
 * request is built with tc_make_request(), addressed to handle ffff:0 under
 * TC_H_INGRESS with kind ingress and an empty TCA_OPTIONS attribute, and sent
 * with tc_transact().  Errors other than ENOENT and EINVAL are logged.  The
 * cached kbits_rate and kbits_burst are set to 0 and VALID_POLICING is set in
 * cache_valid.  NOTE(review): whether the cache update is also reached after a
 * logged failure cannot be told from the statements shown -- confirm. */
1546 /* Remove ingress policing from 'netdev'. Returns 0 if successful, otherwise a
1547 * positive errno value.
1549 * This function is equivalent to running
1550 * /sbin/tc qdisc del dev %s handle ffff: ingress
1551 * but it is much, much faster.
1554 netdev_linux_remove_policing(struct netdev *netdev)
1556 struct netdev_dev_linux *netdev_dev =
1557 netdev_dev_linux_cast(netdev_get_dev(netdev));
1558 const char *netdev_name = netdev_get_name(netdev);
1560 struct ofpbuf request;
1561 struct tcmsg *tcmsg;
1564 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
1568 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
1569 tcmsg->tcm_parent = TC_H_INGRESS;
1570 nl_msg_put_string(&request, TCA_KIND, "ingress");
1571 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
1573 error = tc_transact(&request, NULL);
1574 if (error && error != ENOENT && error != EINVAL) {
1575 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1576 netdev_name, strerror(error));
1580 netdev_dev->kbits_rate = 0;
1581 netdev_dev->kbits_burst = 0;
1582 netdev_dev->cache_valid |= VALID_POLICING;
1586 /* Attempts to set input rate limiting (policing) policy.
 *
 * kbits_rate and kbits_burst are compared with the values cached on the
 * device; when VALID_POLICING is set and both match, the settings are assumed
 * unchanged.  Otherwise any existing policing is removed with
 * netdev_linux_remove_policing() and the POLICE_ADD_CMD and POLICE_CONFIG_CMD
 * shell commands are run, after which the new values are cached.
 *
 * NOTE(review): the device name is interpolated into a command line that is
 * passed to system(), so a name containing shell metacharacters would be
 * interpreted by the shell.  Also, kbits_rate and kbits_burst are uint32_t
 * but POLICE_CONFIG_CMD formats them with %d. */
1588 netdev_linux_set_policing(struct netdev *netdev,
1589 uint32_t kbits_rate, uint32_t kbits_burst)
1591 struct netdev_dev_linux *netdev_dev =
1592 netdev_dev_linux_cast(netdev_get_dev(netdev));
1593 const char *netdev_name = netdev_get_name(netdev);
1596 COVERAGE_INC(netdev_set_policing);
1598 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1599 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1600 : kbits_burst); /* Stick with user-specified value. */
1602 if (netdev_dev->cache_valid & VALID_POLICING
1603 && netdev_dev->kbits_rate == kbits_rate
1604 && netdev_dev->kbits_burst == kbits_burst) {
1605 /* Assume that settings haven't changed since we last set them. */
/* Start from a clean state before installing the new policy. */
1609 netdev_linux_remove_policing(netdev);
/* Step 1: add the ingress qdisc. */
1611 snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
1612 if (system(command) != 0) {
1613 VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
/* Step 2: attach the policing filter with the requested rate and burst. */
1617 snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
1618 kbits_rate, kbits_burst);
1619 if (system(command) != 0) {
1620 VLOG_WARN_RL(&rl, "%s: problem configuring policing",
/* Remember what was configured so an identical request can be skipped. */
1625 netdev_dev->kbits_rate = kbits_rate;
1626 netdev_dev->kbits_burst = kbits_burst;
1627 netdev_dev->cache_valid |= VALID_POLICING;
/* Adds to types the ovs_name of every entry in the NULL-terminated tcs table
 * that has a tc_install hook and a non-empty ovs_name.  The netdev argument
 * is unused. */
1634 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1637 const struct tc_ops **opsp;
1639 for (opsp = tcs; *opsp != NULL; opsp++) {
1640 const struct tc_ops *ops = *opsp;
1641 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1642 sset_add(types, ops->ovs_name);
/* Searches the NULL-terminated tcs table for an entry whose ovs_name equals
 * name.  NOTE(review): presumably returns the matching entry, or NULL when
 * there is none (netdev_linux_set_qos() tests the result for NULL) --
 * confirm. */
1648 static const struct tc_ops *
1649 tc_lookup_ovs_name(const char *name)
1651 const struct tc_ops **opsp;
1653 for (opsp = tcs; *opsp != NULL; opsp++) {
1654 const struct tc_ops *ops = *opsp;
1655 if (!strcmp(name, ops->ovs_name)) {
/* Searches the NULL-terminated tcs table for an entry whose linux_name equals
 * name.  Entries with a null linux_name are never compared.  NOTE(review):
 * presumably returns the matching entry or NULL -- confirm. */
1662 static const struct tc_ops *
1663 tc_lookup_linux_name(const char *name)
1665 const struct tc_ops **opsp;
1667 for (opsp = tcs; *opsp != NULL; opsp++) {
1668 const struct tc_ops *ops = *opsp;
1669 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Looks for the queue with the given queue_id among the queues of the
 * traffic-control state attached to netdev.  Only the hash bucket selected by
 * hash is walked, so hash must be the value under which the queue was
 * inserted. */
1676 static struct tc_queue *
1677 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1680 struct netdev_dev_linux *netdev_dev =
1681 netdev_dev_linux_cast(netdev_get_dev(netdev));
1682 struct tc_queue *queue;
1684 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1685 if (queue->queue_id == queue_id) {
/* Convenience wrapper around tc_find_queue__() that computes the bucket hash
 * as hash_int(queue_id, 0). */
1692 static struct tc_queue *
1693 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1695 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Looks up the traffic-control implementation named by type with
 * tc_lookup_ovs_name() and reports its n_queues in caps->n_queues.  The
 * netdev argument is unused. */
1699 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1701 struct netdev_qos_capabilities *caps)
1703 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1707 caps->n_queues = ops->n_queues;
/* Reports the QoS type currently configured on netdev.  After
 * tc_query_qdisc() has run, *typep is set to the ovs_name of the active
 * tc_ops, and details is filled in by the qdisc_get hook when the
 * implementation provides one. */
1712 netdev_linux_get_qos(const struct netdev *netdev,
1713 const char **typep, struct shash *details)
1715 struct netdev_dev_linux *netdev_dev =
1716 netdev_dev_linux_cast(netdev_get_dev(netdev));
1719 error = tc_query_qdisc(netdev);
1724 *typep = netdev_dev->tc->ops->ovs_name;
/* qdisc_get is optional; it is only called when non-null. */
1725 return (netdev_dev->tc->ops->qdisc_get
1726 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
/* Configures QoS of the given type, with parameters from details, on netdev.
 *
 * The type must name a tc_ops entry that has a tc_install hook.  When the
 * requested implementation is already the active one, only its optional
 * qdisc_set hook is invoked (0 is returned when there is none).  Otherwise
 * the existing qdisc is deleted and the new one installed. */
1731 netdev_linux_set_qos(struct netdev *netdev,
1732 const char *type, const struct shash *details)
1734 struct netdev_dev_linux *netdev_dev =
1735 netdev_dev_linux_cast(netdev_get_dev(netdev));
1736 const struct tc_ops *new_ops;
1739 new_ops = tc_lookup_ovs_name(type);
1740 if (!new_ops || !new_ops->tc_install) {
/* Make sure netdev_dev->tc reflects the qdisc that is currently installed. */
1744 error = tc_query_qdisc(netdev);
1749 if (new_ops == netdev_dev->tc->ops) {
1750 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1752 /* Delete existing qdisc. */
1753 error = tc_del_qdisc(netdev);
1757 assert(netdev_dev->tc == NULL);
1759 /* Install new qdisc. */
1760 error = new_ops->tc_install(netdev, details);
/* Invariant: tc_install leaves netdev_dev->tc non-null exactly when it
 * reports success. */
1761 assert((error == 0) == (netdev_dev->tc != NULL));
/* Retrieves the configuration of queue queue_id on netdev into details.  The
 * qdisc is queried first, the queue is looked up with tc_find_queue(), and
 * the class_get hook of the active tc_ops supplies the details. */
1768 netdev_linux_get_queue(const struct netdev *netdev,
1769 unsigned int queue_id, struct shash *details)
1771 struct netdev_dev_linux *netdev_dev =
1772 netdev_dev_linux_cast(netdev_get_dev(netdev));
1775 error = tc_query_qdisc(netdev);
1779 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1781 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
/* Configures queue queue_id on netdev from details by delegating to the
 * class_set hook of the active tc_ops.  A queue_id at or above the n_queues
 * of the implementation, or a missing class_set hook, takes a separate branch
 * instead of reaching the hook. */
1787 netdev_linux_set_queue(struct netdev *netdev,
1788 unsigned int queue_id, const struct shash *details)
1790 struct netdev_dev_linux *netdev_dev =
1791 netdev_dev_linux_cast(netdev_get_dev(netdev));
1794 error = tc_query_qdisc(netdev);
1797 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1798 || !netdev_dev->tc->ops->class_set) {
1802 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
/* Deletes queue queue_id from netdev.  The qdisc is queried first; when the
 * active tc_ops has a class_delete hook, the queue is looked up with
 * tc_find_queue() and handed to that hook. */
1806 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1808 struct netdev_dev_linux *netdev_dev =
1809 netdev_dev_linux_cast(netdev_get_dev(netdev));
1812 error = tc_query_qdisc(netdev);
1815 } else if (!netdev_dev->tc->ops->class_delete) {
1818 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1820 ? netdev_dev->tc->ops->class_delete(netdev, queue)
/* Retrieves statistics for queue queue_id on netdev into stats.  The qdisc is
 * queried first; when the active tc_ops has a class_get_stats hook, the queue
 * is looked up with tc_find_queue() and handed to that hook. */
1826 netdev_linux_get_queue_stats(const struct netdev *netdev,
1827 unsigned int queue_id,
1828 struct netdev_queue_stats *stats)
1830 struct netdev_dev_linux *netdev_dev =
1831 netdev_dev_linux_cast(netdev_get_dev(netdev));
1834 error = tc_query_qdisc(netdev);
1837 } else if (!netdev_dev->tc->ops->class_get_stats) {
1840 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1842 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
/* Starts a Netlink dump of the traffic classes of netdev into dump.  An
 * RTM_GETTCLASS request with tcm_parent 0 is built, the dump is started on
 * rtnl_sock, and the request buffer is released.  The caller in
 * netdev_linux_dump_queue_stats() treats a zero result as failure to start. */
1848 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1850 struct ofpbuf request;
1851 struct tcmsg *tcmsg;
1853 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1857 tcmsg->tcm_parent = 0;
1858 nl_dump_start(dump, rtnl_sock, &request);
1859 ofpbuf_uninit(&request);
/* Invokes cb once per queue known for netdev, passing the queue id, the
 * details obtained from the class_get hook of the active tc_ops, and aux.  A
 * single details shash is reused: it is cleared before each queue and
 * destroyed at the end. */
1864 netdev_linux_dump_queues(const struct netdev *netdev,
1865 netdev_dump_queues_cb *cb, void *aux)
1867 struct netdev_dev_linux *netdev_dev =
1868 netdev_dev_linux_cast(netdev_get_dev(netdev));
1869 struct tc_queue *queue;
1870 struct shash details;
1874 error = tc_query_qdisc(netdev);
1877 } else if (!netdev_dev->tc->ops->class_get) {
1882 shash_init(&details);
1883 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
1884 shash_clear(&details);
1886 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
1888 (*cb)(queue->queue_id, &details, aux);
1893 shash_destroy(&details);
/* Dumps per-queue statistics for netdev.  Each message of a Netlink class
 * dump is handed, together with cb and aux, to the class_dump_stats hook of
 * the active tc_ops.  The return value is the error from nl_dump_done() when
 * that is nonzero, and last_error otherwise. */
1899 netdev_linux_dump_queue_stats(const struct netdev *netdev,
1900 netdev_dump_queue_stats_cb *cb, void *aux)
1902 struct netdev_dev_linux *netdev_dev =
1903 netdev_dev_linux_cast(netdev_get_dev(netdev));
1904 struct nl_dump dump;
1909 error = tc_query_qdisc(netdev);
1912 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1917 if (!start_queue_dump(netdev, &dump)) {
1920 while (nl_dump_next(&dump, &msg)) {
1921 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
/* An error from finishing the dump takes precedence over last_error. */
1927 error = nl_dump_done(&dump);
1928 return error ? error : last_error;
/* Stores the IPv4 address and netmask of netdev_ in *address and *netmask.
 * The values are fetched with SIOCGIFADDR and SIOCGIFNETMASK while VALID_IN4
 * is clear and are then served from the device cache.  On the final path the
 * function returns EADDRNOTAVAIL when the address is INADDR_ANY and 0
 * otherwise. */
1932 netdev_linux_get_in4(const struct netdev *netdev_,
1933 struct in_addr *address, struct in_addr *netmask)
1935 struct netdev_dev_linux *netdev_dev =
1936 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1938 if (!(netdev_dev->cache_valid & VALID_IN4)) {
1941 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
1942 SIOCGIFADDR, "SIOCGIFADDR");
1947 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
1948 SIOCGIFNETMASK, "SIOCGIFNETMASK");
1953 netdev_dev->cache_valid |= VALID_IN4;
1955 *address = netdev_dev->address;
1956 *netmask = netdev_dev->netmask;
/* An all-zero address is reported as no address being assigned. */
1957 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
/* Assigns the IPv4 address and netmask to netdev_.  The address is set with
 * SIOCSIFADDR, the device cache is updated with both values, and the netmask
 * is set with SIOCSIFNETMASK only when the address is not INADDR_ANY.
 * NOTE(review): the cache update is presumably conditional on the first
 * ioctl succeeding -- confirm. */
1961 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
1962 struct in_addr netmask)
1964 struct netdev_dev_linux *netdev_dev =
1965 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1968 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
1970 netdev_dev->cache_valid |= VALID_IN4;
1971 netdev_dev->address = address;
1972 netdev_dev->netmask = netmask;
1973 if (address.s_addr != INADDR_ANY) {
1974 error = do_set_addr(netdev_, SIOCSIFNETMASK,
1975 "SIOCSIFNETMASK", netmask);
/* Parses one line in the format of /proc/net/if_inet6.  The format string
 * reads sixteen two-digit hexadecimal bytes into in6->s6_addr, skips four
 * hexadecimal fields, and reads an interface name of at most 16 characters
 * into ifname. */
1982 parse_if_inet6_line(const char *line,
1983 struct in6_addr *in6, char ifname[16 + 1])
1985 uint8_t *s6 = in6->s6_addr;
/* X8 converts one address byte written as two hexadecimal digits. */
1986 #define X8 "%2"SCNx8
1988 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
1989 "%*x %*x %*x %*x %16s\n",
1990 &s6[0], &s6[1], &s6[2], &s6[3],
1991 &s6[4], &s6[5], &s6[6], &s6[7],
1992 &s6[8], &s6[9], &s6[10], &s6[11],
1993 &s6[12], &s6[13], &s6[14], &s6[15],
1997 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
1998 * 'in6' is non-null) and returns true. Otherwise, returns false.
 *
 * The address is looked up once by scanning /proc/net/if_inet6 for a line
 * whose interface name equals the device name, and is then cached under
 * VALID_IN6.  The cached value starts out as in6addr_any.
 *
 * NOTE(review): the final store through in6 is unconditional, which does not
 * match the non-null wording above -- confirm the intended contract. */
2000 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2002 struct netdev_dev_linux *netdev_dev =
2003 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2004 if (!(netdev_dev->cache_valid & VALID_IN6)) {
2008 netdev_dev->in6 = in6addr_any;
2010 file = fopen("/proc/net/if_inet6", "r");
2012 const char *name = netdev_get_name(netdev_);
2013 while (fgets(line, sizeof line, file)) {
2014 struct in6_addr in6_tmp;
2015 char ifname[16 + 1];
2016 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2017 && !strcmp(name, ifname))
2019 netdev_dev->in6 = in6_tmp;
2025 netdev_dev->cache_valid |= VALID_IN6;
2027 *in6 = netdev_dev->in6;
/* Fills *sa with an AF_INET socket address for addr.  A zeroed sockaddr_in
 * is built first; *sa is then cleared and the sockaddr_in is copied over
 * it. */
2032 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2034 struct sockaddr_in sin;
2035 memset(&sin, 0, sizeof sin);
2036 sin.sin_family = AF_INET;
2037 sin.sin_addr = addr;
/* Clear the destination so no stale bytes remain around the copied
 * sockaddr_in. */
2040 memset(sa, 0, sizeof *sa);
2041 memcpy(sa, &sin, sizeof sin);
/* Issues the address-setting ioctl ioctl_nr (named ioctl_name for logging by
 * the helper, presumably) on netdev with addr as the IPv4 address.  The
 * request carries the device name in ifr_name and the address in ifr_addr,
 * and is sent with netdev_linux_do_ioctl(). */
2045 do_set_addr(struct netdev *netdev,
2046 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2049 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2050 make_in4_sockaddr(&ifr.ifr_addr, addr);
2052 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2056 /* Adds 'router' as a default IP gateway.
 *
 * The route has destination and genmask INADDR_ANY, gateway router, and
 * flags RTF_UP | RTF_GATEWAY.  It is installed with the SIOCADDRT ioctl on
 * af_inet_sock, and a failure is logged as a warning.  The netdev argument is
 * unused. */
2058 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2060 struct in_addr any = { INADDR_ANY };
2064 memset(&rt, 0, sizeof rt);
2065 make_in4_sockaddr(&rt.rt_dst, any);
2066 make_in4_sockaddr(&rt.rt_gateway, router);
2067 make_in4_sockaddr(&rt.rt_genmask, any);
2068 rt.rt_flags = RTF_UP | RTF_GATEWAY;
/* errno is captured immediately, in the same expression as the ioctl. */
2069 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2071 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
/* Finds a route to host by scanning /proc/net/route line by line.  Routes
 * that are not up are skipped.  For a route whose masked destination matches
 * the masked host address, *next_hop is set to 0 when the host is directly
 * reachable and to the gateway otherwise, and *netdev_name receives a copy of
 * the interface name made with xstrdup().  *netdev_name is set to NULL before
 * the file is opened. */
2077 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2080 static const char fn[] = "/proc/net/route";
2085 *netdev_name = NULL;
2086 stream = fopen(fn, "r");
2087 if (stream == NULL) {
2088 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2093 while (fgets(line, sizeof line, stream)) {
2096 ovs_be32 dest, gateway, mask;
2097 int refcnt, metric, mtu;
2098 unsigned int flags, use, window, irtt;
/* A line counts as parsed only when all 11 fields convert. */
2101 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2103 iface, &dest, &gateway, &flags, &refcnt,
2104 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2106 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2110 if (!(flags & RTF_UP)) {
2111 /* Skip routes that aren't up. */
2115 /* The output of 'dest', 'mask', and 'gateway' were given in
2116 * network byte order, so we don't need any endian
2117 * conversions here. */
2118 if ((dest & mask) == (host->s_addr & mask)) {
2120 /* The host is directly reachable. */
2121 next_hop->s_addr = 0;
2123 /* To reach the host, we must go through a gateway. */
2124 next_hop->s_addr = gateway;
2126 *netdev_name = xstrdup(iface);
/* Adds driver information for netdev to sh.  The data comes from an
 * ETHTOOL_GDRVINFO query; copies of the driver name, driver version and
 * firmware version strings, made with xstrdup(), are added under the keys
 * driver_name, driver_version and firmware_version. */
2138 netdev_linux_get_status(const struct netdev *netdev, struct shash *sh)
2140 struct ethtool_drvinfo drvinfo;
2143 memset(&drvinfo, 0, sizeof drvinfo);
/* The ethtool helper takes a struct ethtool_cmd pointer, so the drvinfo
 * buffer is passed through a cast. */
2144 error = netdev_linux_do_ethtool(netdev_get_name(netdev),
2145 (struct ethtool_cmd *)&drvinfo,
2147 "ETHTOOL_GDRVINFO");
2149 shash_add(sh, "driver_name", xstrdup(drvinfo.driver));
2150 shash_add(sh, "driver_version", xstrdup(drvinfo.version));
2151 shash_add(sh, "firmware_version", xstrdup(drvinfo.fw_version));
2157 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2158 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2159 * returns 0. Otherwise, it returns a positive errno value; in particular,
2160 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
2162 netdev_linux_arp_lookup(const struct netdev *netdev,
2163 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2166 struct sockaddr_in sin;
/* Build the SIOCGARP request: protocol address ip, hardware type
 * ARPHRD_ETHER, restricted to the named device. */
2169 memset(&r, 0, sizeof r);
2170 memset(&sin, 0, sizeof sin);
2171 sin.sin_family = AF_INET;
2172 sin.sin_addr.s_addr = ip;
2174 memcpy(&r.arp_pa, &sin, sizeof sin);
2175 r.arp_ha.sa_family = ARPHRD_ETHER;
2177 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2178 COVERAGE_INC(netdev_arp_lookup);
2179 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2181 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
/* ENXIO only means that no entry exists, so it is not logged. */
2182 } else if (retval != ENXIO) {
2183 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2184 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
/* Converts a set of netdev_flags into interface flag bits.  NETDEV_UP and
 * NETDEV_PROMISC are the bits tested.  NOTE(review): they presumably map to
 * IFF_UP and IFF_PROMISC, mirroring iff_to_nd_flags() -- confirm. */
2190 nd_to_iff_flags(enum netdev_flags nd)
2193 if (nd & NETDEV_UP) {
2196 if (nd & NETDEV_PROMISC) {
/* Converts interface flag bits into netdev_flags, starting from an empty
 * set.  IFF_PROMISC maps to NETDEV_PROMISC. */
2203 iff_to_nd_flags(int iff)
2205 enum netdev_flags nd = 0;
2209 if (iff & IFF_PROMISC) {
2210 nd |= NETDEV_PROMISC;
/* Turns the flags in off off and the flags in on on for netdev, and reports
 * the previous flags, converted to netdev_flags, in *old_flagsp.  The
 * interface flags are only written back when the computed value differs from
 * the current one. */
2216 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2217 enum netdev_flags on, enum netdev_flags *old_flagsp)
2219 int old_flags, new_flags;
2222 error = get_flags(netdev, &old_flags);
2224 *old_flagsp = iff_to_nd_flags(old_flags);
/* Clear the off bits first, then set the on bits, so on wins when a flag
 * appears in both. */
2225 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2226 if (new_flags != old_flags) {
2227 error = set_flags(netdev, new_flags);
/* Returns the change_seq member of the netdev_dev_linux that backs netdev. */
2234 netdev_linux_change_seq(const struct netdev *netdev)
2236 return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
/* Builds the initializer shared by the netdev classes defined below.  NAME,
 * CREATE, ENUMERATE and SET_STATS are supplied per class; every other slot is
 * filled with the same netdev_linux_* callback or NULL.  The entries are
 * positional, so their order has to follow the member order of struct
 * netdev_class (declared elsewhere). */
2239 #define NETDEV_LINUX_CLASS(NAME, CREATE, ENUMERATE, SET_STATS) \
2243 netdev_linux_init, \
2245 netdev_linux_wait, \
2248 netdev_linux_destroy, \
2249 NULL, /* set_config */ \
2251 netdev_linux_open, \
2252 netdev_linux_close, \
2256 netdev_linux_recv, \
2257 netdev_linux_recv_wait, \
2258 netdev_linux_drain, \
2260 netdev_linux_send, \
2261 netdev_linux_send_wait, \
2263 netdev_linux_set_etheraddr, \
2264 netdev_linux_get_etheraddr, \
2265 netdev_linux_get_mtu, \
2266 netdev_linux_get_ifindex, \
2267 netdev_linux_get_carrier, \
2268 netdev_linux_set_miimon_interval, \
2269 netdev_linux_get_stats, \
2272 netdev_linux_get_features, \
2273 netdev_linux_set_advertisements, \
2274 netdev_linux_get_vlan_vid, \
2276 netdev_linux_set_policing, \
2277 netdev_linux_get_qos_types, \
2278 netdev_linux_get_qos_capabilities, \
2279 netdev_linux_get_qos, \
2280 netdev_linux_set_qos, \
2281 netdev_linux_get_queue, \
2282 netdev_linux_set_queue, \
2283 netdev_linux_delete_queue, \
2284 netdev_linux_get_queue_stats, \
2285 netdev_linux_dump_queues, \
2286 netdev_linux_dump_queue_stats, \
2288 netdev_linux_get_in4, \
2289 netdev_linux_set_in4, \
2290 netdev_linux_get_in6, \
2291 netdev_linux_add_router, \
2292 netdev_linux_get_next_hop, \
2293 netdev_linux_get_status, \
2294 netdev_linux_arp_lookup, \
2296 netdev_linux_update_flags, \
2298 netdev_linux_change_seq \
/* The netdev class that the miimon functions above enumerate.  It is created
 * with netdev_linux_create(), supports enumeration through
 * netdev_linux_enumerate(), and has no set_stats hook. */
2301 const struct netdev_class netdev_linux_class =
2304 netdev_linux_create,
2305 netdev_linux_enumerate,
2306 NULL); /* set_stats */
/* Netdev class for tap devices.  It is created with
 * netdev_linux_create_tap() and has neither an enumerate nor a set_stats
 * hook. */
2308 const struct netdev_class netdev_tap_class =
2311 netdev_linux_create_tap,
2312 NULL, /* enumerate */
2313 NULL); /* set_stats */
/* Netdev class for internal devices.  It is created with
 * netdev_linux_create(), has no enumerate hook, and uses
 * netdev_vport_set_stats() as its set_stats hook. */
2315 const struct netdev_class netdev_internal_class =
2318 netdev_linux_create,
2319 NULL, /* enumerate */
2320 netdev_vport_set_stats);
2322 /* HTB traffic control class. */
/* Upper bound on HTB class minor numbers: htb_parse_tcmsg__() accepts minors
 * 1 through HTB_N_QUEUES under major 1 and reports them as queue ids starting
 * at 0. */
2324 #define HTB_N_QUEUES 0xf000
2328 unsigned int max_rate; /* In bytes/s. */
/* Per-queue HTB parameters.  tc_queue is the embedded generic queue record
 * that htb_class_cast__() converts back from with CONTAINER_OF. */
2332 struct tc_queue tc_queue;
2333 unsigned int min_rate; /* In bytes/s. */
2334 unsigned int max_rate; /* In bytes/s. */
2335 unsigned int burst; /* In bytes. */
2336 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the struct htb that embeds the traffic-control state attached to
 * netdev, recovered from netdev_dev->tc with CONTAINER_OF.  This is only
 * meaningful while that state was set up by htb_install__(). */
2340 htb_get__(const struct netdev *netdev)
2342 struct netdev_dev_linux *netdev_dev =
2343 netdev_dev_linux_cast(netdev_get_dev(netdev));
2344 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
/* Allocates a struct htb, initializes its embedded tc with tc_ops_htb,
 * records max_rate, and attaches it to the device as netdev_dev->tc.
 *
 * NOTE(review): max_rate arrives as uint64_t while a max_rate field declared
 * near HTB_N_QUEUES is unsigned int; if that is the field written here,
 * larger values are narrowed -- confirm. */
2348 htb_install__(struct netdev *netdev, uint64_t max_rate)
2350 struct netdev_dev_linux *netdev_dev =
2351 netdev_dev_linux_cast(netdev_get_dev(netdev));
2354 htb = xmalloc(sizeof *htb);
2355 tc_init(&htb->tc, &tc_ops_htb);
2356 htb->max_rate = max_rate;
2358 netdev_dev->tc = &htb->tc;
2361 /* Create an HTB qdisc.
2363 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1".
 *
 * Any existing qdisc is deleted first.  The request is an RTM_NEWQDISC
 * message with NLM_F_EXCL | NLM_F_CREATE, handle 1:0 and parent TC_H_ROOT;
 * the HTB options are carried as a TCA_HTB_INIT attribute nested inside
 * TCA_OPTIONS.  The return value on this path is the result of
 * tc_transact(). */
2365 htb_setup_qdisc__(struct netdev *netdev)
2368 struct tc_htb_glob opt;
2369 struct ofpbuf request;
2370 struct tcmsg *tcmsg;
/* The result of the delete is not checked here. */
2372 tc_del_qdisc(netdev);
2374 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2375 NLM_F_EXCL | NLM_F_CREATE, &request);
2379 tcmsg->tcm_handle = tc_make_handle(1, 0);
2380 tcmsg->tcm_parent = TC_H_ROOT;
2382 nl_msg_put_string(&request, TCA_KIND, "htb");
2384 memset(&opt, 0, sizeof opt);
2385 opt.rate2quantum = 10;
2389 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2390 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2391 nl_msg_end_nested(&request, opt_offset);
2393 return tc_transact(&request, NULL);
2396 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2397 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>".
 *
 * The rate and ceil specifications and both buffer sizes are derived from the
 * fields of class together with the device MTU.  The request is an
 * RTM_NEWTCLASS message with NLM_F_CREATE that carries TCA_HTB_PARMS plus the
 * rate and ceil tables, nested inside TCA_OPTIONS.  A failed transaction is
 * logged with all of the class parameters. */
2399 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2400 unsigned int parent, struct htb_class *class)
2403 struct tc_htb_opt opt;
2404 struct ofpbuf request;
2405 struct tcmsg *tcmsg;
/* An MTU of INT_MAX is treated as the device lacking an MTU. */
2409 netdev_get_mtu(netdev, &mtu);
2410 if (mtu == INT_MAX) {
2411 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2412 netdev_get_name(netdev));
2416 memset(&opt, 0, sizeof opt);
2417 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2418 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
/* Both buffers use the same burst; they differ only in the rate applied. */
2419 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2420 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2421 opt.prio = class->priority;
2423 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2427 tcmsg->tcm_handle = handle;
2428 tcmsg->tcm_parent = parent;
2430 nl_msg_put_string(&request, TCA_KIND, "htb");
2431 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2432 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2433 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2434 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2435 nl_msg_end_nested(&request, opt_offset);
2437 error = tc_transact(&request, NULL);
2439 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2440 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2441 netdev_get_name(netdev),
2442 tc_get_major(handle), tc_get_minor(handle),
2443 tc_get_major(parent), tc_get_minor(parent),
2444 class->min_rate, class->max_rate,
2445 class->burst, class->priority, strerror(error));
/* Implementation summary for htb_parse_tca_options__(): nl_options is parsed
 * as a nested attribute against a policy that requires TCA_HTB_PARMS to be
 * present and at least sizeof(struct tc_htb_opt) long.  The fields of class
 * are then filled from that structure: min_rate from rate.rate, max_rate from
 * ceil.rate, burst from tc_ticks_to_bytes(rate.rate, buffer), and priority
 * from prio.  A parse failure is logged.
 *
 * NOTE(review): the existing comment below says the result is stored into
 * details, but the parameter that is written is class -- the wording looks
 * stale. */
2450 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2451 * description of them into 'details'. The description complies with the
2452 * specification given in the vswitch database documentation for linux-htb
2455 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2457 static const struct nl_policy tca_htb_policy[] = {
2458 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2459 .min_len = sizeof(struct tc_htb_opt) },
2462 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2463 const struct tc_htb_opt *htb;
2465 if (!nl_parse_nested(nl_options, tca_htb_policy,
2466 attrs, ARRAY_SIZE(tca_htb_policy))) {
2467 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2471 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2472 class->min_rate = htb->rate.rate;
2473 class->max_rate = htb->ceil.rate;
2474 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2475 class->priority = htb->prio;
/* Parses the traffic-class message tcmsg with tc_parse_class().  Each output
 * is optional.  When queue_id is non-null, the class handle is translated to
 * a queue id: handles with major 1 and a minor from 1 through HTB_N_QUEUES
 * yield minor - 1.  When options is non-null, the HTB parameters are
 * extracted with htb_parse_tca_options__().  stats is passed straight through
 * to tc_parse_class(). */
2480 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2481 struct htb_class *options,
2482 struct netdev_queue_stats *stats)
2484 struct nlattr *nl_options;
2485 unsigned int handle;
2488 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2489 if (!error && queue_id) {
2490 unsigned int major = tc_get_major(handle);
2491 unsigned int minor = tc_get_minor(handle);
/* Queue ids are zero-based while class minors start at 1. */
2492 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2493 *queue_id = minor - 1;
2498 if (!error && options) {
2499 error = htb_parse_tca_options__(nl_options, options);
2505 htb_parse_qdisc_details__(struct netdev *netdev,
2506 const struct shash *details, struct htb_class *hc)
2508 const char *max_rate_s;
2510 max_rate_s = shash_find_data(details, "max-rate");
2511 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2512 if (!hc->max_rate) {
2515 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2516 hc->max_rate = netdev_features_to_bps(current) / 8;
2518 hc->min_rate = hc->max_rate;
/* Fills in hc from the per-queue settings min-rate, max-rate, burst and
 * priority found in details.  The three rate and size settings are divided by
 * 8 after parsing; priority is used as parsed.  Missing settings become 0
 * before clamping, except for max-rate, whose fallback is taken from another
 * operand of the same conditional.
 *
 * Clamping: min_rate is raised to at least the MTU and capped at the qdisc
 * max_rate; max_rate is raised to at least min_rate and capped at the qdisc
 * max_rate; burst is raised to at least MTU + 64. */
2524 htb_parse_class_details__(struct netdev *netdev,
2525 const struct shash *details, struct htb_class *hc)
2527 const struct htb *htb = htb_get__(netdev);
2528 const char *min_rate_s = shash_find_data(details, "min-rate");
2529 const char *max_rate_s = shash_find_data(details, "max-rate");
2530 const char *burst_s = shash_find_data(details, "burst");
2531 const char *priority_s = shash_find_data(details, "priority");
/* An MTU of INT_MAX is treated as the device lacking an MTU. */
2534 netdev_get_mtu(netdev, &mtu);
2535 if (mtu == INT_MAX) {
2536 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2537 netdev_get_name(netdev));
2541 /* HTB requires at least an mtu sized min-rate to send any traffic even
2542 * on uncongested links. */
2543 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2544 hc->min_rate = MAX(hc->min_rate, mtu);
2545 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2548 hc->max_rate = (max_rate_s
2549 ? strtoull(max_rate_s, NULL, 10) / 8
2551 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2552 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2556 * According to hints in the documentation that I've read, it is important
2557 * that 'burst' be at least as big as the largest frame that might be
2558 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2559 * but having it a bit too small is a problem. Since netdev_get_mtu()
2560 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2561 * the MTU. We actually add 64, instead of 14, as a guard against
2562 * additional headers get tacked on somewhere that we're not aware of. */
2563 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2564 hc->burst = MAX(hc->burst, mtu + 64);
2567 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Asks the kernel, through tc_query_class(), for the class with identifier
 * 'handle' and parent 'parent' on 'netdev', then decodes the reply with
 * htb_parse_tcmsg__() into 'options' and 'stats'.  No queue ID is requested
 * from the parser (NULL is passed).  The reply buffer is released with
 * ofpbuf_delete() after parsing. */
2573 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2574 unsigned int parent, struct htb_class *options,
2575 struct netdev_queue_stats *stats)
2577 struct ofpbuf *reply;
2580 error = tc_query_class(netdev, handle, parent, &reply);
2582 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2583 ofpbuf_delete(reply);
/* Sets up HTB on 'netdev' from the qdisc-level 'details':
 *
 *   1. htb_setup_qdisc__() creates the qdisc.
 *
 *   2. Class 1:fffe is created under 1:0 with the parameters that
 *      htb_parse_qdisc_details__() derives from 'details'.
 *
 *   3. htb_install__() records the resulting max_rate for 'netdev'.
 *
 * NOTE(review): the error checks between these steps are not visible here --
 * confirm that each step runs only if the previous one succeeded. */
2589 htb_tc_install(struct netdev *netdev, const struct shash *details)
2593 error = htb_setup_qdisc__(netdev);
2595 struct htb_class hc;
2597 htb_parse_qdisc_details__(netdev, details, &hc);
2598 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2599 tc_make_handle(1, 0), &hc);
2601 htb_install__(netdev, hc.max_rate);
/* Returns the struct htb_class that contains 'queue' as its 'tc_queue'
 * member.  The caller must pass a queue that really is embedded in an
 * htb_class; nothing here checks that. */
2607 static struct htb_class *
2608 htb_class_cast__(const struct tc_queue *queue)
2610 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Records the parameters in 'hc' as the local state for queue 'queue_id' of
 * 'netdev'.
 *
 * The queue is looked up with tc_find_queue__() using hash_int(queue_id, 0).
 * Two assignments to 'hcp' are visible: one reuses the looked-up queue, the
 * other allocates a new htb_class and inserts it into 'htb->tc.queues'.
 * NOTE(review): the condition choosing between them is not visible here;
 * presumably the allocation happens only when the lookup finds nothing --
 * confirm.
 *
 * In either case min_rate, max_rate, burst and priority are copied from
 * 'hc'. */
2614 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2615 const struct htb_class *hc)
2617 struct htb *htb = htb_get__(netdev);
2618 size_t hash = hash_int(queue_id, 0);
2619 struct tc_queue *queue;
2620 struct htb_class *hcp;
2622 queue = tc_find_queue__(netdev, queue_id, hash);
2624 hcp = htb_class_cast__(queue);
2626 hcp = xmalloc(sizeof *hcp);
2627 queue = &hcp->tc_queue;
2628 queue->queue_id = queue_id;
2629 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2632 hcp->min_rate = hc->min_rate;
2633 hcp->max_rate = hc->max_rate;
2634 hcp->burst = hc->burst;
2635 hcp->priority = hc->priority;
/* Builds the local HTB state for 'netdev' from what the kernel reports.
 *
 * Class 1:fffe is queried for the qdisc-wide parameters and its max_rate is
 * passed to htb_install__().  The return value of that query is not used in
 * the visible code.  Then every class in the queue dump that
 * htb_parse_tcmsg__() accepts is recorded with htb_update_queue__().
 *
 * 'nlmsg' is unused. */
2639 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2642 struct nl_dump dump;
2643 struct htb_class hc;
2645 /* Get qdisc options. */
2647 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2648 htb_install__(netdev, hc.max_rate);
2651 if (!start_queue_dump(netdev, &dump)) {
2654 while (nl_dump_next(&dump, &msg)) {
2655 unsigned int queue_id;
/* Messages that fail to parse are skipped rather than aborting the load. */
2657 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2658 htb_update_queue__(netdev, queue_id, &hc);
2661 nl_dump_done(&dump);
/* Tears down the HTB state that embeds 'tc': every htb_class is removed from
 * 'htb->tc.queues'.  The _SAFE iterator is used because entries are removed
 * while iterating.
 *
 * NOTE(review): the statements that free each class and the htb itself are
 * not visible here -- confirm they exist. */
2667 htb_tc_destroy(struct tc *tc)
2669 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2670 struct htb_class *hc, *next;
2672 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2673 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Reports the qdisc-level configuration of 'netdev' by adding "max-rate" to
 * 'details'.  The value is 8 times the stored max_rate, formatted as a
 * decimal string allocated by xasprintf(). */
2681 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2683 const struct htb *htb = htb_get__(netdev);
2684 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
/* Applies new qdisc-level 'details' to 'netdev': class 1:fffe (parent 1:0) is
 * set up again with the parameters from htb_parse_qdisc_details__(), and the
 * new max_rate is stored in the local HTB state.
 *
 * NOTE(review): the check guarding the final store is not visible here;
 * presumably it runs only when htb_setup_class__() succeeds -- confirm. */
2689 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2691 struct htb_class hc;
2694 htb_parse_qdisc_details__(netdev, details, &hc);
2695 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2696 tc_make_handle(1, 0), &hc);
2698 htb_get__(netdev)->max_rate = hc.max_rate;
/* Adds the configuration of 'queue' to 'details'.
 *
 * "min-rate", "max-rate" and "burst" are written as 8 times the stored value;
 * "priority" is written as stored.  "max-rate" is added only when it differs
 * from min_rate.  Each value is a string allocated by xasprintf().
 *
 * NOTE(review): any conditions guarding "burst" and "priority" are not
 * visible here -- confirm whether they are emitted unconditionally.
 *
 * 'netdev' is unused. */
2704 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2705 const struct tc_queue *queue, struct shash *details)
2707 const struct htb_class *hc = htb_class_cast__(queue);
2709 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2710 if (hc->min_rate != hc->max_rate) {
2711 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2713 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2715 shash_add(details, "priority", xasprintf("%u", hc->priority));
/* Configures queue 'queue_id' of 'netdev' from 'details'.
 *
 * The details are parsed with htb_parse_class_details__(), the kernel class
 * 1:(queue_id + 1) is set up under parent 1:fffe, and the local state is
 * updated with htb_update_queue__().
 *
 * NOTE(review): the error checks between these steps are not visible here --
 * confirm that a failure stops the sequence. */
2721 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2722 const struct shash *details)
2724 struct htb_class hc;
2727 error = htb_parse_class_details__(netdev, details, &hc);
2732 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2733 tc_make_handle(1, 0xfffe), &hc);
2738 htb_update_queue__(netdev, queue_id, &hc);
/* Deletes 'queue' from 'netdev': the kernel class 1:(queue_id + 1) is deleted
 * with tc_delete_class() and the queue is removed from 'htb->tc.queues'.
 *
 * NOTE(review): the check guarding the removal and the freeing of 'hc' are
 * not visible here -- confirm. */
2743 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2745 struct htb_class *hc = htb_class_cast__(queue);
2746 struct htb *htb = htb_get__(netdev);
2749 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2751 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Retrieves the statistics of 'queue' into 'stats' by querying the kernel for
 * class 1:(queue_id + 1) under parent 1:fffe.  Class options are not
 * requested (NULL is passed).  Returns whatever htb_query_class__()
 * returns. */
2758 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2759 struct netdev_queue_stats *stats)
2761 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2762 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one class message 'nlmsg' from a statistics dump.
 *
 * The handle and statistics are extracted with tc_parse_class().  If the
 * handle is 1:<minor> with 1 <= minor <= HTB_N_QUEUES, 'cb' is called with
 * queue ID minor - 1, the statistics and 'aux'.  Other handles do not invoke
 * 'cb'.
 *
 * 'netdev' is unused. */
2766 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2767 const struct ofpbuf *nlmsg,
2768 netdev_dump_queue_stats_cb *cb, void *aux)
2770 struct netdev_queue_stats stats;
2771 unsigned int handle, major, minor;
2774 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2779 major = tc_get_major(handle);
2780 minor = tc_get_minor(handle);
2781 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2782 (*cb)(minor - 1, &stats, aux);
/* Operations table for the "linux-htb" QoS type, backed by the Linux "htb"
 * qdisc.  Initializers are positional, so their order must match the member
 * order of struct tc_ops. */
2787 static const struct tc_ops tc_ops_htb = {
2788 "htb", /* linux_name */
2789 "linux-htb", /* ovs_name */
2790 HTB_N_QUEUES, /* n_queues */
2799 htb_class_get_stats,
2800 htb_class_dump_stats
2803 /* "linux-hfsc" traffic control class. */
/* Number of queues offered by "linux-hfsc" (the n_queues entry of
 * tc_ops_hfsc).  Class minors 1...HFSC_N_QUEUES map to queue IDs
 * 0...HFSC_N_QUEUES - 1. */
2805 #define HFSC_N_QUEUES 0xf000
2813 struct tc_queue tc_queue;
/* Returns the struct hfsc that contains the 'tc' currently attached to
 * 'netdev''s device.  Nothing here verifies that the attached tc actually
 * belongs to an hfsc, so the caller must ensure that it does. */
2818 static struct hfsc *
2819 hfsc_get__(const struct netdev *netdev)
2821 struct netdev_dev_linux *netdev_dev;
2822 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2823 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
/* Returns the struct hfsc_class that contains 'queue' as its 'tc_queue'
 * member.  The caller must pass a queue that really is embedded in an
 * hfsc_class; nothing here checks that. */
2826 static struct hfsc_class *
2827 hfsc_class_cast__(const struct tc_queue *queue)
2829 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Creates the local HFSC state for 'netdev': allocates a struct hfsc,
 * initializes its embedded tc with 'tc_ops_hfsc', records 'max_rate', and
 * attaches it to the device by storing it in 'netdev_dev->tc'.  Whatever
 * 'netdev_dev->tc' pointed to before is overwritten, not released. */
2833 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2835 struct netdev_dev_linux * netdev_dev;
2838 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2839 hfsc = xmalloc(sizeof *hfsc);
2840 tc_init(&hfsc->tc, &tc_ops_hfsc);
2841 hfsc->max_rate = max_rate;
2842 netdev_dev->tc = &hfsc->tc;
/* Records the parameters in 'hc' as the local state for queue 'queue_id' of
 * 'netdev'.
 *
 * The queue is looked up with tc_find_queue__() using hash_int(queue_id, 0).
 * Two assignments to 'hcp' are visible: one reuses the looked-up queue, the
 * other allocates a new hfsc_class and inserts it into 'hfsc->tc.queues'.
 * NOTE(review): the condition choosing between them is not visible here;
 * presumably the allocation happens only when the lookup finds nothing --
 * confirm.
 *
 * In either case min_rate and max_rate are copied from 'hc'. */
2846 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2847 const struct hfsc_class *hc)
2851 struct hfsc_class *hcp;
2852 struct tc_queue *queue;
2854 hfsc = hfsc_get__(netdev);
2855 hash = hash_int(queue_id, 0);
2857 queue = tc_find_queue__(netdev, queue_id, hash);
2859 hcp = hfsc_class_cast__(queue);
2861 hcp = xmalloc(sizeof *hcp);
2862 queue = &hcp->tc_queue;
2863 queue->queue_id = queue_id;
2864 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
2867 hcp->min_rate = hc->min_rate;
2868 hcp->max_rate = hc->max_rate;
/* Decodes the nested HFSC class options 'nl_options' into 'class'.
 *
 * The RSC, FSC and USC attributes are each read as a struct tc_service_curve
 * (the policy requires at least that many bytes for every entry).  Only a
 * restricted configuration is accepted; each of the following logs a
 * rate-limited warning:
 *
 *   - any curve with nonzero 'm1' or 'd' (non-linear curves);
 *
 *   - RSC and FSC with different 'm2';
 *
 *   - RSC 'm2' greater than USC 'm2'.
 *
 * On the accepted path 'class->min_rate' is FSC's m2 and 'class->max_rate' is
 * USC's m2.
 *
 * NOTE(review): the statements following each warning are not visible here;
 * presumably each returns an errno value -- confirm. */
2872 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2874 const struct tc_service_curve *rsc, *fsc, *usc;
2875 static const struct nl_policy tca_hfsc_policy[] = {
2877 .type = NL_A_UNSPEC,
2879 .min_len = sizeof(struct tc_service_curve),
2882 .type = NL_A_UNSPEC,
2884 .min_len = sizeof(struct tc_service_curve),
2887 .type = NL_A_UNSPEC,
2889 .min_len = sizeof(struct tc_service_curve),
2892 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
2894 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
2895 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
2896 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
2900 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
2901 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
2902 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
2904 if (rsc->m1 != 0 || rsc->d != 0 ||
2905 fsc->m1 != 0 || fsc->d != 0 ||
2906 usc->m1 != 0 || usc->d != 0) {
2907 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2908 "Non-linear service curves are not supported.");
2912 if (rsc->m2 != fsc->m2) {
2913 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2914 "Real-time service curves are not supported ");
2918 if (rsc->m2 > usc->m2) {
2919 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2920 "Min-rate service curve is greater than "
2921 "the max-rate service curve.");
2925 class->min_rate = fsc->m2;
2926 class->max_rate = usc->m2;
/* Parses the class message 'tcmsg' with tc_parse_class(), which supplies the
 * class handle, the nested options attribute and the queue statistics for
 * 'stats'.
 *
 * A handle of the form 1:<minor> with 1 <= minor <= HFSC_N_QUEUES is
 * translated to queue ID minor - 1 and stored into '*queue_id'.  The options
 * attribute is decoded into 'options' with hfsc_parse_tca_options__().
 *
 * NOTE(review): the conditions guarding these two steps (null checks on
 * 'queue_id' and 'options', error checks) and the return statement are not
 * visible here.  hfsc_query_class__() passes a null 'queue_id', so a guard
 * must exist -- confirm. */
2931 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2932 struct hfsc_class *options,
2933 struct netdev_queue_stats *stats)
2936 unsigned int handle;
2937 struct nlattr *nl_options;
2939 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2945 unsigned int major, minor;
2947 major = tc_get_major(handle);
2948 minor = tc_get_minor(handle);
2949 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
2950 *queue_id = minor - 1;
2957 error = hfsc_parse_tca_options__(nl_options, options);
/* Asks the kernel, through tc_query_class(), for the class with identifier
 * 'handle' and parent 'parent' on 'netdev', then decodes the reply with
 * hfsc_parse_tcmsg__() into 'options' and 'stats'.  No queue ID is requested
 * from the parser (NULL is passed).  The reply buffer is released with
 * ofpbuf_delete() after parsing. */
2964 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
2965 unsigned int parent, struct hfsc_class *options,
2966 struct netdev_queue_stats *stats)
2969 struct ofpbuf *reply;
2971 error = tc_query_class(netdev, handle, parent, &reply);
2976 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
2977 ofpbuf_delete(reply);
/* Fills in 'class' from the qdisc-level 'details'.
 *
 * "max-rate" is parsed as a decimal number and divided by 8 (presumably bits/s
 * in 'details', bytes/s in 'class' -- confirm).  A rate derived from the
 * device's current features via netdev_features_to_bps() is the alternative
 * source.  NOTE(review): the condition selecting it is not visible here;
 * presumably it applies when the parsed rate is 0 -- confirm.
 *
 * Both 'class->min_rate' and 'class->max_rate' are set to the resulting
 * rate. */
2982 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
2983 struct hfsc_class *class)
2986 const char *max_rate_s;
2988 max_rate_s = shash_find_data(details, "max-rate");
2989 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
/* NOTE(review): "¤t" below looks like a mis-encoded "&current" (the next
 * statement reads a variable named 'current') -- confirm against the
 * pristine file. */
2994 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2995 max_rate = netdev_features_to_bps(current) / 8;
2998 class->min_rate = max_rate;
2999 class->max_rate = max_rate;
/* Fills in 'class' from the per-queue 'details' keys "min-rate" and
 * "max-rate".
 *
 * Each value is parsed as a decimal number and divided by 8.  The result is
 * stored in a uint32_t, so larger values are truncated.  A missing "min-rate"
 * is treated as 0.  The values are then clamped:
 *
 *   - min_rate is raised to at least 1, then capped at the qdisc's max_rate.
 *
 *   - max_rate is raised to at least min_rate, then capped at the qdisc's
 *     max_rate.
 *
 * NOTE(review): the fallback used when "max-rate" is absent is not visible
 * here -- confirm. */
3003 hfsc_parse_class_details__(struct netdev *netdev,
3004 const struct shash *details,
3005 struct hfsc_class * class)
3007 const struct hfsc *hfsc;
3008 uint32_t min_rate, max_rate;
3009 const char *min_rate_s, *max_rate_s;
3011 hfsc = hfsc_get__(netdev);
3012 min_rate_s = shash_find_data(details, "min-rate");
3013 max_rate_s = shash_find_data(details, "max-rate");
3015 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3016 min_rate = MAX(min_rate, 1);
3017 min_rate = MIN(min_rate, hfsc->max_rate);
3019 max_rate = (max_rate_s
3020 ? strtoull(max_rate_s, NULL, 10) / 8
3022 max_rate = MAX(max_rate, min_rate);
3023 max_rate = MIN(max_rate, hfsc->max_rate);
3025 class->min_rate = min_rate;
3026 class->max_rate = max_rate;
3031 /* Create an HFSC qdisc.
3033 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
/* Any existing root qdisc is first removed with tc_del_qdisc().  The
 * RTM_NEWQDISC request uses NLM_F_EXCL | NLM_F_CREATE, handle 1:0 and parent
 * TC_H_ROOT, with TCA_KIND "hfsc" and a struct tc_hfsc_qopt as TCA_OPTIONS.
 * Returns the result of tc_transact().
 *
 * NOTE(review): 'opt' is zeroed here, but the assignment that would select
 * the default class from the equivalent command is not visible -- confirm. */
3035 hfsc_setup_qdisc__(struct netdev * netdev)
3037 struct tcmsg *tcmsg;
3038 struct ofpbuf request;
3039 struct tc_hfsc_qopt opt;
3041 tc_del_qdisc(netdev);
3043 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3044 NLM_F_EXCL | NLM_F_CREATE, &request);
3050 tcmsg->tcm_handle = tc_make_handle(1, 0);
3051 tcmsg->tcm_parent = TC_H_ROOT;
3053 memset(&opt, 0, sizeof opt);
3056 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3057 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3059 return tc_transact(&request, NULL);
3062 /* Create an HFSC class.
3064 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3065 * sc rate <min_rate> ul rate <max_rate>" */
/* The RTM_NEWTCLASS request uses NLM_F_CREATE without NLM_F_EXCL.  The same
 * curve 'min' (m2 = class->min_rate) is sent as both TCA_HFSC_RSC and
 * TCA_HFSC_FSC, and 'max' (m2 = class->max_rate) as TCA_HFSC_USC, all nested
 * inside TCA_OPTIONS.  A failed transaction is logged with a rate-limited
 * warning.
 *
 * NOTE(review): the initialization of the 'm1' and 'd' members of 'min' and
 * 'max' is not visible here; they must be set before the structs are sent --
 * confirm. */
3067 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3068 unsigned int parent, struct hfsc_class *class)
3072 struct tcmsg *tcmsg;
3073 struct ofpbuf request;
3074 struct tc_service_curve min, max;
3076 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3082 tcmsg->tcm_handle = handle;
3083 tcmsg->tcm_parent = parent;
3087 min.m2 = class->min_rate;
3091 max.m2 = class->max_rate;
3093 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3094 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3095 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3096 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3097 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3098 nl_msg_end_nested(&request, opt_offset);
3100 error = tc_transact(&request, NULL);
3102 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3103 "min-rate %ubps, max-rate %ubps (%s)",
3104 netdev_get_name(netdev),
3105 tc_get_major(handle), tc_get_minor(handle),
3106 tc_get_major(parent), tc_get_minor(parent),
3107 class->min_rate, class->max_rate, strerror(error));
/* Sets up HFSC on 'netdev' from the qdisc-level 'details':
 *
 *   1. hfsc_setup_qdisc__() creates the qdisc.
 *
 *   2. Class 1:fffe is created under 1:0 with the parameters that
 *      hfsc_parse_qdisc_details__() derives from 'details'.
 *
 *   3. hfsc_install__() records the resulting max_rate for 'netdev'.
 *
 * NOTE(review): the error checks between these steps are not visible here --
 * confirm that each step runs only if the previous one succeeded. */
3114 hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3117 struct hfsc_class class;
3119 error = hfsc_setup_qdisc__(netdev);
3125 hfsc_parse_qdisc_details__(netdev, details, &class);
3126 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3127 tc_make_handle(1, 0), &class);
3133 hfsc_install__(netdev, class.max_rate);
/* Builds the local HFSC state for 'netdev' from what the kernel reports.
 *
 * Class 1:fffe is queried for the qdisc-wide parameters and its max_rate is
 * passed to hfsc_install__().  The return value of that query is not used in
 * the visible code.  Then every class in the queue dump that
 * hfsc_parse_tcmsg__() accepts is recorded with hfsc_update_queue__().
 *
 * 'nlmsg' is unused. */
3138 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3141 struct nl_dump dump;
3142 struct hfsc_class hc;
3145 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3146 hfsc_install__(netdev, hc.max_rate);
3148 if (!start_queue_dump(netdev, &dump)) {
3152 while (nl_dump_next(&dump, &msg)) {
3153 unsigned int queue_id;
/* Messages that fail to parse are skipped rather than aborting the load. */
3155 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3156 hfsc_update_queue__(netdev, queue_id, &hc);
3160 nl_dump_done(&dump);
/* Tears down the HFSC state that embeds 'tc': every hfsc_class is removed
 * from 'hfsc->tc.queues'.  The _SAFE iterator is used because entries are
 * removed while iterating.
 *
 * NOTE(review): the statements that free each class and the hfsc itself are
 * not visible here -- confirm they exist. */
3165 hfsc_tc_destroy(struct tc *tc)
3168 struct hfsc_class *hc, *next;
3170 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3172 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3173 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Reports the qdisc-level configuration of 'netdev' by adding "max-rate" to
 * 'details'.  The value is 8 times the stored max_rate, formatted as a
 * decimal string allocated by xasprintf(). */
3182 hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3184 const struct hfsc *hfsc;
3185 hfsc = hfsc_get__(netdev);
3186 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
/* Applies new qdisc-level 'details' to 'netdev': class 1:fffe (parent 1:0) is
 * set up again with the parameters from hfsc_parse_qdisc_details__(), and the
 * new max_rate is stored in the local HFSC state.
 *
 * NOTE(review): the check guarding the final store is not visible here;
 * presumably it runs only when hfsc_setup_class__() succeeds -- confirm. */
3191 hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3194 struct hfsc_class class;
3196 hfsc_parse_qdisc_details__(netdev, details, &class);
3197 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3198 tc_make_handle(1, 0), &class);
3201 hfsc_get__(netdev)->max_rate = class.max_rate;
/* Adds the configuration of 'queue' to 'details'.
 *
 * "min-rate" is always added; "max-rate" is added only when it differs from
 * min_rate.  Both are written as 8 times the stored value, as decimal strings
 * allocated by xasprintf().
 *
 * 'netdev' is unused. */
3208 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3209 const struct tc_queue *queue, struct shash *details)
3211 const struct hfsc_class *hc;
3213 hc = hfsc_class_cast__(queue);
3214 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3215 if (hc->min_rate != hc->max_rate) {
3216 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
/* Configures queue 'queue_id' of 'netdev' from 'details'.
 *
 * The details are parsed with hfsc_parse_class_details__(), the kernel class
 * 1:(queue_id + 1) is set up under parent 1:fffe, and the local state is
 * updated with hfsc_update_queue__().
 *
 * NOTE(review): the error checks between these steps are not visible here --
 * confirm that a failure stops the sequence. */
3222 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3223 const struct shash *details)
3226 struct hfsc_class class;
3228 error = hfsc_parse_class_details__(netdev, details, &class);
3233 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3234 tc_make_handle(1, 0xfffe), &class);
3239 hfsc_update_queue__(netdev, queue_id, &class);
/* Deletes 'queue' from 'netdev': the kernel class 1:(queue_id + 1) is deleted
 * with tc_delete_class() and the queue is removed from 'hfsc->tc.queues'.
 *
 * NOTE(review): the check guarding the removal and the freeing of 'hc' are
 * not visible here -- confirm. */
3244 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3248 struct hfsc_class *hc;
3250 hc = hfsc_class_cast__(queue);
3251 hfsc = hfsc_get__(netdev);
3253 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3255 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Retrieves the statistics of 'queue' into 'stats' by querying the kernel for
 * class 1:(queue_id + 1) under parent 1:fffe.  Class options are not
 * requested (NULL is passed).  Returns whatever hfsc_query_class__()
 * returns. */
3262 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3263 struct netdev_queue_stats *stats)
3265 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3266 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one class message 'nlmsg' from a statistics dump.
 *
 * The handle and statistics are extracted with tc_parse_class().  If the
 * handle is 1:<minor> with 1 <= minor <= HFSC_N_QUEUES, 'cb' is called with
 * queue ID minor - 1, the statistics and 'aux'.  Other handles do not invoke
 * 'cb'.
 *
 * 'netdev' is unused. */
3270 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3271 const struct ofpbuf *nlmsg,
3272 netdev_dump_queue_stats_cb *cb, void *aux)
3274 struct netdev_queue_stats stats;
3275 unsigned int handle, major, minor;
3278 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3283 major = tc_get_major(handle);
3284 minor = tc_get_minor(handle);
3285 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3286 (*cb)(minor - 1, &stats, aux);
/* Operations table for the "linux-hfsc" QoS type, backed by the Linux "hfsc"
 * qdisc.  Initializers are positional, so their order must match the member
 * order of struct tc_ops. */
3291 static const struct tc_ops tc_ops_hfsc = {
3292 "hfsc", /* linux_name */
3293 "linux-hfsc", /* ovs_name */
3294 HFSC_N_QUEUES, /* n_queues */
3295 hfsc_tc_install, /* tc_install */
3296 hfsc_tc_load, /* tc_load */
3297 hfsc_tc_destroy, /* tc_destroy */
3298 hfsc_qdisc_get, /* qdisc_get */
3299 hfsc_qdisc_set, /* qdisc_set */
3300 hfsc_class_get, /* class_get */
3301 hfsc_class_set, /* class_set */
3302 hfsc_class_delete, /* class_delete */
3303 hfsc_class_get_stats, /* class_get_stats */
3304 hfsc_class_dump_stats /* class_dump_stats */
3307 /* "linux-default" traffic control class.
3309 * This class represents the default, unnamed Linux qdisc. It corresponds to
3310 * the "" (empty string) QoS type in the OVS database. */
/* Attaches a "default" tc object to 'netdev''s device.
 *
 * 'tc' is a function-level static, so one pointer is shared by every call and
 * every device that ends up pointing at it.  NOTE(review): the guard around
 * the allocation is not visible here; presumably it is allocated and
 * initialized only on first use -- confirm. */
3313 default_install__(struct netdev *netdev)
3315 struct netdev_dev_linux *netdev_dev =
3316 netdev_dev_linux_cast(netdev_get_dev(netdev));
3317 static struct tc *tc;
3320 tc = xmalloc(sizeof *tc);
3321 tc_init(tc, &tc_ops_default);
3323 netdev_dev->tc = tc;
/* tc_install hook for the default qdisc: only attaches the default tc state
 * to 'netdev' via default_install__().  'details' is unused. */
3327 default_tc_install(struct netdev *netdev,
3328 const struct shash *details OVS_UNUSED)
3330 default_install__(netdev);
/* tc_load hook for the default qdisc: only attaches the default tc state to
 * 'netdev' via default_install__().  'nlmsg' is unused. */
3335 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3337 default_install__(netdev);
/* Operations table for the default qdisc.  It has no Linux qdisc name, and
 * every hook visible below is null, including tc_destroy, so tc_del_qdisc()
 * never calls a destructor for this type.  Initializers are positional, so
 * their order must match the member order of struct tc_ops. */
3341 static const struct tc_ops tc_ops_default = {
3342 NULL, /* linux_name */
3347 NULL, /* tc_destroy */
3348 NULL, /* qdisc_get */
3349 NULL, /* qdisc_set */
3350 NULL, /* class_get */
3351 NULL, /* class_set */
3352 NULL, /* class_delete */
3353 NULL, /* class_get_stats */
3354 NULL /* class_dump_stats */
3357 /* "linux-other" traffic control class. */
/* tc_load hook for "linux-other": attaches a tc object initialized with
 * 'tc_ops_other' to 'netdev''s device.  'nlmsg' is unused.
 *
 * 'tc' is a function-level static, so one pointer is shared by every call and
 * every device that ends up pointing at it.  NOTE(review): the guard around
 * the allocation is not visible here; presumably it is allocated and
 * initialized only on first use -- confirm. */
3362 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3364 struct netdev_dev_linux *netdev_dev =
3365 netdev_dev_linux_cast(netdev_get_dev(netdev));
3366 static struct tc *tc;
3369 tc = xmalloc(sizeof *tc);
3370 tc_init(tc, &tc_ops_other);
3372 netdev_dev->tc = tc;
/* Operations table for the "linux-other" QoS type.  It has no Linux qdisc
 * name and no tc_install hook, and every other hook visible below is null.
 * Initializers are positional, so their order must match the member order of
 * struct tc_ops. */
3376 static const struct tc_ops tc_ops_other = {
3377 NULL, /* linux_name */
3378 "linux-other", /* ovs_name */
3380 NULL, /* tc_install */
3382 NULL, /* tc_destroy */
3383 NULL, /* qdisc_get */
3384 NULL, /* qdisc_set */
3385 NULL, /* class_get */
3386 NULL, /* class_set */
3387 NULL, /* class_delete */
3388 NULL, /* class_get_stats */
3389 NULL /* class_dump_stats */
3392 /* Traffic control. */
3394 /* Number of kernel "tc" ticks per second.  Computed as a * c / b from
 * the first three values read from /proc/net/psched. */
3395 static double ticks_per_s;
3397 /* Number of kernel "jiffies" per second. This is used for the purpose of
3398 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3399 * one jiffy's worth of data.
3401 * There are two possibilities here:
3403 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3404 * approximate range of 100 to 1024. That means that we really need to
3405 * make sure that the qdisc can buffer that much data.
3407 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3408 * has finely granular timers and there's no need to fudge additional room
3409 * for buffers. (There's no extra effort needed to implement that: the
3410 * large 'buffer_hz' is used as a divisor, so practically any number will
3411 * come out as 0 in the division. Small integer results in the case of
3412 * really high dividends won't have any real effect anyhow.) */
3414 static unsigned int buffer_hz;
3416 /* Returns tc handle 'major':'minor', built with TC_H_MAKE() from 'major'
 * shifted left by 16 bits and 'minor'. */
3418 tc_make_handle(unsigned int major, unsigned int minor)
3420 return TC_H_MAKE(major << 16, minor);
3423 /* Returns the major number from 'handle': the TC_H_MAJ() part shifted
 * right by 16 bits.  This is the inverse of the shift in tc_make_handle(). */
3425 tc_get_major(unsigned int handle)
3427 return TC_H_MAJ(handle) >> 16;
3430 /* Returns the minor number from 'handle', as extracted by TC_H_MIN(). */
3432 tc_get_minor(unsigned int handle)
3434 return TC_H_MIN(handle);
/* Starts a traffic-control Netlink request for 'netdev' in 'request'.
 *
 * 'request' is initialized with 512 bytes of initial space and receives a
 * Netlink header of the given 'type' with flags NLM_F_REQUEST | 'flags',
 * followed by a zeroed struct tcmsg.  In the tcmsg, tcm_family is AF_UNSPEC
 * and tcm_ifindex is the interface index obtained from get_ifindex().
 * tcm_handle and tcm_parent are left zero for the caller to fill in.
 *
 * NOTE(review): the return statements are not visible here; presumably the
 * new tcmsg is returned, or NULL if get_ifindex() fails -- confirm. */
3437 static struct tcmsg *
3438 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3439 struct ofpbuf *request)
3441 struct tcmsg *tcmsg;
3445 error = get_ifindex(netdev, &ifindex);
3450 ofpbuf_init(request, 512);
3451 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3452 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3453 tcmsg->tcm_family = AF_UNSPEC;
3454 tcmsg->tcm_ifindex = ifindex;
3455 /* Caller should fill in tcmsg->tcm_handle. */
3456 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on 'rtnl_sock' with nl_sock_transact(), passing 'replyp'
 * through for the reply, and then uninitializes 'request'.  The request
 * buffer is therefore consumed whether or not the transaction succeeds.
 *
 * NOTE(review): the return statement is not visible here; presumably 'error'
 * is returned -- confirm. */
3462 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3464 int error = nl_sock_transact(rtnl_sock, request, replyp);
3465 ofpbuf_uninit(request);
3472 /* The values in psched are not individually very meaningful, but they are
3473 * important. The tables below show some values seen in the wild.
3477 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3478 * (Before that, there are hints that it was 1000000000.)
3480 * - "d" can be unrealistically large, see the comment on 'buffer_hz' above.
3484 * -----------------------------------
3485 * [1] 000c8000 000f4240 000f4240 00000064
3486 * [2] 000003e8 00000400 000f4240 3b9aca00
3487 * [3] 000003e8 00000400 000f4240 3b9aca00
3488 * [4] 000003e8 00000400 000f4240 00000064
3489 * [5] 000003e8 00000040 000f4240 3b9aca00
3490 * [6] 000003e8 00000040 000f4240 000000f9
3492 * a b c d ticks_per_s buffer_hz
3493 * ------- --------- ---------- ------------- ----------- -------------
3494 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3495 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3496 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3497 * [4] 1,000 1,024 1,000,000 100 976,562 100
3498 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3499 * [6] 1,000 64 1,000,000 249 15,625,000 249
3501 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3502 * [2] 2.6.26-1-686-bigmem from Debian lenny
3503 * [3] 2.6.26-2-sparc64 from Debian lenny
3504 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3505 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3506 * [6] 2.6.34 from kernel.org on KVM */
3508 static const char fn[] = "/proc/net/psched";
3509 unsigned int a, b, c, d;
3515 stream = fopen(fn, "r");
3517 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3521 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3522 VLOG_WARN("%s: read failed", fn);
3526 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3530 VLOG_WARN("%s: invalid scheduler parameters", fn);
3534 ticks_per_s = (double) a * c / b;
3538 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3541 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3544 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3545 * rate of 'rate' bytes per second. */
/* NOTE(review): 'rate * ticks' is an unsigned int multiplication, so the
 * product can wrap before it is divided by the double 'ticks_per_s'.
 * Consider widening it, e.g. by converting 'rate' to double first. */
3547 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3552 return (rate * ticks) / ticks_per_s;
3555 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3556 * rate of 'rate' bytes per second. */
/* Returns 0 when 'rate' is 0, which avoids a division by zero.  The cast
 * converts 'ticks_per_s' to unsigned long long before the multiplication, so
 * any fractional part of 'ticks_per_s' is discarded and the product is
 * computed in integer arithmetic. */
3558 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3563 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3566 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3567 * a transmission rate of 'rate' bytes per second. */
/* This is an integer division by 'buffer_hz', so a 'buffer_hz' larger than
 * 'rate' yields 0.  NOTE(review): 'buffer_hz' must be nonzero by this point;
 * whatever initializes it is not visible here -- confirm. */
3569 tc_buffer_per_jiffy(unsigned int rate)
3574 return rate / buffer_hz;
3577 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3578 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3579 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3580 * stores NULL into it if it is absent.
 *
3582 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
 * 'msg'.
 *
 * Attributes are parsed starting just past the Netlink header and the
 * struct tcmsg.  TCA_KIND is a mandatory string; TCA_OPTIONS is an optional
 * nested attribute.  A message that does not satisfy this policy is logged
 * with a rate-limited warning.
 *
3585 * Returns 0 if successful, otherwise a positive errno value. */
3587 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3588 struct nlattr **options)
3590 static const struct nl_policy tca_policy[] = {
3591 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3592 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3594 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3596 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3597 tca_policy, ta, ARRAY_SIZE(ta))) {
3598 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3603 *kind = nl_attr_get_string(ta[TCA_KIND]);
3607 *options = ta[TCA_OPTIONS];
3622 /* Given Netlink 'msg' that describes a class, extracts its tc handle (the
3623 * tcm_handle field of its struct tcmsg) into '*handlep', its TCA_OPTIONS attribute
3624 * into '*options', and its queue statistics into '*stats'. Any of the output
3625 * arguments may be null.
 *
 * Both TCA_OPTIONS and TCA_STATS2 are mandatory nested attributes.  Within
 * TCA_STATS2, the basic byte and packet counters become 'tx_bytes' and
 * 'tx_packets', and the queue drop counter becomes 'tx_errors'.
 *
 * One visible statement zeroes '*stats'.  NOTE(review): the path it belongs
 * to is not visible here; presumably it is the failure path -- confirm.
 *
3627 * Returns 0 if successful, otherwise a positive errno value. */
3629 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3630 struct nlattr **options, struct netdev_queue_stats *stats)
3632 static const struct nl_policy tca_policy[] = {
3633 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3634 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3636 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3638 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3639 tca_policy, ta, ARRAY_SIZE(ta))) {
3640 VLOG_WARN_RL(&rl, "failed to parse class message");
3645 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3646 *handlep = tc->tcm_handle;
3650 *options = ta[TCA_OPTIONS];
3654 const struct gnet_stats_queue *gsq;
3655 struct gnet_stats_basic gsb;
3657 static const struct nl_policy stats_policy[] = {
3658 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3659 .min_len = sizeof gsb },
3660 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3661 .min_len = sizeof *gsq },
3663 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3665 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3666 sa, ARRAY_SIZE(sa))) {
3667 VLOG_WARN_RL(&rl, "failed to parse class stats");
3671 /* Alignment issues screw up the length of struct gnet_stats_basic on
3672 * some arch/bitsize combinations. Newer versions of Linux have a
3673 * struct gnet_stats_basic_packed, but we can't depend on that. The
3674 * easiest thing to do is just to make a copy. */
3675 memset(&gsb, 0, sizeof gsb);
3676 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3677 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3678 stats->tx_bytes = gsb.bytes;
3679 stats->tx_packets = gsb.packets;
3681 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3682 stats->tx_errors = gsq->drops;
3692 memset(stats, 0, sizeof *stats);
3697 /* Queries the kernel for the class with identifier 'handle' and parent 'parent'. */
/* Sends an RTM_GETTCLASS request (with NLM_F_ECHO) for class 'handle' under
 * 'parent' on 'netdev'.  The reply is passed back through 'replyp' by
 * tc_transact().  A failure is logged with a rate-limited warning that names
 * the device, the handle and the parent.
 *
 * NOTE(review): the null check on the tc_make_request() result and the
 * return statement are not visible here -- confirm. */
3700 tc_query_class(const struct netdev *netdev,
3701 unsigned int handle, unsigned int parent,
3702 struct ofpbuf **replyp)
3704 struct ofpbuf request;
3705 struct tcmsg *tcmsg;
3708 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3712 tcmsg->tcm_handle = handle;
3713 tcmsg->tcm_parent = parent;
3715 error = tc_transact(&request, replyp);
3717 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3718 netdev_get_name(netdev),
3719 tc_get_major(handle), tc_get_minor(handle),
3720 tc_get_major(parent), tc_get_minor(parent),
3726 /* Equivalent to "tc class del dev <name> handle <handle>". */
/* Sends an RTM_DELTCLASS request for class 'handle' with parent 0 and no
 * extra Netlink flags; no reply is requested.  A failure is logged with a
 * rate-limited warning.
 *
 * NOTE(review): the null check on the tc_make_request() result and the
 * return statement are not visible here -- confirm. */
3728 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3730 struct ofpbuf request;
3731 struct tcmsg *tcmsg;
3734 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3738 tcmsg->tcm_handle = handle;
3739 tcmsg->tcm_parent = 0;
3741 error = tc_transact(&request, NULL);
3743 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3744 netdev_get_name(netdev),
3745 tc_get_major(handle), tc_get_minor(handle),
3751 /* Equivalent to "tc qdisc del dev <name> root". */
/* Sends an RTM_DELQDISC request for handle 1:0 with parent TC_H_ROOT.  When
 * the deletion succeeds and the device has a cached tc object, that object's
 * tc_destroy hook is called if it has one, and the cached pointer is
 * cleared.
 *
 * NOTE(review): the statement inside the EINVAL branch is not visible here.
 * Judging by the comment it carries, it presumably clears 'error' -- confirm. */
3753 tc_del_qdisc(struct netdev *netdev)
3755 struct netdev_dev_linux *netdev_dev =
3756 netdev_dev_linux_cast(netdev_get_dev(netdev));
3757 struct ofpbuf request;
3758 struct tcmsg *tcmsg;
3761 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
3765 tcmsg->tcm_handle = tc_make_handle(1, 0);
3766 tcmsg->tcm_parent = TC_H_ROOT;
3768 error = tc_transact(&request, NULL);
3769 if (error == EINVAL) {
3770 /* EINVAL probably means that the default qdisc was in use, in which
3771 * case we've accomplished our purpose. */
3774 if (!error && netdev_dev->tc) {
3775 if (netdev_dev->tc->ops->tc_destroy) {
3776 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3778 netdev_dev->tc = NULL;
3783 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3784 * kernel to determine what they are. Returns 0 if successful, otherwise a
3785 * positive errno value. */
/* Selection of the tc_ops to instantiate, as far as visible here:
 *
 *   - The query succeeds: the qdisc kind is looked up with
 *     tc_lookup_linux_name().  'tc_ops_other' is assigned on two further
 *     paths, one after tc_parse_qdisc() and one after an "unknown qdisc"
 *     log message; the conditions are not visible -- confirm.
 *
 *   - The query fails with ENOENT: 'tc_ops_default'.
 *
 *   - Any other failure: a warning is logged and 'tc_ops_other' is used.
 *
 * The chosen ops' tc_load hook is then called; the assertion requires that it
 * attached a tc to the device exactly when it reported success.  The query's
 * error takes precedence over the load error in the return value. */
3787 tc_query_qdisc(const struct netdev *netdev)
3789 struct netdev_dev_linux *netdev_dev =
3790 netdev_dev_linux_cast(netdev_get_dev(netdev));
3791 struct ofpbuf request, *qdisc;
3792 const struct tc_ops *ops;
3793 struct tcmsg *tcmsg;
3797 if (netdev_dev->tc) {
3801 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3802 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3803 * 2.6.35 without that fix backported to it.
3805 * To avoid the OOPS, we must not make a request that would attempt to dump
3806 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3807 * few others. There are a few ways that I can see to do this, but most of
3808 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3809 * technique chosen here is to assume that any non-default qdisc that we
3810 * create will have a class with handle 1:0. The built-in qdiscs only have
3811 * a class with handle 0:0.
3813 * We could check for Linux 2.6.35+ and use a more straightforward method
3815 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
3819 tcmsg->tcm_handle = tc_make_handle(1, 0);
3820 tcmsg->tcm_parent = 0;
3822 /* Figure out what tc class to instantiate. */
3823 error = tc_transact(&request, &qdisc);
3827 error = tc_parse_qdisc(qdisc, &kind, NULL);
3829 ops = &tc_ops_other;
3831 ops = tc_lookup_linux_name(kind);
3833 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3834 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3836 ops = &tc_ops_other;
3839 } else if (error == ENOENT) {
3840 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3841 * other entity that doesn't have a handle 1:0. We will assume
3842 * that it's the system default qdisc. */
3843 ops = &tc_ops_default;
3846 /* Who knows? Maybe the device got deleted. */
3847 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3848 netdev_get_name(netdev), strerror(error));
3849 ops = &tc_ops_other;
3852 /* Instantiate it. */
3853 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
3854 assert((load_error == 0) == (netdev_dev->tc != NULL));
3855 ofpbuf_delete(qdisc);
3857 return error ? error : load_error;
3860 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3861 approximate the time to transmit packets of various lengths. For an MTU of
3862 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3863 represents two possible packet lengths; for a MTU of 513 through 1024, four
3864 possible lengths; and so on.
3866 Returns, for the specified 'mtu', the number of bits that packet lengths
3867 need to be shifted right to fit within such a 256-entry table. */
/* ETH_HEADER_LEN + VLAN_HEADER_LEN is added to 'mtu' before the shift count
 * is computed.  The loop counts iterations while the working value is at
 * least 256.
 *
 * NOTE(review): the loop body and the condition under which 'mtu' is replaced
 * by ETH_PAYLOAD_MAX are not visible here -- confirm. */
3869 tc_calc_cell_log(unsigned int mtu)
3874 mtu = ETH_PAYLOAD_MAX;
3876 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
3878 for (cell_log = 0; mtu >= 256; cell_log++) {
3885 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU of 'mtu'. */
/* Zeroes '*rate', then sets 'cell_log' from tc_calc_cell_log('mtu') and 'mpu'
 * to ETH_TOTAL_MIN.  'overhead' and 'cell_align' are not assigned and keep
 * the zero from the memset().
 *
 * NOTE(review): the assignment of 'rate->rate' from 'Bps' is not visible
 * here -- confirm it exists. */
3888 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
3890 memset(rate, 0, sizeof *rate);
3891 rate->cell_log = tc_calc_cell_log(mtu);
3892 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3893 /* rate->cell_align = 0; */ /* distro headers. */
3894 rate->mpu = ETH_TOTAL_MIN;
3898 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
3899 * attribute of the specified "type".
3901 * See tc_calc_cell_log() above for a description of "rtab"s. */
/* The attribute payload is TC_RTAB_SIZE bytes and is filled in place.  Entry
 * 'i' is the number of ticks needed, at 'rate->rate', to send a packet of
 * (i + 1) << rate->cell_log bytes.  That size is the largest one mapping to
 * the entry, and it is raised to at least 'rate->mpu'. */
3903 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
3908 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
3909 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
3910 unsigned packet_size = (i + 1) << rate->cell_log;
3911 if (packet_size < rate->mpu) {
3912 packet_size = rate->mpu;
3914 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
3918 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
3919 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
3920 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
 * 0 is fine.) */
/* The burst handed to tc_bytes_to_ticks() is the larger of 'burst_bytes' and
 * tc_buffer_per_jiffy(Bps) + mtu, so a request below that floor (including 0)
 * is raised to it.
 * NOTE(review): the function's return type is not visible in this listing. */
3923 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
3925 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
3926 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
3929 /* Public utility functions. */
/* Expands to a member-by-member copy of the 21 counters listed below from
 * '*src' to '*dst'.
 *
 * The macro takes no arguments: 'src' and 'dst' are whatever variables of
 * those names are in scope where it is expanded.  The last assignment has no
 * trailing semicolon, so the user of the macro supplies it. */
3931 #define COPY_NETDEV_STATS \
3932 dst->rx_packets = src->rx_packets; \
3933 dst->tx_packets = src->tx_packets; \
3934 dst->rx_bytes = src->rx_bytes; \
3935 dst->tx_bytes = src->tx_bytes; \
3936 dst->rx_errors = src->rx_errors; \
3937 dst->tx_errors = src->tx_errors; \
3938 dst->rx_dropped = src->rx_dropped; \
3939 dst->tx_dropped = src->tx_dropped; \
3940 dst->multicast = src->multicast; \
3941 dst->collisions = src->collisions; \
3942 dst->rx_length_errors = src->rx_length_errors; \
3943 dst->rx_over_errors = src->rx_over_errors; \
3944 dst->rx_crc_errors = src->rx_crc_errors; \
3945 dst->rx_frame_errors = src->rx_frame_errors; \
3946 dst->rx_fifo_errors = src->rx_fifo_errors; \
3947 dst->rx_missed_errors = src->rx_missed_errors; \
3948 dst->tx_aborted_errors = src->tx_aborted_errors; \
3949 dst->tx_carrier_errors = src->tx_carrier_errors; \
3950 dst->tx_fifo_errors = src->tx_fifo_errors; \
3951 dst->tx_heartbeat_errors = src->tx_heartbeat_errors; \
3952 dst->tx_window_errors = src->tx_window_errors
3954 /* Copies 'src' into 'dst', performing format conversion in the process. */
/* NOTE(review): the body of this function is not visible in this listing.
 * Presumably it expands COPY_NETDEV_STATS, which would copy the counters that
 * macro names from a 'struct rtnl_link_stats' into a 'struct netdev_stats' --
 * confirm. */
3956 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
3957 const struct rtnl_link_stats *src)
3962 /* Copies 'src' into 'dst', performing format conversion in the process. */
/* NOTE(review): the body of this function is not visible in this listing.
 * Presumably it expands COPY_NETDEV_STATS, which would copy the counters that
 * macro names from a 'struct rtnl_link_stats64' into a 'struct netdev_stats'
 * -- confirm. */
3964 netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
3965 const struct rtnl_link_stats64 *src)
3970 /* Copies 'src' into 'dst', performing format conversion in the process. */
3972 netdev_stats_to_rtnl_link_stats64(struct rtnl_link_stats64 *dst,
3973 const struct netdev_stats *src)
/* 'rx_compressed' and 'tx_compressed' are not among the members that
 * COPY_NETDEV_STATS copies, so they are given an explicit 0 here.
 * NOTE(review): whatever precedes these two statements is not visible in this
 * listing; presumably it is an expansion of COPY_NETDEV_STATS -- confirm. */
3976 dst->rx_compressed = 0;
3977 dst->tx_compressed = 0;
3980 /* Utility functions. */
/* Queries the counters of the interface whose index is 'ifindex' by sending
 * an RTM_GETLINK request on 'rtnl_sock', and converts the IFLA_STATS attribute
 * of the reply into '*stats' with netdev_stats_from_rtnl_link_stats().
 *
 * NOTE(review): the return statements are not visible in this listing, so the
 * values returned on success and on each failure path are not described
 * here. */
3983 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
3985 /* Policy for RTNLGRP_LINK messages.
3987 * There are *many* more fields in these messages, but currently we only
3988 * care about these fields. */
3989 static const struct nl_policy rtnlgrp_link_policy[] = {
3990 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
3991 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
3992 .min_len = sizeof(struct rtnl_link_stats) },
3995 struct ofpbuf request;
3996 struct ofpbuf *reply;
3997 struct ifinfomsg *ifi;
3998 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
/* Build the request: a netlink header followed by a zero-filled ifinfomsg
 * whose 'ifi_index' selects the interface. */
4001 ofpbuf_init(&request, 0);
4002 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4003 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4004 ifi->ifi_family = PF_UNSPEC;
4005 ifi->ifi_index = ifindex;
4006 error = nl_sock_transact(rtnl_sock, &request, &reply);
/* The request buffer is released right after the transaction, whatever its
 * outcome. */
4007 ofpbuf_uninit(&request);
/* The attributes start after the netlink header and the ifinfomsg.  A reply
 * that nl_policy_parse() rejects is freed. */
4012 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4013 rtnlgrp_link_policy,
4014 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4015 ofpbuf_delete(reply);
/* IFLA_STATS is marked optional in the policy above, so a reply can pass
 * parsing and still lack it; that case is logged and the reply is freed. */
4019 if (!attrs[IFLA_STATS]) {
4020 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4021 ofpbuf_delete(reply);
/* Convert the attribute payload.  The policy asks for a payload of at least
 * sizeof(struct rtnl_link_stats) bytes (presumably enforced by
 * nl_policy_parse() -- confirm). */
4025 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4027 ofpbuf_delete(reply);
/* Reads "/proc/net/dev" line by line, looking for the row that belongs to
 * 'netdev_name', and parses that row's counters into '*stats'.
 *
 * NOTE(review): several lines of this function are not visible in this
 * listing, among them the local declarations, part of the sscanf() format and
 * argument list, and every return statement. */
4033 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4035 static const char fn[] = "/proc/net/dev";
4040 stream = fopen(fn, "r");
4042 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4047 while (fgets(line, sizeof line, stream)) {
/* X64 is the scanf conversion for one uint64_t field.  Each of the two format
 * rows that follow stores seven such fields and then skips one more ("%*u"
 * assigns nothing). */
4050 #define X64 "%"SCNu64
4053 X64 X64 X64 X64 X64 X64 X64 "%*u"
4054 X64 X64 X64 X64 X64 X64 X64 "%*u",
4060 &stats->rx_fifo_errors,
4061 &stats->rx_frame_errors,
4067 &stats->tx_fifo_errors,
4069 &stats->tx_carrier_errors) != 15) {
/* Reached when sscanf() did not report exactly 15 conversions.
 * NOTE(review): the format rows visible above store 14 counters; the 15th
 * conversion is presumably the device name read into 'devname' by a format
 * line that is not visible here -- confirm. */
4070 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4071 } else if (!strcmp(devname, netdev_name)) {
/* For the matching row, these seven counters are assigned UINT64_MAX instead
 * of a parsed value.
 * NOTE(review): presumably UINT64_MAX marks a counter as unavailable --
 * confirm against the users of 'struct netdev_stats'. */
4072 stats->rx_length_errors = UINT64_MAX;
4073 stats->rx_over_errors = UINT64_MAX;
4074 stats->rx_crc_errors = UINT64_MAX;
4075 stats->rx_missed_errors = UINT64_MAX;
4076 stats->tx_aborted_errors = UINT64_MAX;
4077 stats->tx_heartbeat_errors = UINT64_MAX;
4078 stats->tx_window_errors = UINT64_MAX;
/* Logged when no row was accepted for 'netdev_name' (assuming the matching
 * branch above leaves the function, which is not visible here). */
4084 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Reads the interface flags of 'netdev' with the SIOCGIFFLAGS ioctl, issued
 * through netdev_linux_do_ioctl() (which writes the device name into 'ifr'),
 * and stores 'ifr.ifr_flags' in '*flags'.
 * NOTE(review): the end of the ioctl call and the return statement are not
 * visible in this listing. */
4090 get_flags(const struct netdev *netdev, int *flags)
4095 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
4097 *flags = ifr.ifr_flags;
/* Sets the interface flags of 'netdev' to 'flags' with the SIOCSIFFLAGS
 * ioctl, issued through netdev_linux_do_ioctl() (which writes the device name
 * into 'ifr'), and returns that call's result.
 * NOTE(review): the end of the ioctl call is not visible in this listing. */
4102 set_flags(struct netdev *netdev, int flags)
4106 ifr.ifr_flags = flags;
4107 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
/* Asks the kernel for the interface index of 'netdev_name' with the
 * SIOCGIFINDEX ioctl on 'af_inet_sock' and returns 'ifr.ifr_ifindex'.  Each
 * call bumps the netdev_get_ifindex coverage counter; a failing ioctl is
 * logged through VLOG_WARN_RL.
 * NOTE(review): the value returned after a failed ioctl is not visible in
 * this listing. */
4112 do_get_ifindex(const char *netdev_name)
4116 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4117 COVERAGE_INC(netdev_get_ifindex);
4118 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4119 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4120 netdev_name, strerror(errno));
4123 return ifr.ifr_ifindex;
/* Stores the interface index of 'netdev_' in '*ifindexp'.  The index is kept
 * in the device's netdev_dev_linux: do_get_ifindex() is called only while the
 * VALID_IFINDEX bit of 'cache_valid' is clear, after which the bit is set and
 * the stored value is reused.
 * NOTE(review): the handling of a do_get_ifindex() failure and this
 * function's return value are not visible in this listing. */
4127 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4129 struct netdev_dev_linux *netdev_dev =
4130 netdev_dev_linux_cast(netdev_get_dev(netdev_));
4132 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4133 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4137 netdev_dev->cache_valid |= VALID_IFINDEX;
4138 netdev_dev->ifindex = ifindex;
4140 *ifindexp = netdev_dev->ifindex;
/* Copies into 'ea' the hardware address that the SIOCGIFHWADDR ioctl on
 * 'af_inet_sock' reports for 'netdev_name'.  Each call bumps the
 * netdev_get_hwaddr coverage counter; a failing ioctl is logged with VLOG_ERR.
 * NOTE(review): the return statements are not visible in this listing. */
4145 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4150 memset(&ifr, 0, sizeof ifr);
4151 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4152 COVERAGE_INC(netdev_get_hwaddr);
4153 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4154 VLOG_ERR("ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4155 netdev_name, strerror(errno));
/* An address family other than AF_UNSPEC or ARPHRD_ETHER is warned about.
 * NOTE(review): from the lines visible here the copy below still appears to
 * run in that case -- confirm. */
4158 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4159 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4160 VLOG_WARN("%s device has unknown hardware address family %d",
4161 netdev_name, hwaddr_family);
/* Only the first ETH_ADDR_LEN bytes of 'sa_data' are copied out. */
4163 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware address of 'netdev_name' to the ETH_ADDR_LEN bytes at
 * 'mac', tagged with address family 'hwaddr_family', using the SIOCSIFHWADDR
 * ioctl on 'af_inet_sock'.  Each call bumps the netdev_set_hwaddr coverage
 * counter; a failing ioctl is logged with VLOG_ERR.
 * NOTE(review): the return statements are not visible in this listing. */
4168 set_etheraddr(const char *netdev_name, int hwaddr_family,
4169 const uint8_t mac[ETH_ADDR_LEN])
/* Start from a zero-filled request so that the members not set below, and the
 * bytes of 'sa_data' past the address, are zero. */
4173 memset(&ifr, 0, sizeof ifr);
4174 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4175 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4176 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4177 COVERAGE_INC(netdev_set_hwaddr);
4178 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4179 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4180 netdev_name, strerror(errno));
/* Issues an SIOCETHTOOL ioctl on 'af_inet_sock' for device 'name', with
 * 'ecmd' attached as the request data.  In the lines visible here 'cmd_name'
 * is used only in the failure log message.
 * NOTE(review): no visible line reads 'cmd'; presumably it is stored into
 * '*ecmd' on a line this listing omits -- confirm.  The return statements are
 * not visible either. */
4187 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4188 int cmd, const char *cmd_name)
4192 memset(&ifr, 0, sizeof ifr);
4193 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4194 ifr.ifr_data = (caddr_t) ecmd;
4197 COVERAGE_INC(netdev_ethtool);
4198 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
/* Failure path: every errno except EOPNOTSUPP is logged. */
4201 if (errno != EOPNOTSUPP) {
4202 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4203 "failed: %s", cmd_name, name, strerror(errno));
4205 /* The device doesn't support this operation. That's pretty
4206 * common, so there's no point in logging anything. */
/* Runs ioctl 'cmd' on 'af_inet_sock' with '*ifr' as its argument.  The device
 * name is written into 'ifr->ifr_name' here, so callers fill in only the
 * members specific to their request.  A failing ioctl is logged at debug level
 * through VLOG_DBG_RL, with 'cmd_name' identifying the request.
 * NOTE(review): the end of the log call and the return statements are not
 * visible in this listing. */
4213 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4214 const char *cmd_name)
4216 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4217 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4218 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
/* Fetches an IPv4 address of 'netdev' with ioctl 'cmd', issued through
 * netdev_linux_do_ioctl(), and stores it in '*ip'.  'cmd_name' is passed on
 * to netdev_linux_do_ioctl().
 * NOTE(review): the return statement is not visible in this listing. */
4226 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4227 int cmd, const char *cmd_name)
/* Mark the request's address as AF_INET before issuing the ioctl. */
4232 ifr.ifr_addr.sa_family = AF_INET;
4233 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
/* Read 'ifr_addr' back as a sockaddr_in and copy out its address.
 * NOTE(review): the condition guarding these two lines is not visible;
 * presumably they run only when 'error' is 0 -- confirm. */
4235 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4236 *ip = sin->sin_addr;
4241 /* Returns an AF_PACKET raw socket or a negative errno value. */
4243 af_packet_sock(void)
4245 static int sock = INT_MIN;
4247 if (sock == INT_MIN) {
4248 sock = socket(AF_PACKET, SOCK_RAW, 0);
4250 set_nonblocking(sock);
4253 VLOG_ERR("failed to create packet socket: %s", strerror(errno));