2 * Copyright (c) 2009, 2010, 2011 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
24 #include <arpa/inet.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_sched.h>
33 #include <linux/rtnetlink.h>
34 #include <linux/sockios.h>
35 #include <linux/version.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <netpacket/packet.h>
40 #include <net/ethernet.h>
42 #include <linux/if_tunnel.h>
43 #include <net/if_arp.h>
44 #include <net/if_packet.h>
45 #include <net/route.h>
46 #include <netinet/in.h>
53 #include "dpif-linux.h"
54 #include "dynamic-string.h"
55 #include "fatal-signal.h"
58 #include "netdev-provider.h"
59 #include "netdev-vport.h"
61 #include "netlink-socket.h"
63 #include "openflow/openflow.h"
65 #include "poll-loop.h"
66 #include "rtnetlink.h"
67 #include "rtnetlink-link.h"
68 #include "socket-util.h"
74 VLOG_DEFINE_THIS_MODULE(netdev_linux);
76 COVERAGE_DEFINE(netdev_get_vlan_vid);
77 COVERAGE_DEFINE(netdev_set_policing);
78 COVERAGE_DEFINE(netdev_arp_lookup);
79 COVERAGE_DEFINE(netdev_get_ifindex);
80 COVERAGE_DEFINE(netdev_get_hwaddr);
81 COVERAGE_DEFINE(netdev_set_hwaddr);
82 COVERAGE_DEFINE(netdev_ethtool);
84 /* These were introduced in Linux 2.6.14, so they might be missing if we have
86 #ifndef ADVERTISED_Pause
87 #define ADVERTISED_Pause (1 << 13)
89 #ifndef ADVERTISED_Asym_Pause
90 #define ADVERTISED_Asym_Pause (1 << 14)
93 /* These were introduced in Linux 2.6.24, so they might be missing if we
94 * have old headers. */
95 #ifndef ETHTOOL_GFLAGS
96 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
98 #ifndef ETHTOOL_SFLAGS
99 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
102 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
105 #define TC_RTAB_SIZE 1024
108 static struct rtnetlink_notifier netdev_linux_cache_notifier;
109 static int cache_notifier_refcount;
112 VALID_IFINDEX = 1 << 0,
113 VALID_ETHERADDR = 1 << 1,
117 VALID_CARRIER = 1 << 5,
118 VALID_IS_PSEUDO = 1 << 6, /* Represents is_internal and is_tap. */
119 VALID_POLICING = 1 << 7,
120 VALID_HAVE_VPORT_STATS = 1 << 8
128 /* Traffic control. */
130 /* An instance of a traffic control class. Always associated with a particular
133 * Each TC implementation subclasses this with whatever additional data it
136 const struct tc_ops *ops;
137 struct hmap queues; /* Contains "struct tc_queue"s.
138 * Read by generic TC layer.
139 * Written only by TC implementation. */
142 /* One traffic control queue.
144 * Each TC implementation subclasses this with whatever additional data it
147 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
148 unsigned int queue_id; /* OpenFlow queue ID. */
151 /* A particular kind of traffic control. Each implementation generally maps to
152 * one particular Linux qdisc class.
154 * The functions below return 0 if successful or a positive errno value on
155 * failure, except where otherwise noted. All of them must be provided, except
156 * where otherwise noted. */
158 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
159 * This is null for tc_ops_default and tc_ops_other, for which there are no
160 * appropriate values. */
161 const char *linux_name;
163 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
164 const char *ovs_name;
166 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
167 * queues. The queues are numbered 0 through n_queues - 1. */
168 unsigned int n_queues;
170 /* Called to install this TC class on 'netdev'. The implementation should
171 * make the Netlink calls required to set up 'netdev' with the right qdisc
172 * and configure it according to 'details'. The implementation may assume
173 * that the current qdisc is the default; that is, there is no need for it
174 * to delete the current qdisc before installing itself.
176 * The contents of 'details' should be documented as valid for 'ovs_name'
177 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
178 * (which is built as ovs-vswitchd.conf.db(8)).
180 * This function must return 0 if and only if it sets 'netdev->tc' to an
181 * initialized 'struct tc'.
183 * (This function is null for tc_ops_other, which cannot be installed. For
184 * other TC classes it should always be nonnull.) */
185 int (*tc_install)(struct netdev *netdev, const struct shash *details);
187 /* Called when the netdev code determines (through a Netlink query) that
188 * this TC class's qdisc is installed on 'netdev', but we didn't install
189 * it ourselves and so don't know any of the details.
191 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
192 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
193 * implementation should parse the other attributes of 'nlmsg' as
194 * necessary to determine its configuration. If necessary it should also
195 * use Netlink queries to determine the configuration of queues on
198 * This function must return 0 if and only if it sets 'netdev->tc' to an
199 * initialized 'struct tc'. */
200 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
202 /* Destroys the data structures allocated by the implementation as part of
203 * 'tc'. (This includes destroying 'tc->queues' by calling
206 * The implementation should not need to perform any Netlink calls. If
207 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
208 * (But it may not be desirable.)
210 * This function may be null if 'tc' is trivial. */
211 void (*tc_destroy)(struct tc *tc);
213 /* Retrieves details of 'netdev->tc' configuration into 'details'.
215 * The implementation should not need to perform any Netlink calls, because
216 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
217 * cached the configuration.
219 * The contents of 'details' should be documented as valid for 'ovs_name'
220 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
221 * (which is built as ovs-vswitchd.conf.db(8)).
223 * This function may be null if 'tc' is not configurable.
225 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
227 /* Reconfigures 'netdev->tc' according to 'details', performing any
228 * required Netlink calls to complete the reconfiguration.
230 * The contents of 'details' should be documented as valid for 'ovs_name'
231 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
232 * (which is built as ovs-vswitchd.conf.db(8)).
234 * This function may be null if 'tc' is not configurable.
236 int (*qdisc_set)(struct netdev *, const struct shash *details);
238 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
239 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
241 * The contents of 'details' should be documented as valid for 'ovs_name'
242 * in the "other_config" column in the "Queue" table in
243 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
245 * The implementation should not need to perform any Netlink calls, because
246 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
247 * cached the queue configuration.
249 * This function may be null if 'tc' does not have queues ('n_queues' is
251 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
252 struct shash *details);
254 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
255 * 'details', performing any required Netlink calls to complete the
256 * reconfiguration. The caller ensures that 'queue_id' is less than
259 * The contents of 'details' should be documented as valid for 'ovs_name'
260 * in the "other_config" column in the "Queue" table in
261 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
263 * This function may be null if 'tc' does not have queues or its queues are
264 * not configurable. */
265 int (*class_set)(struct netdev *, unsigned int queue_id,
266 const struct shash *details);
268 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
269 * tc_queue's within 'netdev->tc->queues'.
271 * This function may be null if 'tc' does not have queues or its queues
272 * cannot be deleted. */
273 int (*class_delete)(struct netdev *, struct tc_queue *queue);
275 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
276 * 'struct tc_queue's within 'netdev->tc->queues'.
278 * On success, initializes '*stats'.
280 * This function may be null if 'tc' does not have queues or if it cannot
281 * report queue statistics. */
282 int (*class_get_stats)(const struct netdev *netdev,
283 const struct tc_queue *queue,
284 struct netdev_queue_stats *stats);
286 /* Extracts queue stats from 'nlmsg', which is a response to a
287 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
289 * This function may be null if 'tc' does not have queues or if it cannot
290 * report queue statistics. */
291 int (*class_dump_stats)(const struct netdev *netdev,
292 const struct ofpbuf *nlmsg,
293 netdev_dump_queue_stats_cb *cb, void *aux);
/* Initializes the generic part of 'tc'.  The line visible here initializes
 * the 'queues' map.  NOTE(review): 'ops' is presumably stored in 'tc->ops' on
 * a line that is not visible in this view -- confirm. */
297 tc_init(struct tc *tc, const struct tc_ops *ops)
300 hmap_init(&tc->queues);
/* Releases the generic part of 'tc' by destroying its 'queues' map. */
304 tc_destroy(struct tc *tc)
306 hmap_destroy(&tc->queues);
/* Forward declarations of the traffic control implementations that the table
 * below refers to. */
309 static const struct tc_ops tc_ops_htb;
310 static const struct tc_ops tc_ops_hfsc;
311 static const struct tc_ops tc_ops_default;
312 static const struct tc_ops tc_ops_other;
/* Table of pointers to the known traffic control implementations.
 * NOTE(review): the end of the initializer is not visible in this view. */
314 static const struct tc_ops *tcs[] = {
315 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
316 &tc_ops_hfsc, /* Hierarchical fair service curve. */
317 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
318 &tc_ops_other, /* Some other qdisc. */
322 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
323 static unsigned int tc_get_major(unsigned int handle);
324 static unsigned int tc_get_minor(unsigned int handle);
326 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
327 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
328 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
330 static struct tcmsg *tc_make_request(const struct netdev *, int type,
331 unsigned int flags, struct ofpbuf *);
332 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
334 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
335 struct nlattr **options);
336 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
337 struct nlattr **options,
338 struct netdev_queue_stats *);
339 static int tc_query_class(const struct netdev *,
340 unsigned int handle, unsigned int parent,
341 struct ofpbuf **replyp);
342 static int tc_delete_class(const struct netdev *, unsigned int handle);
344 static int tc_del_qdisc(struct netdev *netdev);
345 static int tc_query_qdisc(const struct netdev *netdev);
347 static int tc_calc_cell_log(unsigned int mtu);
348 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
349 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
350 const struct tc_ratespec *rate);
351 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
/* Linux-specific state for one network device.  The generic
 * 'struct netdev_dev' is embedded as member 'netdev_dev', which is what
 * netdev_dev_linux_cast() uses (via CONTAINER_OF) to recover this structure.
 * Several members used elsewhere in the file (for example 'mtu', 'carrier',
 * 'tc' and 'state') are not visible in this view. */
353 struct netdev_dev_linux {
354 struct netdev_dev netdev_dev;
356 struct shash_node *shash_node;
357 unsigned int cache_valid;
358 unsigned int change_seq;
360 bool miimon; /* Link status of last poll. */
361 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
362 struct timer miimon_timer;
364 /* The following are figured out "on demand" only. They are only valid
365 * when the corresponding VALID_* bit in 'cache_valid' is set. */
367 uint8_t etheraddr[ETH_ADDR_LEN];
368 struct in_addr address, netmask;
372 bool is_internal; /* Is this an openvswitch internal device? */
373 bool is_tap; /* Is this a tuntap device? */
374 uint32_t kbits_rate; /* Policing data. */
375 uint32_t kbits_burst;
376 bool have_vport_stats;
380 struct tap_state tap;
/* Linux-specific state for one open handle on a device.  The generic
 * 'struct netdev' is embedded as member 'netdev', which netdev_linux_cast()
 * uses to recover this structure.  The 'fd' member used throughout the file
 * is not visible in this view. */
384 struct netdev_linux {
385 struct netdev netdev;
389 /* Sockets used for ioctl operations. */
390 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
392 /* A Netlink routing socket that is not subscribed to any multicast groups. */
393 static struct nl_sock *rtnl_sock;
395 /* This is set pretty low because we probably won't learn anything from the
396 * additional log messages. */
397 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
399 static int netdev_linux_init(void);
401 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
402 int cmd, const char *cmd_name);
403 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
404 const char *cmd_name);
405 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
406 int cmd, const char *cmd_name);
407 static int get_flags(const struct netdev *, int *flagsp);
408 static int set_flags(struct netdev *, int flags);
409 static int do_get_ifindex(const char *netdev_name);
410 static int get_ifindex(const struct netdev *, int *ifindexp);
411 static int do_set_addr(struct netdev *netdev,
412 int ioctl_nr, const char *ioctl_name,
413 struct in_addr addr);
414 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
415 static int set_etheraddr(const char *netdev_name, int hwaddr_family,
416 const uint8_t[ETH_ADDR_LEN]);
417 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
418 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
419 static int af_packet_sock(void);
420 static void netdev_linux_miimon_run(void);
421 static void netdev_linux_miimon_wait(void);
/* Returns true if 'netdev_class' is one of the classes implemented in this
 * file, which is detected by its 'init' callback being netdev_linux_init(). */
424 is_netdev_linux_class(const struct netdev_class *netdev_class)
426 return netdev_class->init == netdev_linux_init;
/* Converts generic 'netdev_dev' into the 'struct netdev_dev_linux' that
 * contains it.  Asserts that the class of the device passes
 * is_netdev_linux_class() before converting. */
429 static struct netdev_dev_linux *
430 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
432 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
433 assert(is_netdev_linux_class(netdev_class));
435 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
/* Converts generic 'netdev' into the 'struct netdev_linux' that contains it.
 * Asserts that the class of the underlying device passes
 * is_netdev_linux_class() before converting. */
438 static struct netdev_linux *
439 netdev_linux_cast(const struct netdev *netdev)
441 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
442 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
443 assert(is_netdev_linux_class(netdev_class));
445 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
/* Creates the file-level sockets used by this file: 'af_inet_sock' (an
 * AF_INET datagram socket) and 'rtnl_sock' (a NETLINK_ROUTE socket), logging
 * any failure.  'status' is static, so its value persists across calls.
 * NOTE(review): presumably a nonnegative 'status' short-circuits repeated
 * calls and is also the return value; those lines are not visible in this
 * view -- confirm. */
449 netdev_linux_init(void)
451 static int status = -1;
453 /* Create AF_INET socket. */
454 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
455 status = af_inet_sock >= 0 ? 0 : errno;
457 VLOG_ERR("failed to create inet socket: %s", strerror(status));
460 /* Create rtnetlink socket. */
462 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
464 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
/* Performs periodic work for this file: runs the rtnetlink link notifier,
 * then the MII link monitor. */
473 netdev_linux_run(void)
475 rtnetlink_link_notifier_run();
476 netdev_linux_miimon_run();
/* Counterpart of netdev_linux_run(): calls the wait function of the rtnetlink
 * link notifier, then that of the MII link monitor. */
480 netdev_linux_wait(void)
482 rtnetlink_link_notifier_wait();
483 netdev_linux_miimon_wait();
/* Records that 'dev' has changed.  Clearing 'cache_valid' invalidates every
 * cached property at once, so each is fetched again on its next use.
 * NOTE(review): 'change_seq' is presumably incremented just before the test
 * below, with the test skipping the value zero on wraparound; those lines are
 * not visible in this view -- confirm. */
487 netdev_dev_linux_changed(struct netdev_dev_linux *dev)
490 if (!dev->change_seq) {
493 dev->cache_valid = 0;
/* rtnetlink link-change callback ('aux' is unused).  Two paths are visible:
 * one looks up the device named by 'change->ifname' and, if it belongs to a
 * class implemented in this file, marks it changed; the other collects every
 * device of 'netdev_linux_class' and marks each one changed.
 * NOTE(review): the condition that selects between the two paths, and the
 * assignment of 'dev' inside the loop, are not visible in this view. */
497 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
498 void *aux OVS_UNUSED)
500 struct netdev_dev_linux *dev;
502 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
504 const struct netdev_class *netdev_class =
505 netdev_dev_get_class(base_dev);
507 if (is_netdev_linux_class(netdev_class)) {
508 dev = netdev_dev_linux_cast(base_dev);
509 netdev_dev_linux_changed(dev);
513 struct shash device_shash;
514 struct shash_node *node;
516 shash_init(&device_shash);
517 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
518 SHASH_FOR_EACH (node, &device_shash) {
520 netdev_dev_linux_changed(dev);
522 shash_destroy(&device_shash);
526 /* Creates system and internal devices. */
/* Allocates a zeroed 'struct netdev_dev_linux' for 'name' of class 'class'
 * and stores a pointer to its generic part in '*netdev_devp'. */
528 netdev_linux_create(const struct netdev_class *class, const char *name,
529 struct netdev_dev **netdev_devp)
531 struct netdev_dev_linux *netdev_dev;
/* The first device created registers the shared link-change notifier; the
 * reference count tracks how many devices rely on it. */
534 if (!cache_notifier_refcount) {
535 error = rtnetlink_link_notifier_register(&netdev_linux_cache_notifier,
536 netdev_linux_cache_cb, NULL);
541 cache_notifier_refcount++;
543 netdev_dev = xzalloc(sizeof *netdev_dev);
/* Starts at 1 rather than 0; netdev_dev_linux_changed() tests 'change_seq'
 * against zero. */
544 netdev_dev->change_seq = 1;
545 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
547 *netdev_devp = &netdev_dev->netdev_dev;
551 /* For most types of netdevs we open the device for each call of
552 * netdev_open(). However, this is not the case with tap devices,
553 * since it is only possible to open the device once. In this
554 * situation we share a single file descriptor, and consequently
555 * buffers, across all readers. Therefore once data is read it will
556 * be unavailable to other reads for tap devices. */
/* Creates tap device 'name' and stores its generic part in '*netdev_devp'.
 * 'class' is unused: the device is always initialized with
 * 'netdev_tap_class'.  The tap file descriptor is kept in
 * 'netdev_dev->state.tap'. */
558 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
559 const char *name, struct netdev_dev **netdev_devp)
561 struct netdev_dev_linux *netdev_dev;
562 struct tap_state *state;
563 static const char tap_dev[] = "/dev/net/tun";
567 netdev_dev = xzalloc(sizeof *netdev_dev);
568 state = &netdev_dev->state.tap;
570 /* Open tap device. */
571 state->fd = open(tap_dev, O_RDWR);
574 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
578 /* Create tap device. */
579 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
580 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
581 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
582 VLOG_WARN("%s: creating tap device failed: %s", name,
588 /* Make non-blocking. */
589 error = set_nonblocking(state->fd);
594 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
595 *netdev_devp = &netdev_dev->netdev_dev;
/* Tears down the tap state of 'netdev_dev'.  Acts only when the tap fd is
 * nonnegative.  NOTE(review): presumably the guarded statement closes the fd;
 * it is not visible in this view. */
604 destroy_tap(struct netdev_dev_linux *netdev_dev)
606 struct tap_state *state = &netdev_dev->state.tap;
608 if (state->fd >= 0) {
613 /* Destroys the netdev device 'netdev_dev_'. */
615 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
617 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
618 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
/* Give the traffic control implementation, if any, a chance to free its
 * data; 'tc_destroy' is an optional hook. */
620 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
621 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
/* System and internal devices share the link-change notifier, so drop one
 * reference and unregister it when no device is left.  Tap devices release
 * their tap state instead. */
624 if (class == &netdev_linux_class || class == &netdev_internal_class) {
625 cache_notifier_refcount--;
627 if (!cache_notifier_refcount) {
628 rtnetlink_link_notifier_unregister(&netdev_linux_cache_notifier);
630 } else if (class == &netdev_tap_class) {
631 destroy_tap(netdev_dev);
/* Opens a new handle on device 'netdev_dev_' and stores it in '*netdevp'.
 * For every class except the internal one, the device flags are read first
 * to check that the device exists.  The first handle opened on a tap device
 * receives the shared tap file descriptor. */
640 netdev_linux_open(struct netdev_dev *netdev_dev_, struct netdev **netdevp)
642 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
643 struct netdev_linux *netdev;
644 enum netdev_flags flags;
647 /* Allocate network device. */
648 netdev = xzalloc(sizeof *netdev);
650 netdev_init(&netdev->netdev, netdev_dev_);
652 /* Verify that the device really exists, by attempting to read its flags.
653 * (The flags might be cached, in which case this won't actually do an
656 * Don't do this for "internal" netdevs, though, because those have to be
657 * created as netdev objects before they exist in the kernel, because
658 * creating them in the kernel happens by passing a netdev object to
659 * dpif_port_add(). */
660 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
661 error = netdev_get_flags(&netdev->netdev, &flags);
662 if (error == ENODEV) {
667 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
668 !netdev_dev->state.tap.opened) {
670 /* We assume that the first user of the tap device is the primary user
671 * and give them the tap FD. Subsequent users probably just expect
672 * this to be a system device so open it normally to avoid send/receive
673 * directions appearing to be reversed. */
674 netdev->fd = netdev_dev->state.tap.fd;
675 netdev_dev->state.tap.opened = true;
678 *netdevp = &netdev->netdev;
/* NOTE(review): presumably the error exit, releasing the handle initialized
 * by netdev_init() above; the surrounding label and return are not visible
 * in this view. */
682 netdev_uninit(&netdev->netdev, true);
686 /* Closes and destroys 'netdev'. */
688 netdev_linux_close(struct netdev *netdev_)
690 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* The guarded statement applies only to handles that are not of type "tap";
 * the tap fd is shared and owned by the device (see destroy_tap()).
 * NOTE(review): this tests 'fd > 0' whereas the rest of the file treats
 * 'fd < 0' as "no descriptor" (0 is a valid descriptor) -- confirm whether
 * '>= 0' was intended.  The guarded statement itself is not visible here. */
692 if (netdev->fd > 0 && strcmp(netdev_get_type(netdev_), "tap")) {
698 /* Initializes 'sset' with a list of the names of all known network devices. */
700 netdev_linux_enumerate(struct sset *sset)
702 struct if_nameindex *names;
/* if_nameindex() returns an array terminated by an entry whose 'if_name' is
 * NULL; the array must be released with if_freenameindex(). */
704 names = if_nameindex();
708 for (i = 0; names[i].if_name != NULL; i++) {
709 sset_add(sset, names[i].if_name);
711 if_freenameindex(names);
714 VLOG_WARN("could not obtain list of network device names: %s",
/* Prepares 'netdev_' for receiving packets: creates a PF_PACKET raw socket,
 * makes it non-blocking, and binds it to the ifindex of the device for all
 * protocols (ETH_P_ALL).  A handle that already has a nonnegative 'fd' is
 * tested for first.  NOTE(review): presumably that case returns early and the
 * new socket is stored in 'netdev->fd' on success; those lines are not
 * visible in this view. */
721 netdev_linux_listen(struct netdev *netdev_)
723 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
724 struct sockaddr_ll sll;
729 if (netdev->fd >= 0) {
733 /* Create file descriptor. */
734 fd = socket(PF_PACKET, SOCK_RAW, 0);
737 VLOG_ERR("failed to create raw socket (%s)", strerror(error));
741 /* Set non-blocking mode. */
742 error = set_nonblocking(fd);
747 /* Get ethernet device index. */
748 error = get_ifindex(&netdev->netdev, &ifindex);
753 /* Bind to specific ethernet device. */
754 memset(&sll, 0, sizeof sll);
755 sll.sll_family = AF_PACKET;
756 sll.sll_ifindex = ifindex;
757 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
758 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
760 VLOG_ERR("%s: failed to bind raw socket (%s)",
761 netdev_get_name(netdev_), strerror(error));
/* Reads one packet from the 'fd' of 'netdev_' into 'data', which has room for
 * 'size' bytes.  A handle without a descriptor ('fd' < 0) is not listening.
 * A read interrupted by a signal (EINTR) is not treated as an error, and
 * EAGAIN is not logged; any other read error is logged with the device name
 * followed by the error text.
 * NOTE(review): the return statements are not visible in this view. */
776 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
778 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
780 if (netdev->fd < 0) {
781 /* Device is not listening. */
786 ssize_t retval = read(netdev->fd, data, size);
789 } else if (errno != EINTR) {
790 if (errno != EAGAIN) {
/* The arguments follow the order of the format string: device name first,
 * then the error text (same order as the message in netdev_linux_send()). */
791 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
792 netdev_get_name(netdev_), strerror(errno));
799 /* Registers with the poll loop to wake up from the next call to poll_block()
800 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
802 netdev_linux_recv_wait(struct netdev *netdev_)
804 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* Nothing is registered for a handle that has no descriptor. */
805 if (netdev->fd >= 0) {
806 poll_fd_wait(netdev->fd, POLLIN);
810 /* Discards all packets waiting to be received from 'netdev'. */
812 netdev_linux_drain(struct netdev *netdev_)
814 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* Three cases: no descriptor; a tap handle, for which the transmit queue
 * length is queried with SIOCGIFTXQLEN and passed to drain_fd() as the
 * count; and any other handle, which is drained with drain_rcvbuf(). */
815 if (netdev->fd < 0) {
817 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
819 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
820 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
824 drain_fd(netdev->fd, ifr.ifr_qlen);
827 return drain_rcvbuf(netdev->fd);
831 /* Sends the 'size' bytes at 'data' on 'netdev_'. Returns 0 if successful,
832 * otherwise a positive errno value. Returns EAGAIN without blocking if the
833 * packet cannot be queued immediately. Returns EMSGSIZE if a partial packet
834 * was transmitted or if the packet is too big or too small to transmit on the
836 * device. The caller retains ownership of 'data' in all cases.
838 * The kernel maintains a packet transmission queue, so the caller is not
839 * expected to do additional queuing of packets. */
841 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
843 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* Two transmit paths: a handle without its own descriptor sends through the
 * shared AF_PACKET socket with sendmsg(), addressed by ifindex; a handle
 * with a descriptor writes to it directly. */
847 if (netdev->fd < 0) {
848 /* Use our AF_PACKET socket to send to this device. */
849 struct sockaddr_ll sll;
856 sock = af_packet_sock();
861 error = get_ifindex(netdev_, &ifindex);
866 /* We don't bother setting most fields in sockaddr_ll because the
867 * kernel ignores them for SOCK_RAW. */
868 memset(&sll, 0, sizeof sll);
869 sll.sll_family = AF_PACKET;
870 sll.sll_ifindex = ifindex;
872 iov.iov_base = (void *) data;
876 msg.msg_namelen = sizeof sll;
879 msg.msg_control = NULL;
880 msg.msg_controllen = 0;
883 retval = sendmsg(sock, &msg, 0);
885 /* Use the netdev's own fd to send to this device. This is
886 * essential for tap devices, because packets sent to a tap device
887 * with an AF_PACKET socket will loop back to be *received* again
888 * on the tap device. */
889 retval = write(netdev->fd, data, size);
893 /* The Linux AF_PACKET implementation never blocks waiting for room
894 * for packets, instead returning ENOBUFS. Translate this into
895 * EAGAIN for the caller. */
896 if (errno == ENOBUFS) {
898 } else if (errno == EINTR) {
900 } else if (errno != EAGAIN) {
901 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
902 netdev_get_name(netdev_), strerror(errno));
/* A call that succeeded but transferred fewer than 'size' bytes is logged as
 * a partial packet. */
905 } else if (retval != size) {
906 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
907 "%zu) on %s", retval, size, netdev_get_name(netdev_));
915 /* Registers with the poll loop to wake up from the next call to poll_block()
916 * when the packet transmission queue has sufficient room to transmit a packet
917 * with netdev_send().
919 * The kernel maintains a packet transmission queue, so the client is not
920 * expected to do additional queuing of packets. Thus, this function is
921 * unlikely to ever be used. It is included for completeness. */
923 netdev_linux_send_wait(struct netdev *netdev_)
925 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* Handles with a descriptor that are not of type "tap" wait for POLLOUT on
 * it; tap handles wake immediately.  NOTE(review): the statement for the
 * no-descriptor case is not visible in this view. */
926 if (netdev->fd < 0) {
928 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
929 poll_fd_wait(netdev->fd, POLLOUT);
931 /* TAP device always accepts packets.*/
932 poll_immediate_wake();
936 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
937 * otherwise a positive errno value. */
939 netdev_linux_set_etheraddr(struct netdev *netdev_,
940 const uint8_t mac[ETH_ADDR_LEN])
942 struct netdev_dev_linux *netdev_dev =
943 netdev_dev_linux_cast(netdev_get_dev(netdev_));
/* The address is written only when the cached address is not valid or
 * differs from 'mac'. */
946 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
947 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
948 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
/* Remember the new address in the cache.  NOTE(review): presumably done only
 * when set_etheraddr() succeeded; the guarding line is not visible here. */
950 netdev_dev->cache_valid |= VALID_ETHERADDR;
951 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
959 /* Copies the MAC address of 'netdev_' into 'mac'.  When the cached address
960 * is not valid it is first fetched with get_etheraddr() and cached. */
962 netdev_linux_get_etheraddr(const struct netdev *netdev_,
963 uint8_t mac[ETH_ADDR_LEN])
965 struct netdev_dev_linux *netdev_dev =
966 netdev_dev_linux_cast(netdev_get_dev(netdev_));
967 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
968 int error = get_etheraddr(netdev_get_name(netdev_),
969 netdev_dev->etheraddr);
973 netdev_dev->cache_valid |= VALID_ETHERADDR;
975 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
979 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
980 * in bytes, not including the hardware header; thus, this is typically 1500
981 * bytes for Ethernet devices. */
/* The value is delivered through '*mtup'.  It is obtained with the
 * SIOCGIFMTU ioctl the first time and then served from 'netdev_dev->mtu'
 * for as long as the VALID_MTU bit stays set in 'cache_valid'. */
983 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
985 struct netdev_dev_linux *netdev_dev =
986 netdev_dev_linux_cast(netdev_get_dev(netdev_));
987 if (!(netdev_dev->cache_valid & VALID_MTU)) {
991 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
992 SIOCGIFMTU, "SIOCGIFMTU");
996 netdev_dev->mtu = ifr.ifr_mtu;
997 netdev_dev->cache_valid |= VALID_MTU;
999 *mtup = netdev_dev->mtu;
1003 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1004 * On failure, returns a negative errno value. */
1006 netdev_linux_get_ifindex(const struct netdev *netdev)
/* get_ifindex() reports a positive errno value, so it is negated here to
 * keep errors distinguishable from valid indexes. */
1010 error = get_ifindex(netdev, &ifindex);
1011 return error ? -error : ifindex;
/* Stores the link carrier state of 'netdev_' in '*carrier'.  When MII
 * monitoring is enabled ('miimon_interval' > 0) the result of the last MII
 * poll is used.  Otherwise the state is read from the "carrier" file of the
 * device under /sys/class/net and cached until the VALID_CARRIER bit is
 * cleared. */
1015 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1017 struct netdev_dev_linux *netdev_dev =
1018 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1023 if (netdev_dev->miimon_interval > 0) {
1024 *carrier = netdev_dev->miimon;
1028 if (!(netdev_dev->cache_valid & VALID_CARRIER)) {
1032 fn = xasprintf("/sys/class/net/%s/carrier",
1033 netdev_get_name(netdev_));
1034 fd = open(fn, O_RDONLY);
1037 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
1041 retval = read(fd, line, sizeof line);
1044 if (error == EINVAL) {
1045 /* This is the normal return value when we try to check carrier
1046 * if the network device is not up. */
1048 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
1051 } else if (retval == 0) {
1053 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
/* Only the first character of the file is examined; it must be 0 or 1. */
1057 if (line[0] != '0' && line[0] != '1') {
1059 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)",
1063 netdev_dev->carrier = line[0] != '0';
1064 netdev_dev->cache_valid |= VALID_CARRIER;
1066 *carrier = netdev_dev->carrier;
/* Issues MII ioctl 'cmd' (named 'cmd_name' for logging) on device 'name'.
 * '*data' is copied into the 'ifr_data' area of a zeroed ifreq before the
 * call and copied back out afterwards, regardless of the ioctl result. */
1078 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1079 struct mii_ioctl_data *data)
1084 memset(&ifr, 0, sizeof ifr);
1085 memcpy(&ifr.ifr_data, data, sizeof *data);
1086 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1087 memcpy(data, &ifr.ifr_data, sizeof *data);
/* Determines the link status of device 'name' and stores it in '*miimon'.
 * The MII path issues SIOCGMIIPHY, then SIOCGMIIREG for register MII_BMSR,
 * and reports the BMSR_LSTATUS bit.  The fallback path queries
 * ETHTOOL_GLINK through ethtool and reports whether the returned value is
 * nonzero.  NOTE(review): the conditions that connect the two paths are not
 * visible in this view. */
1093 netdev_linux_get_miimon(const char *name, bool *miimon)
1095 struct mii_ioctl_data data;
1100 memset(&data, 0, sizeof data);
1101 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1103 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1104 data.reg_num = MII_BMSR;
1105 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1109 *miimon = !!(data.val_out & BMSR_LSTATUS);
1111 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1114 struct ethtool_cmd ecmd;
1116 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1119 memset(&ecmd, 0, sizeof ecmd);
1120 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1123 struct ethtool_value eval;
/* The reply is reinterpreted as a 'struct ethtool_value' by copying its
 * leading bytes. */
1125 memcpy(&eval, &ecmd, sizeof eval);
1126 *miimon = !!eval.data;
1128 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
/* Sets the MII monitoring interval of 'netdev_' to 'interval'.  A positive
 * value is raised to at least 100; any other value becomes 0, which disables
 * monitoring.  When the stored interval changes, the timer is marked expired
 * so that the next netdev_linux_miimon_run() polls the device. */
1136 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1137 long long int interval)
1139 struct netdev_dev_linux *netdev_dev;
1141 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
1143 interval = interval > 0 ? MAX(interval, 100) : 0;
1144 if (netdev_dev->miimon_interval != interval) {
1145 netdev_dev->miimon_interval = interval;
1146 timer_set_expired(&netdev_dev->miimon_timer);
/* Polls the link status of every device of 'netdev_linux_class' that has MII
 * monitoring enabled and whose timer has expired.  A device whose status
 * differs from the stored one is updated and marked changed; the timer is
 * then restarted with the interval of the device. */
1153 netdev_linux_miimon_run(void)
1155 struct shash device_shash;
1156 struct shash_node *node;
1158 shash_init(&device_shash);
1159 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1160 SHASH_FOR_EACH (node, &device_shash) {
1161 struct netdev_dev_linux *dev = node->data;
1164 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
/* The result code of the query is not examined here. */
1168 netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
1169 if (miimon != dev->miimon) {
1170 dev->miimon = miimon;
1171 netdev_dev_linux_changed(dev);
1174 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1177 shash_destroy(&device_shash);
/* Calls timer_wait() on the MII timer of every device of
 * 'netdev_linux_class' that has MII monitoring enabled. */
1181 netdev_linux_miimon_wait(void)
1183 struct shash device_shash;
1184 struct shash_node *node;
1186 shash_init(&device_shash);
1187 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1188 SHASH_FOR_EACH (node, &device_shash) {
1189 struct netdev_dev_linux *dev = node->data;
1191 if (dev->miimon_interval > 0) {
1192 timer_wait(&dev->miimon_timer);
1195 shash_destroy(&device_shash);
1198 /* Checks whether we can use RTM_GETLINK to get network device statistics.
1199 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1202 check_for_working_netlink_stats(void)
1204 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1205 * preferable, so if that works, we'll use it. */
1206 int ifindex = do_get_ifindex("lo");
1208 VLOG_WARN("failed to get ifindex for lo, "
1209 "obtaining netdev stats from proc");
/* Probe: try to fetch the statistics of "lo" over Netlink and log which
 * source will be used. */
1212 struct netdev_stats stats;
1213 int error = get_stats_via_netlink(ifindex, &stats);
1215 VLOG_DBG("obtaining netdev stats via rtnetlink");
1218 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1219 "via proc (you are probably running a pre-2.6.19 "
1220 "kernel)", strerror(error));
1226 /* Brings the 'is_internal' and 'is_tap' members of 'netdev_dev' up-to-date. */
/* The work is done only while VALID_IS_PSEUDO is clear in 'cache_valid', and
 * the bit is set afterwards.  'is_tap' is true when the device type is "tap";
 * 'is_internal' is true for a non-tap device that
 * dpif_linux_is_internal_device() reports as internal. */
1228 netdev_linux_update_is_pseudo(struct netdev_dev_linux *netdev_dev)
1230 if (!(netdev_dev->cache_valid & VALID_IS_PSEUDO)) {
1231 const char *name = netdev_dev_get_name(&netdev_dev->netdev_dev);
1232 const char *type = netdev_dev_get_type(&netdev_dev->netdev_dev);
1234 netdev_dev->is_tap = !strcmp(type, "tap");
1235 netdev_dev->is_internal = (!netdev_dev->is_tap
1236 && dpif_linux_is_internal_device(name));
1237 netdev_dev->cache_valid |= VALID_IS_PSEUDO;
/* Helper that netdev_linux_get_stats() applies to pairs of rx/tx counters.
 * NOTE(review): presumably exchanges the values at '*a' and '*b' -- confirm
 * against the body. */
1242 swap_uint64(uint64_t *a, uint64_t *b)
1249 /* Retrieves current device stats for 'netdev'. */
/* netdev_vport_get_stats() is tried while 'have_vport_stats' is set or has
 * not been determined yet (VALID_HAVE_VPORT_STATS clear); its outcome is
 * cached in 'have_vport_stats'.  Without vport stats, the statistics come
 * from rtnetlink or from /proc; that choice is made once with
 * check_for_working_netlink_stats() and kept in the function-local static
 * 'use_netlink_stats'.
 *
 * For internal and tap devices whose stats did not come from the vport layer,
 * the rx/tx packet, byte, error and dropped counters are swapped and the
 * direction-specific error counters are zeroed. */
1251 netdev_linux_get_stats(const struct netdev *netdev_,
1252 struct netdev_stats *stats)
1254 struct netdev_dev_linux *netdev_dev =
1255 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1256 static int use_netlink_stats = -1;
1259 if (netdev_dev->have_vport_stats ||
1260 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1262 error = netdev_vport_get_stats(netdev_, stats);
1263 netdev_dev->have_vport_stats = !error;
1264 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
1267 if (!netdev_dev->have_vport_stats) {
1268 if (use_netlink_stats < 0) {
1269 use_netlink_stats = check_for_working_netlink_stats();
1271 if (use_netlink_stats) {
1274 error = get_ifindex(netdev_, &ifindex);
1276 error = get_stats_via_netlink(ifindex, stats);
1279 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1283 /* If this port is an internal port then the transmit and receive stats
1284 * will appear to be swapped relative to the other ports since we are the
1285 * one sending the data, not a remote computer. For consistency, we swap
1286 * them back here. This does not apply if we are getting stats from the
1287 * vport layer because it always tracks stats from the perspective of the
1289 netdev_linux_update_is_pseudo(netdev_dev);
1290 if (!error && !netdev_dev->have_vport_stats &&
1291 (netdev_dev->is_internal || netdev_dev->is_tap)) {
1292 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1293 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1294 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1295 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1296 stats->rx_length_errors = 0;
1297 stats->rx_over_errors = 0;
1298 stats->rx_crc_errors = 0;
1299 stats->rx_frame_errors = 0;
1300 stats->rx_fifo_errors = 0;
1301 stats->rx_missed_errors = 0;
1302 stats->tx_aborted_errors = 0;
1303 stats->tx_carrier_errors = 0;
1304 stats->tx_fifo_errors = 0;
1305 stats->tx_heartbeat_errors = 0;
1306 stats->tx_window_errors = 0;
1312 /* Stores the features supported by 'netdev' into each of '*current',
1313 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1314 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
1315 * successful, otherwise a positive errno value. */
/* The data comes from an ETHTOOL_GSET query.  The 'supported' and
 * 'advertising' masks of the result are translated bit by bit to OFPPF_*
 * values, '*current' is derived from 'speed', 'duplex' and 'port', and
 * '*peer' is always set to 0.
 *
 * NOTE(review): the assignments below appear to dereference the output
 * pointers without a null check, despite the "that are non-null" wording
 * above -- confirm that callers of this provider hook never pass NULL. */
1317 netdev_linux_get_features(const struct netdev *netdev,
1318 uint32_t *current, uint32_t *advertised,
1319 uint32_t *supported, uint32_t *peer)
1321 struct ethtool_cmd ecmd;
1324 memset(&ecmd, 0, sizeof ecmd);
1325 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1326 ETHTOOL_GSET, "ETHTOOL_GSET");
1331 /* Supported features. */
1333 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1334 *supported |= OFPPF_10MB_HD;
1336 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1337 *supported |= OFPPF_10MB_FD;
1339 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1340 *supported |= OFPPF_100MB_HD;
1342 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1343 *supported |= OFPPF_100MB_FD;
1345 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1346 *supported |= OFPPF_1GB_HD;
1348 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1349 *supported |= OFPPF_1GB_FD;
1351 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1352 *supported |= OFPPF_10GB_FD;
1354 if (ecmd.supported & SUPPORTED_TP) {
1355 *supported |= OFPPF_COPPER;
1357 if (ecmd.supported & SUPPORTED_FIBRE) {
1358 *supported |= OFPPF_FIBER;
1360 if (ecmd.supported & SUPPORTED_Autoneg) {
1361 *supported |= OFPPF_AUTONEG;
1363 if (ecmd.supported & SUPPORTED_Pause) {
1364 *supported |= OFPPF_PAUSE;
1366 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1367 *supported |= OFPPF_PAUSE_ASYM;
1370 /* Advertised features. */
1372 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1373 *advertised |= OFPPF_10MB_HD;
1375 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1376 *advertised |= OFPPF_10MB_FD;
1378 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1379 *advertised |= OFPPF_100MB_HD;
1381 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1382 *advertised |= OFPPF_100MB_FD;
1384 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1385 *advertised |= OFPPF_1GB_HD;
1387 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1388 *advertised |= OFPPF_1GB_FD;
1390 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1391 *advertised |= OFPPF_10GB_FD;
1393 if (ecmd.advertising & ADVERTISED_TP) {
1394 *advertised |= OFPPF_COPPER;
1396 if (ecmd.advertising & ADVERTISED_FIBRE) {
1397 *advertised |= OFPPF_FIBER;
1399 if (ecmd.advertising & ADVERTISED_Autoneg) {
1400 *advertised |= OFPPF_AUTONEG;
1402 if (ecmd.advertising & ADVERTISED_Pause) {
1403 *advertised |= OFPPF_PAUSE;
1405 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1406 *advertised |= OFPPF_PAUSE_ASYM;
1409 /* Current settings. */
1410 if (ecmd.speed == SPEED_10) {
1411 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1412 } else if (ecmd.speed == SPEED_100) {
1413 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1414 } else if (ecmd.speed == SPEED_1000) {
1415 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1416 } else if (ecmd.speed == SPEED_10000) {
1417 *current = OFPPF_10GB_FD;
1422 if (ecmd.port == PORT_TP) {
1423 *current |= OFPPF_COPPER;
1424 } else if (ecmd.port == PORT_FIBRE) {
1425 *current |= OFPPF_FIBER;
1429 *current |= OFPPF_AUTONEG;
1432 /* Peer advertisements. */
1433 *peer = 0; /* XXX */
1438 /* Set the features advertised by 'netdev' to 'advertise'. */
/* 'advertise' is a bitmap of OFPPF_* bits.  The current settings are read
 * with ETHTOOL_GSET, the 'advertising' mask is rebuilt from scratch with the
 * ADVERTISED_* bit matching each OFPPF_* bit that is set, and the result is
 * written back with ETHTOOL_SSET, whose status is returned. */
1440 netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1442 struct ethtool_cmd ecmd;
1445 memset(&ecmd, 0, sizeof ecmd);
1446 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1447 ETHTOOL_GSET, "ETHTOOL_GSET");
1452 ecmd.advertising = 0;
1453 if (advertise & OFPPF_10MB_HD) {
1454 ecmd.advertising |= ADVERTISED_10baseT_Half;
1456 if (advertise & OFPPF_10MB_FD) {
1457 ecmd.advertising |= ADVERTISED_10baseT_Full;
1459 if (advertise & OFPPF_100MB_HD) {
1460 ecmd.advertising |= ADVERTISED_100baseT_Half;
1462 if (advertise & OFPPF_100MB_FD) {
1463 ecmd.advertising |= ADVERTISED_100baseT_Full;
1465 if (advertise & OFPPF_1GB_HD) {
1466 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1468 if (advertise & OFPPF_1GB_FD) {
1469 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1471 if (advertise & OFPPF_10GB_FD) {
1472 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1474 if (advertise & OFPPF_COPPER) {
1475 ecmd.advertising |= ADVERTISED_TP;
1477 if (advertise & OFPPF_FIBER) {
1478 ecmd.advertising |= ADVERTISED_FIBRE;
1480 if (advertise & OFPPF_AUTONEG) {
1481 ecmd.advertising |= ADVERTISED_Autoneg;
1483 if (advertise & OFPPF_PAUSE) {
1484 ecmd.advertising |= ADVERTISED_Pause;
1486 if (advertise & OFPPF_PAUSE_ASYM) {
1487 ecmd.advertising |= ADVERTISED_Asym_Pause;
1489 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1490 ETHTOOL_SSET, "ETHTOOL_SSET");
1493 /* If 'netdev_name' is the name of a VLAN network device (e.g. one created with
1494 * vconfig(8)), sets '*vlan_vid' to the VLAN VID associated with that device
1495 * and returns 0. Otherwise returns an errno value (specifically ENOENT if
1496 * 'netdev_name' is the name of a network device that is not a VLAN device) and
1497 * sets '*vlan_vid' to -1. */
/* The VID is taken from the first line of "/proc/net/vlan/<netdev_name>",
 * which must have the form "<word> VID: <number>".  Read errors, a premature
 * end of file and a malformed first line are each logged. */
1499 netdev_linux_get_vlan_vid(const struct netdev *netdev, int *vlan_vid)
1501 const char *netdev_name = netdev_get_name(netdev);
1502 struct ds line = DS_EMPTY_INITIALIZER;
1503 FILE *stream = NULL;
1507 COVERAGE_INC(netdev_get_vlan_vid);
1508 fn = xasprintf("/proc/net/vlan/%s", netdev_name);
1509 stream = fopen(fn, "r");
1515 if (ds_get_line(&line, stream)) {
1516 if (ferror(stream)) {
1518 VLOG_ERR_RL(&rl, "error reading \"%s\": %s", fn, strerror(errno));
1521 VLOG_ERR_RL(&rl, "unexpected end of file reading \"%s\"", fn);
/* Require exactly one converted field.  sscanf() returns EOF, a nonzero
 * value, when the input runs out before the first conversion, so testing the
 * result with '!' would accept an empty or truncated line as a successful
 * parse and leave '*vlan_vid' unset. */
1526 if (sscanf(ds_cstr(&line), "%*s VID: %d", vlan_vid) != 1) {
1528 VLOG_ERR_RL(&rl, "parse error reading \"%s\" line 1: \"%s\"",
1529 fn, ds_cstr(&line));
/* Shell command templates run by netdev_linux_set_policing().
 *
 * POLICE_ADD_CMD takes the device name.  POLICE_CONFIG_CMD takes the device
 * name followed by the rate and burst values, both uint32_t; they are printed
 * with PRIu32 so that values above INT_MAX do not come out negative. */
1547 #define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress"
1548 #define POLICE_CONFIG_CMD "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 u32 match ip src 0.0.0.0/0 police rate %"PRIu32"kbit burst %"PRIu32"k mtu 65535 drop flowid :1"
1550 /* Remove ingress policing from 'netdev'. Returns 0 if successful, otherwise a
1551 * positive errno value.
1553 * This function is equivalent to running
1554 * /sbin/tc qdisc del dev %s handle ffff: ingress
1555 * but it is much, much faster.
/* Builds an RTM_DELQDISC request for the ingress qdisc (handle ffff:0, parent
 * TC_H_INGRESS, kind "ingress", empty TCA_OPTIONS) and sends it with
 * tc_transact().  Errors other than ENOENT and EINVAL are logged.  The cached
 * 'kbits_rate' and 'kbits_burst' are set to 0 and VALID_POLICING is set in
 * 'cache_valid'. */
1558 netdev_linux_remove_policing(struct netdev *netdev)
1560 struct netdev_dev_linux *netdev_dev =
1561 netdev_dev_linux_cast(netdev_get_dev(netdev));
1562 const char *netdev_name = netdev_get_name(netdev);
1564 struct ofpbuf request;
1565 struct tcmsg *tcmsg;
1568 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
1572 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
1573 tcmsg->tcm_parent = TC_H_INGRESS;
1574 nl_msg_put_string(&request, TCA_KIND, "ingress");
1575 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
1577 error = tc_transact(&request, NULL);
1578 if (error && error != ENOENT && error != EINVAL) {
1579 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1580 netdev_name, strerror(error));
1584 netdev_dev->kbits_rate = 0;
1585 netdev_dev->kbits_burst = 0;
1586 netdev_dev->cache_valid |= VALID_POLICING;
1590 /* Attempts to set input rate limiting (policing) policy. */
/* 'kbits_burst' is normalized first: 0 when 'kbits_rate' is 0, 1000 when a
 * rate is given without a burst.  The normalized values are compared with the
 * cached ones (valid when VALID_POLICING is set).  Existing policing is
 * removed with netdev_linux_remove_policing(), and the POLICE_ADD_CMD and
 * POLICE_CONFIG_CMD command lines are formatted into 'command' and run with
 * system(); a nonzero status is logged.  The new values are then cached.
 *
 * NOTE(review): 'netdev_name' is interpolated into a shell command line --
 * confirm that device names can never contain shell metacharacters. */
1592 netdev_linux_set_policing(struct netdev *netdev,
1593 uint32_t kbits_rate, uint32_t kbits_burst)
1595 struct netdev_dev_linux *netdev_dev =
1596 netdev_dev_linux_cast(netdev_get_dev(netdev));
1597 const char *netdev_name = netdev_get_name(netdev);
1600 COVERAGE_INC(netdev_set_policing);
1602 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1603 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1604 : kbits_burst); /* Stick with user-specified value. */
1606 if (netdev_dev->cache_valid & VALID_POLICING
1607 && netdev_dev->kbits_rate == kbits_rate
1608 && netdev_dev->kbits_burst == kbits_burst) {
1609 /* Assume that settings haven't changed since we last set them. */
1613 netdev_linux_remove_policing(netdev);
1615 snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
1616 if (system(command) != 0) {
1617 VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
1621 snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
1622 kbits_rate, kbits_burst);
1623 if (system(command) != 0) {
1624 VLOG_WARN_RL(&rl, "%s: problem configuring policing",
1629 netdev_dev->kbits_rate = kbits_rate;
1630 netdev_dev->kbits_burst = kbits_burst;
1631 netdev_dev->cache_valid |= VALID_POLICING;
/* Adds to 'types' the 'ovs_name' of every entry of the null-terminated 'tcs'
 * array that has a 'tc_install' hook and a non-empty name. */
1638 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1641 const struct tc_ops **opsp;
1643 for (opsp = tcs; *opsp != NULL; opsp++) {
1644 const struct tc_ops *ops = *opsp;
1645 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1646 sset_add(types, ops->ovs_name);
/* Searches the null-terminated 'tcs' array for the entry whose 'ovs_name'
 * equals 'name'.  netdev_linux_set_qos() treats a null result as "no such
 * type". */
1652 static const struct tc_ops *
1653 tc_lookup_ovs_name(const char *name)
1655 const struct tc_ops **opsp;
1657 for (opsp = tcs; *opsp != NULL; opsp++) {
1658 const struct tc_ops *ops = *opsp;
1659 if (!strcmp(name, ops->ovs_name)) {
/* Searches the null-terminated 'tcs' array for the entry whose 'linux_name'
 * equals 'name'.  Entries with a null 'linux_name' never match. */
1666 static const struct tc_ops *
1667 tc_lookup_linux_name(const char *name)
1669 const struct tc_ops **opsp;
1671 for (opsp = tcs; *opsp != NULL; opsp++) {
1672 const struct tc_ops *ops = *opsp;
1673 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Searches the 'hash' bucket of the device's 'tc->queues' map for a queue
 * whose 'queue_id' member equals 'queue_id'.  tc_find_queue() supplies
 * hash_int(queue_id, 0) as the hash. */
1680 static struct tc_queue *
1681 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1684 struct netdev_dev_linux *netdev_dev =
1685 netdev_dev_linux_cast(netdev_get_dev(netdev));
1686 struct tc_queue *queue;
1688 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1689 if (queue->queue_id == queue_id) {
/* Convenience wrapper: looks up 'queue_id' on 'netdev' through
 * tc_find_queue__(), computing the hash as hash_int(queue_id, 0). */
1696 static struct tc_queue *
1697 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1699 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Looks up the tc_ops for QoS 'type' by its OVS name and copies its
 * 'n_queues' into 'caps->n_queues'.  'netdev' itself is not consulted. */
1703 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1705 struct netdev_qos_capabilities *caps)
1707 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1711 caps->n_queues = ops->n_queues;
/* Refreshes the device's qdisc state with tc_query_qdisc(), stores the OVS
 * name of the active discipline in '*typep' and, when that discipline has a
 * 'qdisc_get' hook, lets the hook fill 'details'. */
1716 netdev_linux_get_qos(const struct netdev *netdev,
1717 const char **typep, struct shash *details)
1719 struct netdev_dev_linux *netdev_dev =
1720 netdev_dev_linux_cast(netdev_get_dev(netdev));
1723 error = tc_query_qdisc(netdev);
1728 *typep = netdev_dev->tc->ops->ovs_name;
1729 return (netdev_dev->tc->ops->qdisc_get
1730 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
/* Switches 'netdev' to QoS 'type', configured by 'details'.
 *
 * 'type' must name a tc_ops that has a 'tc_install' hook.  When the device
 * already uses that discipline, only its 'qdisc_set' hook is invoked (0 is
 * returned when there is none).  Otherwise the existing qdisc is deleted with
 * tc_del_qdisc() and the new one is installed through 'tc_install'.  The
 * asserts state that deletion leaves 'netdev_dev->tc' null and that
 * installation sets it exactly when it succeeds. */
1735 netdev_linux_set_qos(struct netdev *netdev,
1736 const char *type, const struct shash *details)
1738 struct netdev_dev_linux *netdev_dev =
1739 netdev_dev_linux_cast(netdev_get_dev(netdev));
1740 const struct tc_ops *new_ops;
1743 new_ops = tc_lookup_ovs_name(type);
1744 if (!new_ops || !new_ops->tc_install) {
1748 error = tc_query_qdisc(netdev);
1753 if (new_ops == netdev_dev->tc->ops) {
1754 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1756 /* Delete existing qdisc. */
1757 error = tc_del_qdisc(netdev);
1761 assert(netdev_dev->tc == NULL);
1763 /* Install new qdisc. */
1764 error = new_ops->tc_install(netdev, details);
1765 assert((error == 0) == (netdev_dev->tc != NULL));
/* After tc_query_qdisc(), looks up queue 'queue_id' with tc_find_queue() and
 * hands it to the active discipline's 'class_get' hook together with
 * 'details'. */
1772 netdev_linux_get_queue(const struct netdev *netdev,
1773 unsigned int queue_id, struct shash *details)
1775 struct netdev_dev_linux *netdev_dev =
1776 netdev_dev_linux_cast(netdev_get_dev(netdev));
1779 error = tc_query_qdisc(netdev);
1783 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1785 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
/* After tc_query_qdisc(), checks that 'queue_id' is below the active
 * discipline's 'n_queues' and that the discipline has a 'class_set' hook,
 * then delegates to that hook with 'queue_id' and 'details'. */
1791 netdev_linux_set_queue(struct netdev *netdev,
1792 unsigned int queue_id, const struct shash *details)
1794 struct netdev_dev_linux *netdev_dev =
1795 netdev_dev_linux_cast(netdev_get_dev(netdev));
1798 error = tc_query_qdisc(netdev);
1801 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1802 || !netdev_dev->tc->ops->class_set) {
1806 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
/* After tc_query_qdisc(), checks that the active discipline has a
 * 'class_delete' hook, looks up queue 'queue_id' with tc_find_queue() and
 * hands it to that hook. */
1810 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1812 struct netdev_dev_linux *netdev_dev =
1813 netdev_dev_linux_cast(netdev_get_dev(netdev));
1816 error = tc_query_qdisc(netdev);
1819 } else if (!netdev_dev->tc->ops->class_delete) {
1822 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1824 ? netdev_dev->tc->ops->class_delete(netdev, queue)
/* After tc_query_qdisc(), checks that the active discipline has a
 * 'class_get_stats' hook, looks up queue 'queue_id' with tc_find_queue() and
 * lets the hook fill 'stats' for it. */
1830 netdev_linux_get_queue_stats(const struct netdev *netdev,
1831 unsigned int queue_id,
1832 struct netdev_queue_stats *stats)
1834 struct netdev_dev_linux *netdev_dev =
1835 netdev_dev_linux_cast(netdev_get_dev(netdev));
1838 error = tc_query_qdisc(netdev);
1841 } else if (!netdev_dev->tc->ops->class_get_stats) {
1844 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1846 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
/* Starts a Netlink dump of the traffic classes of 'netdev': builds an
 * RTM_GETTCLASS request with 'tcm_parent' 0, starts 'dump' on 'rtnl_sock'
 * with it and releases the request buffer. */
1852 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1854 struct ofpbuf request;
1855 struct tcmsg *tcmsg;
1857 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1861 tcmsg->tcm_parent = 0;
1862 nl_dump_start(dump, rtnl_sock, &request);
1863 ofpbuf_uninit(&request);
/* Iterates over the queues in the device's 'tc->queues' map, obtains each
 * queue's configuration from the discipline's 'class_get' hook and reports it
 * through 'cb' as (queue_id, details, aux).  A single 'details' shash is
 * cleared and reused for every queue and destroyed at the end. */
1868 netdev_linux_dump_queues(const struct netdev *netdev,
1869 netdev_dump_queues_cb *cb, void *aux)
1871 struct netdev_dev_linux *netdev_dev =
1872 netdev_dev_linux_cast(netdev_get_dev(netdev));
1873 struct tc_queue *queue;
1874 struct shash details;
1878 error = tc_query_qdisc(netdev);
1881 } else if (!netdev_dev->tc->ops->class_get) {
1886 shash_init(&details);
1887 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
1888 shash_clear(&details);
1890 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
1892 (*cb)(queue->queue_id, &details, aux);
1897 shash_destroy(&details);
/* Dumps the kernel's traffic classes for 'netdev' with start_queue_dump() and
 * nl_dump_next(), handing each reply to the discipline's 'class_dump_stats'
 * hook along with 'cb' and 'aux'.  Returns the nl_dump_done() error when
 * there is one, otherwise 'last_error'. */
1903 netdev_linux_dump_queue_stats(const struct netdev *netdev,
1904 netdev_dump_queue_stats_cb *cb, void *aux)
1906 struct netdev_dev_linux *netdev_dev =
1907 netdev_dev_linux_cast(netdev_get_dev(netdev));
1908 struct nl_dump dump;
1913 error = tc_query_qdisc(netdev);
1916 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1921 if (!start_queue_dump(netdev, &dump)) {
1924 while (nl_dump_next(&dump, &msg)) {
1925 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1931 error = nl_dump_done(&dump);
1932 return error ? error : last_error;
/* Reports the IPv4 address and netmask of 'netdev_' in '*address' and
 * '*netmask'.
 *
 * While VALID_IN4 is clear, the values are fetched with the SIOCGIFADDR and
 * SIOCGIFNETMASK ioctls into the device cache, and VALID_IN4 is then set.
 * The final return is EADDRNOTAVAIL when the address is INADDR_ANY and 0
 * otherwise. */
1936 netdev_linux_get_in4(const struct netdev *netdev_,
1937 struct in_addr *address, struct in_addr *netmask)
1939 struct netdev_dev_linux *netdev_dev =
1940 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1942 if (!(netdev_dev->cache_valid & VALID_IN4)) {
1945 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
1946 SIOCGIFADDR, "SIOCGIFADDR");
1951 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
1952 SIOCGIFNETMASK, "SIOCGIFNETMASK");
1957 netdev_dev->cache_valid |= VALID_IN4;
1959 *address = netdev_dev->address;
1960 *netmask = netdev_dev->netmask;
1961 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
/* Assigns IPv4 'address' and 'netmask' to 'netdev_'.
 *
 * The address is set with SIOCSIFADDR; 'address' and 'netmask' are recorded
 * in the device cache and VALID_IN4 is set.  Unless 'address' is INADDR_ANY,
 * the netmask is then set with SIOCSIFNETMASK. */
1965 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
1966 struct in_addr netmask)
1968 struct netdev_dev_linux *netdev_dev =
1969 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1972 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
1974 netdev_dev->cache_valid |= VALID_IN4;
1975 netdev_dev->address = address;
1976 netdev_dev->netmask = netmask;
1977 if (address.s_addr != INADDR_ANY) {
1978 error = do_set_addr(netdev_, SIOCSIFNETMASK,
1979 "SIOCSIFNETMASK", netmask);
/* Parses 'line', which netdev_linux_get_in6() reads from
 * "/proc/net/if_inet6".
 *
 * The format describes 16 two-digit hexadecimal bytes, stored into
 * 'in6->s6_addr', followed by four hexadecimal fields that are skipped and an
 * interface name of at most 16 characters, stored into 'ifname'. */
1986 parse_if_inet6_line(const char *line,
1987 struct in6_addr *in6, char ifname[16 + 1])
1989 uint8_t *s6 = in6->s6_addr;
1990 #define X8 "%2"SCNx8
1992 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
1993 "%*x %*x %*x %*x %16s\n",
1994 &s6[0], &s6[1], &s6[2], &s6[3],
1995 &s6[4], &s6[5], &s6[6], &s6[7],
1996 &s6[8], &s6[9], &s6[10], &s6[11],
1997 &s6[12], &s6[13], &s6[14], &s6[15],
2001 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2002 * 'in6' is non-null) and returns true. Otherwise, returns false. */
/* While VALID_IN6 is clear, the cached address starts as 'in6addr_any' and
 * "/proc/net/if_inet6" is scanned line by line for an entry whose interface
 * name equals the device name; a match replaces the cached address.
 * VALID_IN6 is then set, so later calls reuse the cache.
 *
 * NOTE(review): '*in6' is assigned unconditionally below, so the "if 'in6' is
 * non-null" wording above looks stale -- confirm. */
2004 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2006 struct netdev_dev_linux *netdev_dev =
2007 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2008 if (!(netdev_dev->cache_valid & VALID_IN6)) {
2012 netdev_dev->in6 = in6addr_any;
2014 file = fopen("/proc/net/if_inet6", "r");
2016 const char *name = netdev_get_name(netdev_);
2017 while (fgets(line, sizeof line, file)) {
2018 struct in6_addr in6_tmp;
2019 char ifname[16 + 1];
2020 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2021 && !strcmp(name, ifname))
2023 netdev_dev->in6 = in6_tmp;
2029 netdev_dev->cache_valid |= VALID_IN6;
2031 *in6 = netdev_dev->in6;
/* Fills '*sa' with an AF_INET socket address for 'addr': a zeroed
 * 'struct sockaddr_in' is built and copied over a zeroed '*sa'. */
2036 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2038 struct sockaddr_in sin;
2039 memset(&sin, 0, sizeof sin);
2040 sin.sin_family = AF_INET;
2041 sin.sin_addr = addr;
2044 memset(sa, 0, sizeof *sa);
2045 memcpy(sa, &sin, sizeof sin);
/* Issues ioctl 'ioctl_nr' for 'netdev' through netdev_linux_do_ioctl(), with
 * an ifreq that carries the device name and 'addr' as an AF_INET socket
 * address.  'ioctl_name' is presumably used for log messages -- confirm. */
2049 do_set_addr(struct netdev *netdev,
2050 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2053 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2054 make_in4_sockaddr(&ifr.ifr_addr, addr);
2056 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2060 /* Adds 'router' as a default IP gateway. */
/* The route has destination and genmask INADDR_ANY, gateway 'router' and
 * flags RTF_UP | RTF_GATEWAY, and is added with the SIOCADDRT ioctl on
 * 'af_inet_sock'.  A failure is logged with its errno string.  'netdev' is
 * not used. */
2062 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2064 struct in_addr any = { INADDR_ANY };
2068 memset(&rt, 0, sizeof rt);
2069 make_in4_sockaddr(&rt.rt_dst, any);
2070 make_in4_sockaddr(&rt.rt_gateway, router);
2071 make_in4_sockaddr(&rt.rt_genmask, any);
2072 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2073 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2075 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
/* Looks for a route to 'host' by scanning "/proc/net/route".
 *
 * '*netdev_name' is set to NULL at the start.  Each parsed line must yield 11
 * fields, otherwise it is logged as unparseable; routes without RTF_UP are
 * ignored.  For a route whose masked destination matches 'host',
 * '*next_hop' receives the gateway (0 when the host is directly reachable)
 * and '*netdev_name' receives a copy of the interface name made with
 * xstrdup(). */
2081 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2084 static const char fn[] = "/proc/net/route";
2089 *netdev_name = NULL;
2090 stream = fopen(fn, "r");
2091 if (stream == NULL) {
2092 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2097 while (fgets(line, sizeof line, stream)) {
2100 ovs_be32 dest, gateway, mask;
2101 int refcnt, metric, mtu;
2102 unsigned int flags, use, window, irtt;
2105 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2107 iface, &dest, &gateway, &flags, &refcnt,
2108 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2110 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2114 if (!(flags & RTF_UP)) {
2115 /* Skip routes that aren't up. */
2119 /* The output of 'dest', 'mask', and 'gateway' were given in
2120 * network byte order, so we don't need any endian
2121 * conversions here. */
2122 if ((dest & mask) == (host->s_addr & mask)) {
2124 /* The host is directly reachable. */
2125 next_hop->s_addr = 0;
2127 /* To reach the host, we must go through a gateway. */
2128 next_hop->s_addr = gateway;
2130 *netdev_name = xstrdup(iface);
/* Queries driver information for 'netdev' through netdev_linux_do_ethtool()
 * into a zeroed 'struct ethtool_drvinfo', and adds "driver_name",
 * "driver_version" and "firmware_version" entries to 'sh', each a copy made
 * with xstrdup(). */
2142 netdev_linux_get_status(const struct netdev *netdev, struct shash *sh)
2144 struct ethtool_drvinfo drvinfo;
2147 memset(&drvinfo, 0, sizeof drvinfo);
2148 error = netdev_linux_do_ethtool(netdev_get_name(netdev),
2149 (struct ethtool_cmd *)&drvinfo,
2151 "ETHTOOL_GDRVINFO");
2153 shash_add(sh, "driver_name", xstrdup(drvinfo.driver));
2154 shash_add(sh, "driver_version", xstrdup(drvinfo.version));
2155 shash_add(sh, "firmware_version", xstrdup(drvinfo.fw_version));
2161 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2162 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2163 * returns 0. Otherwise, it returns a positive errno value; in particular,
2164 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
/* The lookup is a SIOCGARP ioctl on 'af_inet_sock' with an arpreq that names
 * the device, carries 'ip' as an AF_INET protocol address and requests an
 * ARPHRD_ETHER hardware address.  Failures other than ENXIO are logged. */
2166 netdev_linux_arp_lookup(const struct netdev *netdev,
2167 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2170 struct sockaddr_in sin;
2173 memset(&r, 0, sizeof r);
2174 memset(&sin, 0, sizeof sin);
2175 sin.sin_family = AF_INET;
2176 sin.sin_addr.s_addr = ip;
2178 memcpy(&r.arp_pa, &sin, sizeof sin);
2179 r.arp_ha.sa_family = ARPHRD_ETHER;
2181 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2182 COVERAGE_INC(netdev_arp_lookup);
2183 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2185 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2186 } else if (retval != ENXIO) {
2187 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2188 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
/* Translates the NETDEV_UP and NETDEV_PROMISC bits of 'nd' into interface
 * flag bits -- presumably IFF_UP and IFF_PROMISC, confirm.
 * netdev_linux_update_flags() uses the result to mask and set device flags. */
2194 nd_to_iff_flags(enum netdev_flags nd)
2197 if (nd & NETDEV_UP) {
2200 if (nd & NETDEV_PROMISC) {
/* Translates interface flag bits in 'iff' into 'enum netdev_flags' bits,
 * starting from 0; IFF_PROMISC maps to NETDEV_PROMISC. */
2207 iff_to_nd_flags(int iff)
2209 enum netdev_flags nd = 0;
2213 if (iff & IFF_PROMISC) {
2214 nd |= NETDEV_PROMISC;
/* Reads the flags of 'netdev' with get_flags() and reports them, converted to
 * netdev flags, in '*old_flagsp'.  The new flags are the old ones with the
 * bits for 'off' cleared and the bits for 'on' set; set_flags() is called
 * only when they differ from the old flags. */
2220 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2221 enum netdev_flags on, enum netdev_flags *old_flagsp)
2223 int old_flags, new_flags;
2226 error = get_flags(netdev, &old_flags);
2228 *old_flagsp = iff_to_nd_flags(old_flags);
2229 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2230 if (new_flags != old_flags) {
2231 error = set_flags(netdev, new_flags);
/* Returns the 'change_seq' member of the device underlying 'netdev'. */
2238 netdev_linux_change_seq(const struct netdev *netdev)
2240 return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
/* Expands to the initializer used for the 'struct netdev_class' definitions
 * that follow.  NAME, CREATE, ENUMERATE and SET_STATS are supplied per class;
 * the other slots are the netdev_linux_* functions listed here, with NULL for
 * the get_config and set_config slots. */
2243 #define NETDEV_LINUX_CLASS(NAME, CREATE, ENUMERATE, SET_STATS) \
2247 netdev_linux_init, \
2249 netdev_linux_wait, \
2252 netdev_linux_destroy, \
2253 NULL, /* get_config */ \
2254 NULL, /* set_config */ \
2256 netdev_linux_open, \
2257 netdev_linux_close, \
2261 netdev_linux_listen, \
2262 netdev_linux_recv, \
2263 netdev_linux_recv_wait, \
2264 netdev_linux_drain, \
2266 netdev_linux_send, \
2267 netdev_linux_send_wait, \
2269 netdev_linux_set_etheraddr, \
2270 netdev_linux_get_etheraddr, \
2271 netdev_linux_get_mtu, \
2272 netdev_linux_get_ifindex, \
2273 netdev_linux_get_carrier, \
2274 netdev_linux_set_miimon_interval, \
2275 netdev_linux_get_stats, \
2278 netdev_linux_get_features, \
2279 netdev_linux_set_advertisements, \
2280 netdev_linux_get_vlan_vid, \
2282 netdev_linux_set_policing, \
2283 netdev_linux_get_qos_types, \
2284 netdev_linux_get_qos_capabilities, \
2285 netdev_linux_get_qos, \
2286 netdev_linux_set_qos, \
2287 netdev_linux_get_queue, \
2288 netdev_linux_set_queue, \
2289 netdev_linux_delete_queue, \
2290 netdev_linux_get_queue_stats, \
2291 netdev_linux_dump_queues, \
2292 netdev_linux_dump_queue_stats, \
2294 netdev_linux_get_in4, \
2295 netdev_linux_set_in4, \
2296 netdev_linux_get_in6, \
2297 netdev_linux_add_router, \
2298 netdev_linux_get_next_hop, \
2299 netdev_linux_get_status, \
2300 netdev_linux_arp_lookup, \
2302 netdev_linux_update_flags, \
2304 netdev_linux_change_seq \
/* Class built with netdev_linux_create as the create hook,
 * netdev_linux_enumerate as the enumerate hook and no set_stats hook.
 * netdev_linux_miimon_run() and netdev_linux_miimon_wait() iterate over the
 * devices of this class. */
2307 const struct netdev_class netdev_linux_class =
2310 netdev_linux_create,
2311 netdev_linux_enumerate,
2312 NULL); /* set_stats */
/* Class built with netdev_linux_create_tap as the create hook and neither an
 * enumerate nor a set_stats hook. */
2314 const struct netdev_class netdev_tap_class =
2317 netdev_linux_create_tap,
2318 NULL, /* enumerate */
2319 NULL); /* set_stats */
/* Class built with netdev_linux_create as the create hook, no enumerate hook,
 * and netdev_vport_set_stats as the set_stats hook. */
2321 const struct netdev_class netdev_internal_class =
2324 netdev_linux_create,
2325 NULL, /* enumerate */
2326 netdev_vport_set_stats);
2328 /* HTB traffic control class. */
/* Largest class minor number that htb_parse_tcmsg__() accepts: handles 1:1
 * through 1:HTB_N_QUEUES map to queue IDs 0 through HTB_N_QUEUES - 1. */
2330 #define HTB_N_QUEUES 0xf000
/* Member of 'struct htb': the qdisc-wide rate limit that htb_install__()
 * records and htb_parse_class_details__() uses as the upper bound for class
 * rates. */
2334 unsigned int max_rate; /* In bytes/s. */
/* Members of 'struct htb_class', the per-queue HTB state.  'tc_queue' is the
 * embedded generic queue; htb_class_cast__() converts a pointer to it back to
 * the enclosing structure. */
2338 struct tc_queue tc_queue;
2339 unsigned int min_rate; /* In bytes/s. */
2340 unsigned int max_rate; /* In bytes/s. */
2341 unsigned int burst; /* In bytes. */
2342 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the 'struct htb' whose 'tc' member is the device's 'tc' pointer
 * (CONTAINER_OF on 'netdev_dev->tc'). */
2346 htb_get__(const struct netdev *netdev)
2348 struct netdev_dev_linux *netdev_dev =
2349 netdev_dev_linux_cast(netdev_get_dev(netdev));
2350 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
/* Allocates a 'struct htb', initializes its 'tc' member with tc_ops_htb,
 * records 'max_rate' in it and makes it the device's 'tc'.
 *
 * NOTE(review): 'max_rate' is a uint64_t stored into what appears to be an
 * 'unsigned int' member -- confirm that truncation cannot matter. */
2354 htb_install__(struct netdev *netdev, uint64_t max_rate)
2356 struct netdev_dev_linux *netdev_dev =
2357 netdev_dev_linux_cast(netdev_get_dev(netdev));
2360 htb = xmalloc(sizeof *htb);
2361 tc_init(&htb->tc, &tc_ops_htb);
2362 htb->max_rate = max_rate;
2364 netdev_dev->tc = &htb->tc;
2367 /* Create an HTB qdisc.
2369 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
/* Any existing qdisc is deleted first with tc_del_qdisc().  The request is
 * RTM_NEWQDISC with NLM_F_EXCL | NLM_F_CREATE, handle 1:0, parent TC_H_ROOT
 * and kind "htb"; its nested TCA_OPTIONS attribute carries a
 * 'struct tc_htb_glob' as TCA_HTB_INIT.  Returns the tc_transact() status. */
2371 htb_setup_qdisc__(struct netdev *netdev)
2374 struct tc_htb_glob opt;
2375 struct ofpbuf request;
2376 struct tcmsg *tcmsg;
2378 tc_del_qdisc(netdev);
2380 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2381 NLM_F_EXCL | NLM_F_CREATE, &request);
2385 tcmsg->tcm_handle = tc_make_handle(1, 0);
2386 tcmsg->tcm_parent = TC_H_ROOT;
2388 nl_msg_put_string(&request, TCA_KIND, "htb");
2390 memset(&opt, 0, sizeof opt);
2391 opt.rate2quantum = 10;
2395 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2396 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2397 nl_msg_end_nested(&request, opt_offset);
2399 return tc_transact(&request, NULL);
2402 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2403 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
/* The 'struct tc_htb_opt' is filled from 'class' and the device MTU: 'rate'
 * and 'ceil' come from 'min_rate' and 'max_rate', the two buffers from
 * 'burst', and 'prio' from 'priority'.  An MTU that reads back as INT_MAX
 * triggers a warning.  The request is RTM_NEWTCLASS with NLM_F_CREATE and
 * carries TCA_HTB_PARMS plus rate tables for 'rate' and 'ceil'.  A failed
 * transaction is logged together with the class parameters. */
2405 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2406 unsigned int parent, struct htb_class *class)
2409 struct tc_htb_opt opt;
2410 struct ofpbuf request;
2411 struct tcmsg *tcmsg;
2415 netdev_get_mtu(netdev, &mtu);
2416 if (mtu == INT_MAX) {
2417 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2418 netdev_get_name(netdev));
2422 memset(&opt, 0, sizeof opt);
2423 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2424 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2425 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2426 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2427 opt.prio = class->priority;
2429 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2433 tcmsg->tcm_handle = handle;
2434 tcmsg->tcm_parent = parent;
2436 nl_msg_put_string(&request, TCA_KIND, "htb");
2437 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2438 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2439 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2440 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2441 nl_msg_end_nested(&request, opt_offset);
2443 error = tc_transact(&request, NULL);
2445 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2446 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2447 netdev_get_name(netdev),
2448 tc_get_major(handle), tc_get_minor(handle),
2449 tc_get_major(parent), tc_get_minor(parent),
2450 class->min_rate, class->max_rate,
2451 class->burst, class->priority, strerror(error));
2456 /* Parses Netlink attributes in 'nl_options' for HTB parameters and stores a
2457 * description of them into 'class'. The description complies with the
2458 * specification given in the vswitch database documentation for linux-htb
/* Parses the nested attributes of 'nl_options' with a policy that requires a
 * TCA_HTB_PARMS attribute of at least sizeof(struct tc_htb_opt) bytes; a
 * parse failure is logged.  From that attribute, 'class' receives the rate as
 * 'min_rate', the ceil rate as 'max_rate', the buffer converted with
 * tc_ticks_to_bytes() as 'burst', and the prio as 'priority'. */
2461 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2463 static const struct nl_policy tca_htb_policy[] = {
2464 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2465 .min_len = sizeof(struct tc_htb_opt) },
2468 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2469 const struct tc_htb_opt *htb;
2471 if (!nl_parse_nested(nl_options, tca_htb_policy,
2472 attrs, ARRAY_SIZE(tca_htb_policy))) {
2473 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2477 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2478 class->min_rate = htb->rate.rate;
2479 class->max_rate = htb->ceil.rate;
2480 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2481 class->priority = htb->prio;
/* Parses the traffic-class message 'tcmsg' with tc_parse_class(), which also
 * receives 'stats'.
 *
 * When parsing succeeds and 'queue_id' is non-null, a handle with major 1 and
 * a minor from 1 through HTB_N_QUEUES yields queue ID minor - 1.  When
 * 'options' is non-null and no error has occurred, the class options are
 * parsed into it with htb_parse_tca_options__(). */
2486 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2487 struct htb_class *options,
2488 struct netdev_queue_stats *stats)
2490 struct nlattr *nl_options;
2491 unsigned int handle;
2494 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2495 if (!error && queue_id) {
2496 unsigned int major = tc_get_major(handle);
2497 unsigned int minor = tc_get_minor(handle);
2498 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2499 *queue_id = minor - 1;
2504 if (!error && options) {
2505 error = htb_parse_tca_options__(nl_options, options);
/* Fills 'hc' with the root-class parameters for an HTB qdisc on 'netdev'.
 *
 * 'max_rate' is the "max-rate" entry of 'details' divided by 8 (the member is
 * documented as bytes/s, so the entry is presumably in bits/s).  When the
 * entry is missing or yields 0, the rate falls back to the current link
 * features reported by netdev_get_features(), converted with
 * netdev_features_to_bps() and divided by 8.  'min_rate' is set equal to
 * 'max_rate'. */
2511 htb_parse_qdisc_details__(struct netdev *netdev,
2512 const struct shash *details, struct htb_class *hc)
2514 const char *max_rate_s;
2516 max_rate_s = shash_find_data(details, "max-rate");
2517 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2518 if (!hc->max_rate) {
2521 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2522 hc->max_rate = netdev_features_to_bps(current) / 8;
2524 hc->min_rate = hc->max_rate;
/* Fills 'hc' from the "min-rate", "max-rate", "burst" and "priority" entries
 * of 'details'; the first three are divided by 8 after parsing.
 *
 * 'min_rate' is raised to at least the MTU and then capped at the qdisc's
 * 'max_rate'.  'max_rate' is raised to at least 'min_rate' and capped the
 * same way.  'burst' is raised to at least MTU + 64.  A missing "priority"
 * gives 0.  An MTU that reads back as INT_MAX triggers a warning. */
2530 htb_parse_class_details__(struct netdev *netdev,
2531 const struct shash *details, struct htb_class *hc)
2533 const struct htb *htb = htb_get__(netdev);
2534 const char *min_rate_s = shash_find_data(details, "min-rate");
2535 const char *max_rate_s = shash_find_data(details, "max-rate");
2536 const char *burst_s = shash_find_data(details, "burst");
2537 const char *priority_s = shash_find_data(details, "priority");
2540 netdev_get_mtu(netdev, &mtu);
2541 if (mtu == INT_MAX) {
2542 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2543 netdev_get_name(netdev));
2547 /* HTB requires at least an mtu sized min-rate to send any traffic even
2548 * on uncongested links. */
2549 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2550 hc->min_rate = MAX(hc->min_rate, mtu);
2551 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2554 hc->max_rate = (max_rate_s
2555 ? strtoull(max_rate_s, NULL, 10) / 8
2557 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2558 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2562 * According to hints in the documentation that I've read, it is important
2563 * that 'burst' be at least as big as the largest frame that might be
2564 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2565 * but having it a bit too small is a problem. Since netdev_get_mtu()
2566 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2567 * the MTU. We actually add 64, instead of 14, as a guard against
2568 * additional headers get tacked on somewhere that we're not aware of. */
2569 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2570 hc->burst = MAX(hc->burst, mtu + 64);
2573 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Asks the kernel about class 'handle' under 'parent' on 'netdev' with
 * tc_query_class() and passes the reply to htb_parse_tcmsg__() (with a null
 * 'queue_id') to fill '*options' and '*stats'.  The reply buffer is deleted
 * after parsing. */
2579 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2580 unsigned int parent, struct htb_class *options,
2581 struct netdev_queue_stats *stats)
2583 struct ofpbuf *reply;
2586 error = tc_query_class(netdev, handle, parent, &reply);
2588 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2589 ofpbuf_delete(reply);
/* Installs HTB on 'netdev': sets up the qdisc with htb_setup_qdisc__(), parses
 * the qdisc-level 'details', configures class 1:fffe under parent 1:0 with
 * them, and records the resulting maximum rate through htb_install__().
 *
 * NOTE(review): the error checks between these steps are not visible in this
 * listing. */
2595 htb_tc_install(struct netdev *netdev, const struct shash *details)
2599 error = htb_setup_qdisc__(netdev);
2601 struct htb_class hc;
2603 htb_parse_qdisc_details__(netdev, details, &hc);
2604 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2605 tc_make_handle(1, 0), &hc);
2607 htb_install__(netdev, hc.max_rate);
/* Converts 'queue' to the struct htb_class that embeds it as its 'tc_queue'
 * member. */
2613 static struct htb_class *
2614 htb_class_cast__(const struct tc_queue *queue)
2616 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Copies the parameters in '*hc' into the cached entry for 'queue_id' on
 * 'netdev'.  The entry is looked up with tc_find_queue__(); the xmalloc() and
 * hmap_insert() lines create and register a new entry (presumably only when
 * the lookup finds nothing -- the branch lines are missing from this
 * listing). */
2620 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2621 const struct htb_class *hc)
2623 struct htb *htb = htb_get__(netdev);
2624 size_t hash = hash_int(queue_id, 0);
2625 struct tc_queue *queue;
2626 struct htb_class *hcp;
2628 queue = tc_find_queue__(netdev, queue_id, hash);
2630 hcp = htb_class_cast__(queue);
2632 hcp = xmalloc(sizeof *hcp);
2633 queue = &hcp->tc_queue;
2634 queue->queue_id = queue_id;
2635 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2638 hcp->min_rate = hc->min_rate;
2639 hcp->max_rate = hc->max_rate;
2640 hcp->burst = hc->burst;
2641 hcp->priority = hc->priority;
/* Loads HTB state for 'netdev' from the kernel: queries class 1:fffe for the
 * qdisc-wide maximum rate, records it with htb_install__(), then walks a queue
 * dump and caches every class that htb_parse_tcmsg__() accepts.  The 'nlmsg'
 * argument is unused. */
2645 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2648 struct nl_dump dump;
2649 struct htb_class hc;
2651 /* Get qdisc options. */
/* The result of the query is not examined on the line below. */
2653 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2654 htb_install__(netdev, hc.max_rate);
2657 if (!start_queue_dump(netdev, &dump)) {
2660 while (nl_dump_next(&dump, &msg)) {
2661 unsigned int queue_id;
/* A zero return from the parser means success. */
2663 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2664 htb_update_queue__(netdev, queue_id, &hc);
2667 nl_dump_done(&dump);
/* Tears down the struct htb that embeds 'tc', removing every cached class from
 * its queue map.
 *
 * NOTE(review): the statements that release the memory are not visible in
 * this listing -- confirm against the file. */
2673 htb_tc_destroy(struct tc *tc)
2675 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2676 struct htb_class *hc, *next;
2678 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2679 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Reports the qdisc-wide maximum rate in 'details' under "max-rate": the
 * stored value multiplied by 8, formatted as a decimal string. */
2687 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2689 const struct htb *htb = htb_get__(netdev);
2690 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
/* Reconfigures the qdisc-level parameters: re-parses 'details', re-submits
 * class 1:fffe (parent 1:0) with them, and updates the cached maximum rate
 * (presumably only on success -- the check is not visible in this listing). */
2695 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2697 struct htb_class hc;
2700 htb_parse_qdisc_details__(netdev, details, &hc);
2701 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2702 tc_make_handle(1, 0), &hc);
2704 htb_get__(netdev)->max_rate = hc.max_rate;
/* Adds the cached parameters of 'queue' to 'details'.  Rates and burst are
 * multiplied by 8 for reporting; "max-rate" is added when it differs from
 * "min-rate".  'netdev' is unused.
 *
 * NOTE(review): the condition (if any) guarding "priority" is not visible. */
2710 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2711 const struct tc_queue *queue, struct shash *details)
2713 const struct htb_class *hc = htb_class_cast__(queue);
2715 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2716 if (hc->min_rate != hc->max_rate) {
2717 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2719 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2721 shash_add(details, "priority", xasprintf("%u", hc->priority));
/* Creates or updates queue 'queue_id': parses 'details', configures kernel
 * class 1:('queue_id' + 1) under parent 1:fffe, and refreshes the cached
 * entry with htb_update_queue__().
 *
 * NOTE(review): the error checks between these steps are not visible. */
2727 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2728 const struct shash *details)
2730 struct htb_class hc;
2733 error = htb_parse_class_details__(netdev, details, &hc);
2738 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2739 tc_make_handle(1, 0xfffe), &hc);
2744 htb_update_queue__(netdev, queue_id, &hc);
/* Deletes kernel class 1:('queue->queue_id' + 1) on 'netdev' and removes
 * 'queue' from the cached queue map.
 *
 * NOTE(review): the success check and the release of 'hc' are not visible in
 * this listing. */
2749 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2751 struct htb_class *hc = htb_class_cast__(queue);
2752 struct htb *htb = htb_get__(netdev);
2755 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2757 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches statistics for 'queue' into '*stats' by querying kernel class
 * 1:('queue->queue_id' + 1) under parent 1:fffe.  Class options are not
 * requested (NULL is passed for them). */
2764 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2765 struct netdev_queue_stats *stats)
2767 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2768 tc_make_handle(1, 0xfffe), NULL, stats);
/* Parses one class message 'nlmsg' and, when its handle is 1:N with
 * 1 <= N <= HTB_N_QUEUES, calls 'cb' with queue ID N - 1, the parsed
 * statistics, and 'aux'.  'netdev' is unused. */
2772 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2773 const struct ofpbuf *nlmsg,
2774 netdev_dump_queue_stats_cb *cb, void *aux)
2776 struct netdev_queue_stats stats;
2777 unsigned int handle, major, minor;
2780 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2785 major = tc_get_major(handle);
2786 minor = tc_get_minor(handle);
2787 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2788 (*cb)(minor - 1, &stats, aux);
/* Operations table for the "linux-htb" QoS type (Linux qdisc name "htb").
 *
 * NOTE(review): the entries between n_queues and class_get_stats are missing
 * from this listing. */
2793 static const struct tc_ops tc_ops_htb = {
2794 "htb", /* linux_name */
2795 "linux-htb", /* ovs_name */
2796 HTB_N_QUEUES, /* n_queues */
2805 htb_class_get_stats, /* class_get_stats */
2806 htb_class_dump_stats /* class_dump_stats */
2809 /* "linux-hfsc" traffic control class. */
/* Largest class minor number accepted as an HFSC queue: handles 1:1 through
 * 1:HFSC_N_QUEUES map to queue IDs 0 through HFSC_N_QUEUES - 1. */
2811 #define HFSC_N_QUEUES 0xf000
/* NOTE(review): the struct declarations that the member below belongs to are
 * missing from this listing. */
2819 struct tc_queue tc_queue;
/* Returns the struct hfsc that embeds the 'tc' pointer stored in 'netdev''s
 * netdev_dev_linux.  No check is made here that the attached tc is an HFSC
 * one. */
2824 static struct hfsc *
2825 hfsc_get__(const struct netdev *netdev)
2827 struct netdev_dev_linux *netdev_dev;
2828 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2829 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
/* Converts 'queue' to the struct hfsc_class that embeds it as its 'tc_queue'
 * member. */
2832 static struct hfsc_class *
2833 hfsc_class_cast__(const struct tc_queue *queue)
2835 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates a struct hfsc, initializes its 'tc' with tc_ops_hfsc, records
 * 'max_rate' in it, and attaches it to 'netdev''s device as its tc. */
2839 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2841 struct netdev_dev_linux * netdev_dev;
2844 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2845 hfsc = xmalloc(sizeof *hfsc);
2846 tc_init(&hfsc->tc, &tc_ops_hfsc);
2847 hfsc->max_rate = max_rate;
2848 netdev_dev->tc = &hfsc->tc;
/* Copies the rates in '*hc' into the cached entry for 'queue_id' on 'netdev'.
 * The entry is looked up with tc_find_queue__(); the xmalloc() and
 * hmap_insert() lines create and register a new entry (presumably only when
 * the lookup finds nothing -- the branch lines are missing from this
 * listing). */
2852 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2853 const struct hfsc_class *hc)
2857 struct hfsc_class *hcp;
2858 struct tc_queue *queue;
2860 hfsc = hfsc_get__(netdev);
2861 hash = hash_int(queue_id, 0);
2863 queue = tc_find_queue__(netdev, queue_id, hash);
2865 hcp = hfsc_class_cast__(queue);
2867 hcp = xmalloc(sizeof *hcp);
2868 queue = &hcp->tc_queue;
2869 queue->queue_id = queue_id;
2870 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
2873 hcp->min_rate = hc->min_rate;
2874 hcp->max_rate = hc->max_rate;
/* Decodes the nested HFSC attributes in 'nl_options' into '*class'.  Each of
 * TCA_HFSC_RSC, TCA_HFSC_FSC and TCA_HFSC_USC is read as a struct
 * tc_service_curve.  A warning is logged when the attributes fail to parse,
 * when any curve has a nonzero 'm1' or 'd', when rsc->m2 differs from
 * fsc->m2, or when rsc->m2 exceeds usc->m2.  The stored 'min_rate' is fsc->m2
 * and the stored 'max_rate' is usc->m2.
 *
 * NOTE(review): the statements that follow each warning, and the final
 * return, are not visible in this listing. */
2878 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2880 const struct tc_service_curve *rsc, *fsc, *usc;
2881 static const struct nl_policy tca_hfsc_policy[] = {
2883 .type = NL_A_UNSPEC,
2885 .min_len = sizeof(struct tc_service_curve),
2888 .type = NL_A_UNSPEC,
2890 .min_len = sizeof(struct tc_service_curve),
2893 .type = NL_A_UNSPEC,
2895 .min_len = sizeof(struct tc_service_curve),
2898 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
2900 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
2901 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
2902 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
2906 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
2907 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
2908 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
/* Only curves described by 'm2' alone are accepted. */
2910 if (rsc->m1 != 0 || rsc->d != 0 ||
2911 fsc->m1 != 0 || fsc->d != 0 ||
2912 usc->m1 != 0 || usc->d != 0) {
2913 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2914 "Non-linear service curves are not supported.");
2918 if (rsc->m2 != fsc->m2) {
2919 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2920 "Real-time service curves are not supported ");
2924 if (rsc->m2 > usc->m2) {
2925 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2926 "Min-rate service curve is greater than "
2927 "the max-rate service curve.");
2931 class->min_rate = fsc->m2;
2932 class->max_rate = usc->m2;
/* Parses class message 'tcmsg' with tc_parse_class(), which also receives
 * 'stats'.  A handle of the form 1:N with 1 <= N <= HFSC_N_QUEUES is stored
 * into '*queue_id' as N - 1, and the class's option attribute is decoded into
 * '*options' with hfsc_parse_tca_options__().
 *
 * NOTE(review): the conditions guarding these steps (presumably null checks
 * on 'queue_id' and 'options') and the returns are missing from this
 * listing. */
2937 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2938 struct hfsc_class *options,
2939 struct netdev_queue_stats *stats)
2942 unsigned int handle;
2943 struct nlattr *nl_options;
2945 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2951 unsigned int major, minor;
2953 major = tc_get_major(handle);
2954 minor = tc_get_minor(handle);
2955 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
2956 *queue_id = minor - 1;
2963 error = hfsc_parse_tca_options__(nl_options, options);
/* Asks the kernel about class 'handle' under 'parent' on 'netdev' with
 * tc_query_class() and passes the reply to hfsc_parse_tcmsg__() (with a null
 * 'queue_id') to fill '*options' and '*stats'.  The reply buffer is deleted
 * after parsing. */
2970 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
2971 unsigned int parent, struct hfsc_class *options,
2972 struct netdev_queue_stats *stats)
2975 struct ofpbuf *reply;
2977 error = tc_query_class(netdev, handle, parent, &reply);
2982 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
2983 ofpbuf_delete(reply);
/* Fills '*class' from the qdisc-level 'details'.  "max-rate" is read as a
 * decimal string and divided by 8; the visible fallback derives the rate from
 * the device's features via netdev_features_to_bps(), again divided by 8
 * (presumably used when "max-rate" is absent or zero -- the condition is not
 * visible).  Both 'min_rate' and 'max_rate' are set to the result. */
2988 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
2989 struct hfsc_class *class)
2992 const char *max_rate_s;
2994 max_rate_s = shash_find_data(details, "max-rate");
2995 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
/* NOTE(review): the second argument on the next line looks like a mis-encoded
 * "&current" -- confirm against the original file. */
3000 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
3001 max_rate = netdev_features_to_bps(current) / 8;
3004 class->min_rate = max_rate;
3005 class->max_rate = max_rate;
/* Derives the HFSC class rates in '*class' from 'details'.  "min-rate" and
 * "max-rate" are decimal strings that are divided by 8.  The minimum rate is
 * forced to at least 1 and at most the qdisc-wide hfsc->max_rate; the maximum
 * rate is raised to at least the minimum rate and capped at hfsc->max_rate. */
3009 hfsc_parse_class_details__(struct netdev *netdev,
3010 const struct shash *details,
3011 struct hfsc_class * class)
3013 const struct hfsc *hfsc;
3014 uint32_t min_rate, max_rate;
3015 const char *min_rate_s, *max_rate_s;
3017 hfsc = hfsc_get__(netdev);
3018 min_rate_s = shash_find_data(details, "min-rate");
3019 max_rate_s = shash_find_data(details, "max-rate");
3021 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3022 min_rate = MAX(min_rate, 1);
3023 min_rate = MIN(min_rate, hfsc->max_rate);
3025 max_rate = (max_rate_s
3026 ? strtoull(max_rate_s, NULL, 10) / 8
3028 max_rate = MAX(max_rate, min_rate);
3029 max_rate = MIN(max_rate, hfsc->max_rate);
3031 class->min_rate = min_rate;
3032 class->max_rate = max_rate;
3037 /* Create an HFSC qdisc.
3039 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3041 hfsc_setup_qdisc__(struct netdev * netdev)
3043 struct tcmsg *tcmsg;
3044 struct ofpbuf request;
3045 struct tc_hfsc_qopt opt;
/* Remove the existing root qdisc first; the result of the removal is not
 * examined here. */
3047 tc_del_qdisc(netdev);
3049 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3050 NLM_F_EXCL | NLM_F_CREATE, &request);
/* The new qdisc is attached at the root with handle 1:0. */
3056 tcmsg->tcm_handle = tc_make_handle(1, 0);
3057 tcmsg->tcm_parent = TC_H_ROOT;
3059 memset(&opt, 0, sizeof opt);
3062 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3063 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
/* No reply is requested from the transaction. */
3065 return tc_transact(&request, NULL);
3068 /* Create an HFSC class.
3070 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3071 * sc rate <min_rate> ul rate <max_rate>" */
3073 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3074 unsigned int parent, struct hfsc_class *class)
3078 struct tcmsg *tcmsg;
3079 struct ofpbuf request;
3080 struct tc_service_curve min, max;
3082 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3088 tcmsg->tcm_handle = handle;
3089 tcmsg->tcm_parent = parent;
3093 min.m2 = class->min_rate;
3097 max.m2 = class->max_rate;
/* The 'min' curve is sent as both TCA_HFSC_RSC and TCA_HFSC_FSC; the 'max'
 * curve is sent as TCA_HFSC_USC.  All three are nested inside TCA_OPTIONS. */
3099 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3100 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3101 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3102 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3103 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3104 nl_msg_end_nested(&request, opt_offset);
3106 error = tc_transact(&request, NULL);
/* NOTE(review): the parsers in this file divide the configured rates by 8
 * before storing them, so the "bps" label in the message below may not match
 * the unit of the values printed -- confirm. */
3108 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3109 "min-rate %ubps, max-rate %ubps (%s)",
3110 netdev_get_name(netdev),
3111 tc_get_major(handle), tc_get_minor(handle),
3112 tc_get_major(parent), tc_get_minor(parent),
3113 class->min_rate, class->max_rate, strerror(error));
/* Installs HFSC on 'netdev': sets up the qdisc with hfsc_setup_qdisc__(),
 * parses the qdisc-level 'details', configures class 1:fffe under parent 1:0
 * with them, and records the resulting maximum rate through hfsc_install__().
 *
 * NOTE(review): the error checks between these steps are not visible in this
 * listing. */
3120 hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3123 struct hfsc_class class;
3125 error = hfsc_setup_qdisc__(netdev);
3131 hfsc_parse_qdisc_details__(netdev, details, &class);
3132 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3133 tc_make_handle(1, 0), &class);
3139 hfsc_install__(netdev, class.max_rate);
/* Loads HFSC state for 'netdev' from the kernel: queries class 1:fffe for the
 * qdisc-wide maximum rate, records it with hfsc_install__(), then walks a
 * queue dump and caches every class that hfsc_parse_tcmsg__() accepts.  The
 * 'nlmsg' argument is unused. */
3144 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3147 struct nl_dump dump;
3148 struct hfsc_class hc;
/* The result of the query is not examined on the line below. */
3151 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3152 hfsc_install__(netdev, hc.max_rate);
3154 if (!start_queue_dump(netdev, &dump)) {
3158 while (nl_dump_next(&dump, &msg)) {
3159 unsigned int queue_id;
/* A zero return from the parser means success. */
3161 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3162 hfsc_update_queue__(netdev, queue_id, &hc);
3166 nl_dump_done(&dump);
/* Tears down the struct hfsc that embeds 'tc', removing every cached class
 * from its queue map.
 *
 * NOTE(review): the statements that release the memory are not visible in
 * this listing -- confirm against the file. */
3171 hfsc_tc_destroy(struct tc *tc)
3174 struct hfsc_class *hc, *next;
3176 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3178 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3179 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Reports the qdisc-wide maximum rate in 'details' under "max-rate": the
 * stored value multiplied by 8, formatted as a decimal string. */
3188 hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3190 const struct hfsc *hfsc;
3191 hfsc = hfsc_get__(netdev);
3192 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
/* Reconfigures the qdisc-level parameters: re-parses 'details', re-submits
 * class 1:fffe (parent 1:0) with them, and updates the cached maximum rate
 * (presumably only on success -- the check is not visible in this listing). */
3197 hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3200 struct hfsc_class class;
3202 hfsc_parse_qdisc_details__(netdev, details, &class);
3203 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3204 tc_make_handle(1, 0), &class);
3207 hfsc_get__(netdev)->max_rate = class.max_rate;
/* Adds the cached rates of 'queue' to 'details', multiplied by 8 for
 * reporting.  "max-rate" is added when it differs from "min-rate".  'netdev'
 * is unused. */
3214 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3215 const struct tc_queue *queue, struct shash *details)
3217 const struct hfsc_class *hc;
3219 hc = hfsc_class_cast__(queue);
3220 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3221 if (hc->min_rate != hc->max_rate) {
3222 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
/* Creates or updates queue 'queue_id': parses 'details', configures kernel
 * class 1:('queue_id' + 1) under parent 1:fffe, and refreshes the cached
 * entry with hfsc_update_queue__().
 *
 * NOTE(review): the error checks between these steps are not visible. */
3228 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3229 const struct shash *details)
3232 struct hfsc_class class;
3234 error = hfsc_parse_class_details__(netdev, details, &class);
3239 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3240 tc_make_handle(1, 0xfffe), &class);
3245 hfsc_update_queue__(netdev, queue_id, &class);
/* Deletes kernel class 1:('queue->queue_id' + 1) on 'netdev' and removes
 * 'queue' from the cached queue map.
 *
 * NOTE(review): the success check and the release of 'hc' are not visible in
 * this listing. */
3250 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3254 struct hfsc_class *hc;
3256 hc = hfsc_class_cast__(queue);
3257 hfsc = hfsc_get__(netdev);
3259 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3261 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches statistics for 'queue' into '*stats' by querying kernel class
 * 1:('queue->queue_id' + 1) under parent 1:fffe.  Class options are not
 * requested (NULL is passed for them). */
3268 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3269 struct netdev_queue_stats *stats)
3271 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3272 tc_make_handle(1, 0xfffe), NULL, stats);
/* Parses one class message 'nlmsg' and, when its handle is 1:N with
 * 1 <= N <= HFSC_N_QUEUES, calls 'cb' with queue ID N - 1, the parsed
 * statistics, and 'aux'.  'netdev' is unused. */
3276 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3277 const struct ofpbuf *nlmsg,
3278 netdev_dump_queue_stats_cb *cb, void *aux)
3280 struct netdev_queue_stats stats;
3281 unsigned int handle, major, minor;
3284 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3289 major = tc_get_major(handle);
3290 minor = tc_get_minor(handle);
3291 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3292 (*cb)(minor - 1, &stats, aux);
/* Operations table for the "linux-hfsc" QoS type (Linux qdisc name "hfsc"),
 * wiring each tc_ops slot to the matching hfsc_*() function. */
3297 static const struct tc_ops tc_ops_hfsc = {
3298 "hfsc", /* linux_name */
3299 "linux-hfsc", /* ovs_name */
3300 HFSC_N_QUEUES, /* n_queues */
3301 hfsc_tc_install, /* tc_install */
3302 hfsc_tc_load, /* tc_load */
3303 hfsc_tc_destroy, /* tc_destroy */
3304 hfsc_qdisc_get, /* qdisc_get */
3305 hfsc_qdisc_set, /* qdisc_set */
3306 hfsc_class_get, /* class_get */
3307 hfsc_class_set, /* class_set */
3308 hfsc_class_delete, /* class_delete */
3309 hfsc_class_get_stats, /* class_get_stats */
3310 hfsc_class_dump_stats /* class_dump_stats */
3313 /* "linux-default" traffic control class.
3315 * This class represents the default, unnamed Linux qdisc. It corresponds to
3316 * the "" (empty string) QoS type in the OVS database. */
/* Attaches a tc that uses tc_ops_default to 'netdev''s device.  'tc' is a
 * function-level static, so the object it points to persists across calls.
 *
 * NOTE(review): the condition guarding the allocation (presumably "only when
 * 'tc' is still null") is not visible in this listing. */
3319 default_install__(struct netdev *netdev)
3321 struct netdev_dev_linux *netdev_dev =
3322 netdev_dev_linux_cast(netdev_get_dev(netdev));
3323 static struct tc *tc;
3326 tc = xmalloc(sizeof *tc);
3327 tc_init(tc, &tc_ops_default);
3329 netdev_dev->tc = tc;
/* tc_install hook for the default qdisc: attaches the default tc to 'netdev'
 * via default_install__().  'details' is unused. */
3333 default_tc_install(struct netdev *netdev,
3334 const struct shash *details OVS_UNUSED)
3336 default_install__(netdev);
/* tc_load hook for the default qdisc: attaches the default tc to 'netdev' via
 * default_install__().  'nlmsg' is unused. */
3341 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3343 default_install__(netdev);
/* Operations table for the default Linux qdisc.  It has no Linux qdisc name,
 * and every visible hook from tc_destroy onward is NULL.
 *
 * NOTE(review): the entries between linux_name and tc_destroy are missing from
 * this listing. */
3347 static const struct tc_ops tc_ops_default = {
3348 NULL, /* linux_name */
3353 NULL, /* tc_destroy */
3354 NULL, /* qdisc_get */
3355 NULL, /* qdisc_set */
3356 NULL, /* class_get */
3357 NULL, /* class_set */
3358 NULL, /* class_delete */
3359 NULL, /* class_get_stats */
3360 NULL /* class_dump_stats */
3363 /* "linux-other" traffic control class. */
/* tc_load hook for the "linux-other" type: attaches a tc that uses
 * tc_ops_other to 'netdev''s device.  'tc' is a function-level static, so the
 * object it points to persists across calls.  'nlmsg' is unused.
 *
 * NOTE(review): the condition guarding the allocation (presumably "only when
 * 'tc' is still null") is not visible in this listing. */
3368 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3370 struct netdev_dev_linux *netdev_dev =
3371 netdev_dev_linux_cast(netdev_get_dev(netdev));
3372 static struct tc *tc;
3375 tc = xmalloc(sizeof *tc);
3376 tc_init(tc, &tc_ops_other);
3378 netdev_dev->tc = tc;
/* Operations table for the "linux-other" QoS type.  It has no Linux qdisc
 * name and cannot be installed (tc_install is NULL); every visible hook from
 * tc_destroy onward is NULL as well.
 *
 * NOTE(review): the n_queues and tc_load entries are missing from this
 * listing. */
3382 static const struct tc_ops tc_ops_other = {
3383 NULL, /* linux_name */
3384 "linux-other", /* ovs_name */
3386 NULL, /* tc_install */
3388 NULL, /* tc_destroy */
3389 NULL, /* qdisc_get */
3390 NULL, /* qdisc_set */
3391 NULL, /* class_get */
3392 NULL, /* class_set */
3393 NULL, /* class_delete */
3394 NULL, /* class_get_stats */
3395 NULL /* class_dump_stats */
3398 /* Traffic control. */
3400 /* Number of kernel "tc" ticks per second. */
3401 static double ticks_per_s;
3403 /* Number of kernel "jiffies" per second. This is used for the purpose of
3404 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3405 * one jiffy's worth of data.
3407 * There are two possibilities here:
3409 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3410 * approximate range of 100 to 1024. That means that we really need to
3411 * make sure that the qdisc can buffer that much data.
3413 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3414 * has finely granular timers and there's no need to fudge additional room
3415 * for buffers. (There's no extra effort needed to implement that: the
3416 * large 'buffer_hz' is used as a divisor, so practically any number will
3417 * come out as 0 in the division. Small integer results in the case of
3418 * really high dividends won't have any real effect anyhow.) */
3420 static unsigned int buffer_hz;
3422 /* Returns tc handle 'major':'minor'.  'major' is shifted left by 16 bits
 * and combined with 'minor' through TC_H_MAKE(). */
3424 tc_make_handle(unsigned int major, unsigned int minor)
3426 return TC_H_MAKE(major << 16, minor);
3429 /* Returns the major number from 'handle': the TC_H_MAJ() part shifted
 * right by 16 bits. */
3431 tc_get_major(unsigned int handle)
3433 return TC_H_MAJ(handle) >> 16;
3436 /* Returns the minor number from 'handle', as extracted by TC_H_MIN(). */
3438 tc_get_minor(unsigned int handle)
3440 return TC_H_MIN(handle);
/* Starts a tc Netlink request for 'netdev' in 'request', which is initialized
 * here with an initial size of 512 bytes.  The Netlink header carries 'type'
 * and 'flags' with NLM_F_REQUEST always added.  A zero-filled struct tcmsg is
 * appended with family AF_UNSPEC and 'netdev''s ifindex; its tcm_handle and
 * tcm_parent are left for the caller.
 *
 * NOTE(review): the handling of a get_ifindex() failure and the return
 * statement are not visible in this listing. */
3443 static struct tcmsg *
3444 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3445 struct ofpbuf *request)
3447 struct tcmsg *tcmsg;
3451 error = get_ifindex(netdev, &ifindex);
3456 ofpbuf_init(request, 512);
3457 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3458 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3459 tcmsg->tcm_family = AF_UNSPEC;
3460 tcmsg->tcm_ifindex = ifindex;
3461 /* Caller should fill in tcmsg->tcm_handle. */
3462 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' over rtnl_sock with nl_sock_transact(), passing 'replyp'
 * through for the reply, and then uninitializes 'request' whatever the
 * outcome of the transaction was. */
3468 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3470 int error = nl_sock_transact(rtnl_sock, request, replyp);
3471 ofpbuf_uninit(request);
3478 /* The values in psched are not individually very meaningful, but they are
3479 * important. The tables below show some values seen in the wild.
3483 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3484 * (Before that, there are hints that it was 1000000000.)
3486 * - "d" can be unrealistically large, see the comment on 'buffer_hz' above.
3490 * -----------------------------------
3491 * [1] 000c8000 000f4240 000f4240 00000064
3492 * [2] 000003e8 00000400 000f4240 3b9aca00
3493 * [3] 000003e8 00000400 000f4240 3b9aca00
3494 * [4] 000003e8 00000400 000f4240 00000064
3495 * [5] 000003e8 00000040 000f4240 3b9aca00
3496 * [6] 000003e8 00000040 000f4240 000000f9
3498 * a b c d ticks_per_s buffer_hz
3499 * ------- --------- ---------- ------------- ----------- -------------
3500 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3501 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3502 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3503 * [4] 1,000 1,024 1,000,000 100 976,562 100
3504 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3505 * [6] 1,000 64 1,000,000 249 15,625,000 249
3507 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3508 * [2] 2.6.26-1-686-bigmem from Debian lenny
3509 * [3] 2.6.26-2-sparc64 from Debian lenny
3510 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3511 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3512 * [6] 2.6.34 from kernel.org on KVM */
/* NOTE(review): the header of the enclosing function is not part of this
 * listing.  The visible statements open /proc/net/psched, read four
 * hexadecimal values a, b, c and d from it, log them, and compute
 * 'ticks_per_s' as a * c / b in floating point.  Warnings are logged when the
 * file cannot be opened or read, or when the parameters are rejected. */
3514 static const char fn[] = "/proc/net/psched";
3515 unsigned int a, b, c, d;
3521 stream = fopen(fn, "r");
3523 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3527 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3528 VLOG_WARN("%s: read failed", fn);
3532 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3536 VLOG_WARN("%s: invalid scheduler parameters", fn);
/* NOTE(review): 'b' is the divisor below; confirm that the validity check
 * preceding the warning above (not visible here) rejects b == 0. */
3540 ticks_per_s = (double) a * c / b;
3544 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3547 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3550 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3551 * rate of 'rate' bytes per second. */
3553 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3558 return (rate * ticks) / ticks_per_s;
3561 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3562 * rate of 'rate' bytes per second.
 *
 * Returns 0 when 'rate' is 0.  'ticks_per_s' is converted to an integer type
 * before the multiplication, so any fractional part of it is dropped. */
3564 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3569 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3572 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3573 * a transmission rate of 'rate' bytes per second.
 *
 * Computed as 'rate' / 'buffer_hz' with unsigned integer division. */
3575 tc_buffer_per_jiffy(unsigned int rate)
3580 return rate / buffer_hz;
3583 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3584 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3585 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3586 * stores NULL into it if it is absent.
3588 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
 * 'msg'.
3591 * Returns 0 if successful, otherwise a positive errno value. */
3593 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3594 struct nlattr **options)
/* TCA_KIND is a required string; TCA_OPTIONS is an optional nested
 * attribute. */
3596 static const struct nl_policy tca_policy[] = {
3597 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3598 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3600 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
/* Attributes start after the Netlink header and the struct tcmsg. */
3602 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3603 tca_policy, ta, ARRAY_SIZE(ta))) {
3604 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3609 *kind = nl_attr_get_string(ta[TCA_KIND]);
3613 *options = ta[TCA_OPTIONS];
3628 /* Given Netlink 'msg' that describes a class, extracts its class handle
3629 * (the tcm_handle field) into '*handlep', its TCA_OPTIONS attribute
3630 * into '*options', and its queue statistics into '*stats'. Any of the output
3631 * arguments may be null.
3633 * Returns 0 if successful, otherwise a positive errno value. */
3635 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3636 struct nlattr **options, struct netdev_queue_stats *stats)
/* Both TCA_OPTIONS and TCA_STATS2 are required nested attributes. */
3638 static const struct nl_policy tca_policy[] = {
3639 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3640 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3642 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3644 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3645 tca_policy, ta, ARRAY_SIZE(ta))) {
3646 VLOG_WARN_RL(&rl, "failed to parse class message");
3651 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3652 *handlep = tc->tcm_handle;
3656 *options = ta[TCA_OPTIONS];
3660 const struct gnet_stats_queue *gsq;
3661 struct gnet_stats_basic gsb;
3663 static const struct nl_policy stats_policy[] = {
3664 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3665 .min_len = sizeof gsb },
3666 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3667 .min_len = sizeof *gsq },
3669 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3671 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3672 sa, ARRAY_SIZE(sa))) {
3673 VLOG_WARN_RL(&rl, "failed to parse class stats");
3677 /* Alignment issues screw up the length of struct gnet_stats_basic on
3678 * some arch/bitsize combinations. Newer versions of Linux have a
3679 * struct gnet_stats_basic_packed, but we can't depend on that. The
3680 * easiest thing to do is just to make a copy. */
3681 memset(&gsb, 0, sizeof gsb);
3682 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3683 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3684 stats->tx_bytes = gsb.bytes;
3685 stats->tx_packets = gsb.packets;
/* The queue's drop counter is reported as tx_errors. */
3687 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3688 stats->tx_errors = gsq->drops;
/* NOTE(review): the line below presumably belongs to the failure path, whose
 * surrounding lines are missing from this listing. */
3698 memset(stats, 0, sizeof *stats);
3703 /* Queries the kernel for the class with identifier 'handle' and parent
 * 'parent' on 'netdev', passing 'replyp' to tc_transact() for the reply. */
/* Builds an RTM_GETTCLASS request (with NLM_F_ECHO) for class 'handle' under
 * 'parent' on 'netdev' and transacts it, passing 'replyp' through for the
 * reply.  A rate-limited warning naming the device, handle and parent is
 * logged for a failed query.
 *
 * NOTE(review): the checks around tc_make_request() and the warning, and the
 * return statement, are not visible in this listing. */
3706 tc_query_class(const struct netdev *netdev,
3707 unsigned int handle, unsigned int parent,
3708 struct ofpbuf **replyp)
3710 struct ofpbuf request;
3711 struct tcmsg *tcmsg;
3714 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3718 tcmsg->tcm_handle = handle;
3719 tcmsg->tcm_parent = parent;
3721 error = tc_transact(&request, replyp);
3723 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3724 netdev_get_name(netdev),
3725 tc_get_major(handle), tc_get_minor(handle),
3726 tc_get_major(parent), tc_get_minor(parent),
3732 /* Equivalent to "tc class del dev <name> handle <handle>".
 *
 * Sends an RTM_DELTCLASS request for 'handle' with a parent of 0 and no reply
 * requested.  A rate-limited warning naming the device and handle is logged
 * for a failed deletion. */
3734 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3736 struct ofpbuf request;
3737 struct tcmsg *tcmsg;
3740 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3744 tcmsg->tcm_handle = handle;
3745 tcmsg->tcm_parent = 0;
3747 error = tc_transact(&request, NULL);
3749 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3750 netdev_get_name(netdev),
3751 tc_get_major(handle), tc_get_minor(handle),
3757 /* Equivalent to "tc qdisc del dev <name> root".
 *
 * Sends an RTM_DELQDISC request for handle 1:0 at the root.  When no error
 * remains and a tc is attached to the device, the tc's tc_destroy hook (if
 * its ops table has one) is called and the device's 'tc' pointer is
 * cleared. */
3759 tc_del_qdisc(struct netdev *netdev)
3761 struct netdev_dev_linux *netdev_dev =
3762 netdev_dev_linux_cast(netdev_get_dev(netdev));
3763 struct ofpbuf request;
3764 struct tcmsg *tcmsg;
3767 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
3771 tcmsg->tcm_handle = tc_make_handle(1, 0);
3772 tcmsg->tcm_parent = TC_H_ROOT;
3774 error = tc_transact(&request, NULL);
3775 if (error == EINVAL) {
3776 /* EINVAL probably means that the default qdisc was in use, in which
3777 * case we've accomplished our purpose. */
3780 if (!error && netdev_dev->tc) {
3781 if (netdev_dev->tc->ops->tc_destroy) {
3782 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3784 netdev_dev->tc = NULL;
3789 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3790 * kernel to determine what they are. Returns 0 if successful, otherwise a
3791 * positive errno value. */
3793 tc_query_qdisc(const struct netdev *netdev)
3795 struct netdev_dev_linux *netdev_dev =
3796 netdev_dev_linux_cast(netdev_get_dev(netdev));
3797 struct ofpbuf request, *qdisc;
3798 const struct tc_ops *ops;
3799 struct tcmsg *tcmsg;
/* A tc already attached to the device means the qdisc type is known. */
3803 if (netdev_dev->tc) {
3807 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3808 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3809 * 2.6.35 without that fix backported to it.
3811 * To avoid the OOPS, we must not make a request that would attempt to dump
3812 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3813 * few others. There are a few ways that I can see to do this, but most of
3814 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3815 * technique chosen here is to assume that any non-default qdisc that we
3816 * create will have a class with handle 1:0. The built-in qdiscs only have
3817 * a class with handle 0:0.
3819 * We could check for Linux 2.6.35+ and use a more straightforward method
3821 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
3825 tcmsg->tcm_handle = tc_make_handle(1, 0);
3826 tcmsg->tcm_parent = 0;
3828 /* Figure out what tc class to instantiate. */
3829 error = tc_transact(&request, &qdisc);
/* Identify the qdisc from the reply by its TCA_KIND name; the options
 * attribute is not requested here. */
3833 error = tc_parse_qdisc(qdisc, &kind, NULL);
3835 ops = &tc_ops_other;
/* Map the Linux qdisc name to an ops table, falling back to tc_ops_other
 * (presumably when the lookup finds nothing -- the check is not visible). */
3837 ops = tc_lookup_linux_name(kind);
3839 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3840 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3842 ops = &tc_ops_other;
3845 } else if (error == ENOENT) {
3846 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3847 * other entity that doesn't have a handle 1:0. We will assume
3848 * that it's the system default qdisc. */
3849 ops = &tc_ops_default;
3852 /* Who knows? Maybe the device got deleted. */
3853 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3854 netdev_get_name(netdev), strerror(error));
3855 ops = &tc_ops_other;
3858 /* Instantiate it. */
/* The cast drops the const qualifier from 'netdev' for the tc_load hook. */
3859 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
/* tc_load must return 0 exactly when it attached a tc to the device. */
3860 assert((load_error == 0) == (netdev_dev->tc != NULL));
3861 ofpbuf_delete(qdisc);
/* A query error takes precedence over a load error. */
3863 return error ? error : load_error;
3866 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3867 approximate the time to transmit packets of various lengths. For an MTU of
3868 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3869 represents two possible packet lengths; for a MTU of 513 through 1024, four
3870 possible lengths; and so on.
3872 Returns, for the specified 'mtu', the number of bits that packet lengths
3873 need to be shifted right to fit within such a 256-entry table. */
3875 tc_calc_cell_log(unsigned int mtu)
/* NOTE(review): the condition under which 'mtu' is replaced by
 * ETH_PAYLOAD_MAX is not visible in this listing. */
3880 mtu = ETH_PAYLOAD_MAX;
/* The Ethernet and VLAN header lengths are added to 'mtu' before counting. */
3882 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
/* NOTE(review): the loop body (not visible) must reduce 'mtu' on each
 * iteration for this loop to terminate -- confirm against the file. */
3884 for (cell_log = 0; mtu >= 256; cell_log++) {
3891 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
 * of 'mtu'. */
/* Zeroes '*rate', then sets its 'cell_log' from tc_calc_cell_log('mtu') and
 * its 'mpu' to ETH_TOTAL_MIN.  The 'overhead' and 'cell_align' fields are
 * left at the zero from memset().
 *
 * NOTE(review): the statement that stores 'Bps' is not visible in this
 * listing. */
3894 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
3896 memset(rate, 0, sizeof *rate);
3897 rate->cell_log = tc_calc_cell_log(mtu);
3898 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3899 /* rate->cell_align = 0; */ /* distro headers. */
3900 rate->mpu = ETH_TOTAL_MIN;
3904 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
3905 * attribute of the specified "type".
3907 * See tc_calc_cell_log() above for a description of "rtab"s. */
3909 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
/* The attribute payload is TC_RTAB_SIZE bytes, filled in entry by entry
 * below. */
3914 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
3915 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
/* Entry 'i' covers packet size (i + 1) << cell_log, raised to at least
 * 'mpu'; the stored value is the transmit time in ticks at rate->rate. */
3916 unsigned packet_size = (i + 1) << rate->cell_log;
3917 if (packet_size < rate->mpu) {
3918 packet_size = rate->mpu;
3920 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
3924 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
3925 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
3926 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
 * 0 is fine.) */
3929 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
/* Lower bound on the burst: tc_buffer_per_jiffy(Bps) plus one 'mtu'. */
3931 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
/* Take the larger of the requested burst and that lower bound, then convert
 * it to ticks at rate 'Bps'. */
3932 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
3935 /* Public utility functions. */
/* Field-by-field copy of the counters listed below from 'src' to 'dst'.
 * The expansion refers to the identifiers 'src' and 'dst' directly, so both
 * must be pointers in scope where the macro is used.  The last assignment has
 * no trailing semicolon; the user of the macro supplies it. */
3937 #define COPY_NETDEV_STATS \
3938 dst->rx_packets = src->rx_packets; \
3939 dst->tx_packets = src->tx_packets; \
3940 dst->rx_bytes = src->rx_bytes; \
3941 dst->tx_bytes = src->tx_bytes; \
3942 dst->rx_errors = src->rx_errors; \
3943 dst->tx_errors = src->tx_errors; \
3944 dst->rx_dropped = src->rx_dropped; \
3945 dst->tx_dropped = src->tx_dropped; \
3946 dst->multicast = src->multicast; \
3947 dst->collisions = src->collisions; \
3948 dst->rx_length_errors = src->rx_length_errors; \
3949 dst->rx_over_errors = src->rx_over_errors; \
3950 dst->rx_crc_errors = src->rx_crc_errors; \
3951 dst->rx_frame_errors = src->rx_frame_errors; \
3952 dst->rx_fifo_errors = src->rx_fifo_errors; \
3953 dst->rx_missed_errors = src->rx_missed_errors; \
3954 dst->tx_aborted_errors = src->tx_aborted_errors; \
3955 dst->tx_carrier_errors = src->tx_carrier_errors; \
3956 dst->tx_fifo_errors = src->tx_fifo_errors; \
3957 dst->tx_heartbeat_errors = src->tx_heartbeat_errors; \
3958 dst->tx_window_errors = src->tx_window_errors
3960 /* Copies 'src' into 'dst', performing format conversion in the process. */
/* Direction: 'struct rtnl_link_stats' to 'struct netdev_stats'. */
3962 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
3963 const struct rtnl_link_stats *src)
/* NOTE(review): the body is not visible in this excerpt; presumably it
 * expands COPY_NETDEV_STATS -- confirm. */
3968 /* Copies 'src' into 'dst', performing format conversion in the process. */
/* Direction: 'struct rtnl_link_stats64' to 'struct netdev_stats'. */
3970 netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
3971 const struct rtnl_link_stats64 *src)
/* NOTE(review): the body is not visible in this excerpt; presumably it
 * expands COPY_NETDEV_STATS -- confirm. */
3976 /* Copies 'src' into 'dst', performing format conversion in the process. */
/* Direction: 'struct netdev_stats' to 'struct rtnl_link_stats64'. */
3978 netdev_stats_to_rtnl_link_stats64(struct rtnl_link_stats64 *dst,
3979 const struct netdev_stats *src)
/* The compression counters are not among the fields listed in
 * COPY_NETDEV_STATS, so they are set to 0 explicitly here. */
3982 dst->rx_compressed = 0;
3983 dst->tx_compressed = 0;
3986 /* Utility functions. */
/* Requests link information for the interface with index 'ifindex' using an
 * RTM_GETLINK message on 'rtnl_sock' and, when the reply carries an
 * IFLA_STATS attribute, converts it into '*stats'. */
3989 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
3991 /* Policy for RTNLGRP_LINK messages.
3993 * There are *many* more fields in these messages, but currently we only
3994 * care about these fields. */
3995 static const struct nl_policy rtnlgrp_link_policy[] = {
3996 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
3997 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
3998 .min_len = sizeof(struct rtnl_link_stats) },
4001 struct ofpbuf request;
4002 struct ofpbuf *reply;
4003 struct ifinfomsg *ifi;
4004 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
/* Build the request: a netlink header followed by a zeroed ifinfomsg that
 * identifies the interface by index. */
4007 ofpbuf_init(&request, 0);
4008 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4009 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4010 ifi->ifi_family = PF_UNSPEC;
4011 ifi->ifi_index = ifindex;
/* Exchange the request for a reply; the request buffer is released right
 * after the transaction regardless of its outcome. */
4012 error = nl_sock_transact(rtnl_sock, &request, &reply);
4013 ofpbuf_uninit(&request);
/* Parse the attributes that follow the netlink header and the ifinfomsg.
 * The reply is freed on this failure path. */
4018 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4019 rtnlgrp_link_policy,
4020 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4021 ofpbuf_delete(reply);
/* IFLA_STATS is optional in the policy, so a successful parse does not
 * guarantee that it is present. */
4025 if (!attrs[IFLA_STATS]) {
4026 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4027 ofpbuf_delete(reply);
/* The policy requires at least sizeof(struct rtnl_link_stats) bytes of
 * payload for this attribute. */
4031 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4033 ofpbuf_delete(reply);
/* Reads the counters for 'netdev_name' from the text file named by 'fn'
 * into '*stats', scanning it line by line until a matching device name is
 * found. */
4039 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4041 static const char fn[] = "/proc/net/dev";
4046 stream = fopen(fn, "r");
4048 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4053 while (fgets(line, sizeof line, stream)) {
/* X64 is the scanf conversion for one uint64_t counter.  Each row of the
 * format below stores seven counters and then skips one column unstored. */
4056 #define X64 "%"SCNu64
4059 X64 X64 X64 X64 X64 X64 X64 "%*u"
4060 X64 X64 X64 X64 X64 X64 X64 "%*u",
4066 &stats->rx_fifo_errors,
4067 &stats->rx_frame_errors,
4073 &stats->tx_fifo_errors,
4075 &stats->tx_carrier_errors) != 15) {
/* Fewer than 15 stored conversions.  NOTE(review): the start of the format
 * and most of the argument list are not visible in this excerpt; presumably
 * the fifteenth conversion is the device name stored in 'devname'. */
4076 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4077 } else if (!strcmp(devname, netdev_name)) {
/* These counters are not among the scanned values visible above, so they
 * are set to UINT64_MAX, presumably to mark them as unavailable. */
4078 stats->rx_length_errors = UINT64_MAX;
4079 stats->rx_over_errors = UINT64_MAX;
4080 stats->rx_crc_errors = UINT64_MAX;
4081 stats->rx_missed_errors = UINT64_MAX;
4082 stats->tx_aborted_errors = UINT64_MAX;
4083 stats->tx_heartbeat_errors = UINT64_MAX;
4084 stats->tx_window_errors = UINT64_MAX;
/* Logged when no line matched 'netdev_name'.  NOTE(review): the control
 * flow leading here is not visible in this excerpt. */
4090 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Queries the interface flags of 'netdev' with the SIOCGIFFLAGS ioctl and
 * stores them in '*flags'. */
4096 get_flags(const struct netdev *netdev, int *flags)
4101 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
/* NOTE(review): '*flags' is stored with no visible check of 'error'.  If the
 * ioctl failed, 'ifr.ifr_flags' may not hold meaningful data, so callers
 * should look at the return value before trusting '*flags' -- confirm. */
4103 *flags = ifr.ifr_flags;
/* Sets the interface flags of 'netdev' to 'flags' with the SIOCSIFFLAGS
 * ioctl and returns whatever netdev_linux_do_ioctl() returns. */
4108 set_flags(struct netdev *netdev, int flags)
4112 ifr.ifr_flags = flags;
4113 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
/* Asks the kernel for the interface index of 'netdev_name' with the
 * SIOCGIFINDEX ioctl on 'af_inet_sock' and returns it on success.
 * NOTE(review): the value returned on failure is not visible in this
 * excerpt. */
4118 do_get_ifindex(const char *netdev_name)
/* The copy is bounded by the size of 'ifr.ifr_name'. */
4122 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4123 COVERAGE_INC(netdev_get_ifindex);
4124 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4125 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4126 netdev_name, strerror(errno));
4129 return ifr.ifr_ifindex;
/* Stores the interface index of 'netdev_' in '*ifindexp'.  The index is
 * cached in the underlying netdev_dev_linux, and the kernel is queried only
 * while the VALID_IFINDEX bit of 'cache_valid' is clear. */
4133 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4135 struct netdev_dev_linux *netdev_dev =
4136 netdev_dev_linux_cast(netdev_get_dev(netdev_));
4138 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4139 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
/* NOTE(review): handling of a failed lookup is not visible in this excerpt. */
/* Record the result and mark it valid so that later calls skip the lookup. */
4143 netdev_dev->cache_valid |= VALID_IFINDEX;
4144 netdev_dev->ifindex = ifindex;
4146 *ifindexp = netdev_dev->ifindex;
/* Retrieves the hardware address of 'netdev_name' with the SIOCGIFHWADDR
 * ioctl on 'af_inet_sock' and copies ETH_ADDR_LEN bytes of it into 'ea'. */
4151 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
/* Zero the whole request before filling in the interface name. */
4156 memset(&ifr, 0, sizeof ifr);
4157 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4158 COVERAGE_INC(netdev_get_hwaddr);
4159 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4160 /* ENODEV probably means that a vif disappeared asynchronously and
4161 * hasn't been removed from the database yet, so reduce the log level
4162 * to INFO for that case. */
4163 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4164 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4165 netdev_name, strerror(errno));
/* The address family reported by the kernel is only used for the warning
 * below.  NOTE(review): no early return is visible after the warning, so the
 * copy into 'ea' appears to happen for any family -- confirm. */
4168 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4169 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4170 VLOG_WARN("%s device has unknown hardware address family %d",
4171 netdev_name, hwaddr_family);
4173 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware address of 'netdev_name' to 'mac' with the SIOCSIFHWADDR
 * ioctl on 'af_inet_sock'.  'hwaddr_family' is passed to the kernel as the
 * address family of the new address.  Failures are logged at error level. */
4178 set_etheraddr(const char *netdev_name, int hwaddr_family,
4179 const uint8_t mac[ETH_ADDR_LEN])
/* Zero the whole request, then fill in name, family and address bytes. */
4183 memset(&ifr, 0, sizeof ifr);
4184 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4185 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4186 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4187 COVERAGE_INC(netdev_set_hwaddr);
4188 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4189 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4190 netdev_name, strerror(errno));
/* Issues a SIOCETHTOOL ioctl on 'af_inet_sock' for the device called 'name',
 * with 'ecmd' as the data buffer handed to the kernel.  'cmd_name' appears
 * only in the warning logged on failure.
 * NOTE(review): no use of 'cmd' is visible in this excerpt; presumably it is
 * stored into 'ecmd' before the ioctl -- confirm. */
4197 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4198 int cmd, const char *cmd_name)
4202 memset(&ifr, 0, sizeof ifr);
4203 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4204 ifr.ifr_data = (caddr_t) ecmd;
4207 COVERAGE_INC(netdev_ethtool);
/* An ioctl return value of 0 is the success case. */
4208 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
/* Every errno except EOPNOTSUPP is logged, rate-limited through 'rl'. */
4211 if (errno != EOPNOTSUPP) {
4212 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4213 "failed: %s", cmd_name, name, strerror(errno));
4215 /* The device doesn't support this operation. That's pretty
4216 * common, so there's no point in logging anything. */
4222 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4223 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
/* 'flag_name' is used only in the warning logged when the change does not
 * take effect.  The sequence is: read the flags, write the modified value,
 * read the flags again, and compare. */
4225 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4226 const char *flag_name, bool enable)
4228 const char *netdev_name = netdev_get_name(netdev);
4229 struct ethtool_value evalue;
/* Step 1: fetch the current flags into 'evalue.data'. */
4233 memset(&evalue, 0, sizeof evalue);
4234 error = netdev_linux_do_ethtool(netdev_name,
4235 (struct ethtool_cmd *)&evalue,
4236 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* Step 2: clear 'flag', set it again only if 'enable', and keep a copy of
 * the intended value in 'new_flags' for the comparison in step 3. */
4241 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4242 error = netdev_linux_do_ethtool(netdev_name,
4243 (struct ethtool_cmd *)&evalue,
4244 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
/* Step 3: read the flags back and compare them with the intended value. */
4249 memset(&evalue, 0, sizeof evalue);
4250 error = netdev_linux_do_ethtool(netdev_name,
4251 (struct ethtool_cmd *)&evalue,
4252 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4257 if (new_flags != evalue.data) {
4258 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4259 "device %s failed", enable ? "enable" : "disable",
4260 flag_name, netdev_name);
/* Copies 'name' into 'ifr->ifr_name' and issues ioctl 'cmd' on 'af_inet_sock'
 * with 'ifr' as the argument.  Only the name field of 'ifr' is written here;
 * the other fields are passed as the caller left them.  'cmd_name' is used in
 * the debug-level message logged when the ioctl returns -1. */
4268 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4269 const char *cmd_name)
4271 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4272 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4273 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
/* Runs the ioctl 'cmd' for 'netdev' through netdev_linux_do_ioctl() and
 * stores the IPv4 address found in the request's address field in '*ip'. */
4281 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4282 int cmd, const char *cmd_name)
/* Mark the address field of the request as IPv4 before the ioctl. */
4287 ifr.ifr_addr.sa_family = AF_INET;
4288 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
/* NOTE(review): the condition guarding the two lines below is not visible in
 * this excerpt; presumably they run only when 'error' is 0 -- confirm. */
4290 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4291 *ip = sin->sin_addr;
4296 /* Returns an AF_PACKET raw socket or a negative errno value. */
4298 af_packet_sock(void)
4300 static int sock = INT_MIN;
4302 if (sock == INT_MIN) {
4303 sock = socket(AF_PACKET, SOCK_RAW, 0);
4305 set_nonblocking(sock);
4308 VLOG_ERR("failed to create packet socket: %s", strerror(errno));