2 * Copyright (c) 2009, 2010, 2011, 2012 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
24 #include <arpa/inet.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
30 #include <linux/types.h>
31 #include <linux/ethtool.h>
32 #include <linux/mii.h>
33 #include <linux/pkt_cls.h>
34 #include <linux/pkt_sched.h>
35 #include <linux/rtnetlink.h>
36 #include <linux/sockios.h>
37 #include <linux/version.h>
38 #include <sys/types.h>
39 #include <sys/ioctl.h>
40 #include <sys/socket.h>
41 #include <netpacket/packet.h>
43 #include <net/if_arp.h>
44 #include <net/if_packet.h>
45 #include <net/route.h>
46 #include <netinet/in.h>
53 #include "dpif-linux.h"
54 #include "dynamic-string.h"
55 #include "fatal-signal.h"
58 #include "netdev-provider.h"
59 #include "netdev-vport.h"
61 #include "netlink-notifier.h"
62 #include "netlink-socket.h"
64 #include "openflow/openflow.h"
66 #include "poll-loop.h"
67 #include "rtnetlink-link.h"
68 #include "socket-util.h"
74 VLOG_DEFINE_THIS_MODULE(netdev_linux);
76 COVERAGE_DEFINE(netdev_set_policing);
77 COVERAGE_DEFINE(netdev_arp_lookup);
78 COVERAGE_DEFINE(netdev_get_ifindex);
79 COVERAGE_DEFINE(netdev_get_hwaddr);
80 COVERAGE_DEFINE(netdev_set_hwaddr);
81 COVERAGE_DEFINE(netdev_ethtool);
84 /* These were introduced in Linux 2.6.14, so they might be missing if we have
85 * old headers. */
86 #ifndef ADVERTISED_Pause
87 #define ADVERTISED_Pause (1 << 13)
89 #ifndef ADVERTISED_Asym_Pause
90 #define ADVERTISED_Asym_Pause (1 << 14)
93 /* These were introduced in Linux 2.6.24, so they might be missing if we
94 * have old headers. */
95 #ifndef ETHTOOL_GFLAGS
96 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
98 #ifndef ETHTOOL_SFLAGS
99 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
102 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
103 * headers. */
105 #define TC_RTAB_SIZE 1024
108 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
109 static int cache_notifier_refcount;
112 VALID_IFINDEX = 1 << 0,
113 VALID_ETHERADDR = 1 << 1,
117 VALID_POLICING = 1 << 5,
118 VALID_VPORT_STAT_ERROR = 1 << 6,
119 VALID_DRVINFO = 1 << 7,
127 /* Traffic control. */
129 /* An instance of a traffic control class. Always associated with a particular
132 * Each TC implementation subclasses this with whatever additional data it
135 const struct tc_ops *ops;
136 struct hmap queues; /* Contains "struct tc_queue"s.
137 * Read by generic TC layer.
138 * Written only by TC implementation. */
141 /* One traffic control queue.
143 * Each TC implementation subclasses this with whatever additional data it
146 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
147 unsigned int queue_id; /* OpenFlow queue ID. */
150 /* A particular kind of traffic control. Each implementation generally maps to
151 * one particular Linux qdisc class.
153 * The functions below return 0 if successful or a positive errno value on
154 * failure, except where otherwise noted. All of them must be provided, except
155 * where otherwise noted. */
157 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
158 * This is null for tc_ops_default and tc_ops_other, for which there are no
159 * appropriate values. */
160 const char *linux_name;
162 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
163 const char *ovs_name;
165 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
166 * queues. The queues are numbered 0 through n_queues - 1. */
167 unsigned int n_queues;
169 /* Called to install this TC class on 'netdev'. The implementation should
170 * make the Netlink calls required to set up 'netdev' with the right qdisc
171 * and configure it according to 'details'. The implementation may assume
172 * that the current qdisc is the default; that is, there is no need for it
173 * to delete the current qdisc before installing itself.
175 * The contents of 'details' should be documented as valid for 'ovs_name'
176 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
177 * (which is built as ovs-vswitchd.conf.db(8)).
179 * This function must return 0 if and only if it sets 'netdev->tc' to an
180 * initialized 'struct tc'.
182 * (This function is null for tc_ops_other, which cannot be installed. For
183 * other TC classes it should always be nonnull.) */
184 int (*tc_install)(struct netdev *netdev, const struct shash *details);
186 /* Called when the netdev code determines (through a Netlink query) that
187 * this TC class's qdisc is installed on 'netdev', but we didn't install
188 * it ourselves and so don't know any of the details.
190 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
191 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
192 * implementation should parse the other attributes of 'nlmsg' as
193 * necessary to determine its configuration. If necessary it should also
194 * use Netlink queries to determine the configuration of queues on
197 * This function must return 0 if and only if it sets 'netdev->tc' to an
198 * initialized 'struct tc'. */
199 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
201 /* Destroys the data structures allocated by the implementation as part of
202 * 'tc'. (This includes destroying 'tc->queues' by calling
205 * The implementation should not need to perform any Netlink calls. If
206 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
207 * (But it may not be desirable.)
209 * This function may be null if 'tc' is trivial. */
210 void (*tc_destroy)(struct tc *tc);
212 /* Retrieves details of 'netdev->tc' configuration into 'details'.
214 * The implementation should not need to perform any Netlink calls, because
215 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
216 * cached the configuration.
218 * The contents of 'details' should be documented as valid for 'ovs_name'
219 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
220 * (which is built as ovs-vswitchd.conf.db(8)).
222 * This function may be null if 'tc' is not configurable.
224 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
226 /* Reconfigures 'netdev->tc' according to 'details', performing any
227 * required Netlink calls to complete the reconfiguration.
229 * The contents of 'details' should be documented as valid for 'ovs_name'
230 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
231 * (which is built as ovs-vswitchd.conf.db(8)).
233 * This function may be null if 'tc' is not configurable.
235 int (*qdisc_set)(struct netdev *, const struct shash *details);
237 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
238 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
240 * The contents of 'details' should be documented as valid for 'ovs_name'
241 * in the "other_config" column in the "Queue" table in
242 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
244 * The implementation should not need to perform any Netlink calls, because
245 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
246 * cached the queue configuration.
248 * This function may be null if 'tc' does not have queues ('n_queues' is
250 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
251 struct shash *details);
253 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
254 * 'details', performing any required Netlink calls to complete the
255 * reconfiguration. The caller ensures that 'queue_id' is less than
258 * The contents of 'details' should be documented as valid for 'ovs_name'
259 * in the "other_config" column in the "Queue" table in
260 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
262 * This function may be null if 'tc' does not have queues or its queues are
263 * not configurable. */
264 int (*class_set)(struct netdev *, unsigned int queue_id,
265 const struct shash *details);
267 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
268 * tc_queue's within 'netdev->tc->queues'.
270 * This function may be null if 'tc' does not have queues or its queues
271 * cannot be deleted. */
272 int (*class_delete)(struct netdev *, struct tc_queue *queue);
274 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
275 * 'struct tc_queue's within 'netdev->tc->queues'.
277 * On success, initializes '*stats'.
279 * This function may be null if 'tc' does not have queues or if it cannot
280 * report queue statistics. */
281 int (*class_get_stats)(const struct netdev *netdev,
282 const struct tc_queue *queue,
283 struct netdev_queue_stats *stats);
285 /* Extracts queue stats from 'nlmsg', which is a response to a
286 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
288 * This function may be null if 'tc' does not have queues or if it cannot
289 * report queue statistics. */
290 int (*class_dump_stats)(const struct netdev *netdev,
291 const struct ofpbuf *nlmsg,
292 netdev_dump_queue_stats_cb *cb, void *aux);
296 tc_init(struct tc *tc, const struct tc_ops *ops)
299 hmap_init(&tc->queues);
303 tc_destroy(struct tc *tc)
305 hmap_destroy(&tc->queues);
308 static const struct tc_ops tc_ops_htb;
309 static const struct tc_ops tc_ops_hfsc;
310 static const struct tc_ops tc_ops_default;
311 static const struct tc_ops tc_ops_other;
313 static const struct tc_ops *tcs[] = {
314 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
315 &tc_ops_hfsc, /* Hierarchical fair service curve. */
316 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
317 &tc_ops_other, /* Some other qdisc. */
321 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
322 static unsigned int tc_get_major(unsigned int handle);
323 static unsigned int tc_get_minor(unsigned int handle);
325 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
326 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
327 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
329 static struct tcmsg *tc_make_request(const struct netdev *, int type,
330 unsigned int flags, struct ofpbuf *);
331 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
332 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
333 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
336 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
337 struct nlattr **options);
338 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
339 struct nlattr **options,
340 struct netdev_queue_stats *);
341 static int tc_query_class(const struct netdev *,
342 unsigned int handle, unsigned int parent,
343 struct ofpbuf **replyp);
344 static int tc_delete_class(const struct netdev *, unsigned int handle);
346 static int tc_del_qdisc(struct netdev *netdev);
347 static int tc_query_qdisc(const struct netdev *netdev);
349 static int tc_calc_cell_log(unsigned int mtu);
350 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
351 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
352 const struct tc_ratespec *rate);
353 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
355 struct netdev_dev_linux {
356 struct netdev_dev netdev_dev;
358 struct shash_node *shash_node;
359 unsigned int cache_valid;
360 unsigned int change_seq;
362 bool miimon; /* Link status of last poll. */
363 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
364 struct timer miimon_timer;
366 /* The following are figured out "on demand" only. They are only valid
367 * when the corresponding VALID_* bit in 'cache_valid' is set. */
369 uint8_t etheraddr[ETH_ADDR_LEN];
370 struct in_addr address, netmask;
373 unsigned int ifi_flags;
374 long long int carrier_resets;
375 uint32_t kbits_rate; /* Policing data. */
376 uint32_t kbits_burst;
377 int vport_stats_error; /* Cached error code from vport_get_stats().
378 0 or an errno value. */
379 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
380 int ether_addr_error; /* Cached error code from set/get etheraddr. */
381 int netdev_policing_error; /* Cached error code from set policing. */
383 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
387 struct tap_state tap;
391 struct netdev_linux {
392 struct netdev netdev;
396 /* Sockets used for ioctl operations. */
397 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
399 /* A Netlink routing socket that is not subscribed to any multicast groups. */
400 static struct nl_sock *rtnl_sock;
402 /* This is set pretty low because we probably won't learn anything from the
403 * additional log messages. */
404 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
406 static int netdev_linux_init(void);
408 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
409 int cmd, const char *cmd_name);
410 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
411 const char *cmd_name);
412 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
413 int cmd, const char *cmd_name);
414 static int get_flags(const struct netdev_dev *, unsigned int *flags);
415 static int set_flags(struct netdev *, unsigned int flags);
416 static int do_get_ifindex(const char *netdev_name);
417 static int get_ifindex(const struct netdev *, int *ifindexp);
418 static int do_set_addr(struct netdev *netdev,
419 int ioctl_nr, const char *ioctl_name,
420 struct in_addr addr);
421 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
422 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
423 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
424 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
425 static int af_packet_sock(void);
426 static void netdev_linux_miimon_run(void);
427 static void netdev_linux_miimon_wait(void);
430 is_netdev_linux_class(const struct netdev_class *netdev_class)
432 return netdev_class->init == netdev_linux_init;
435 static struct netdev_dev_linux *
436 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
438 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
439 assert(is_netdev_linux_class(netdev_class));
441 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
444 static struct netdev_linux *
445 netdev_linux_cast(const struct netdev *netdev)
447 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
448 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
449 assert(is_netdev_linux_class(netdev_class));
451 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
455 netdev_linux_init(void)
457 static int status = -1;
459 /* Create AF_INET socket. */
460 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
461 status = af_inet_sock >= 0 ? 0 : errno;
463 VLOG_ERR("failed to create inet socket: %s", strerror(status));
466 /* Create rtnetlink socket. */
468 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
470 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
479 netdev_linux_run(void)
481 rtnetlink_link_run();
482 netdev_linux_miimon_run();
486 netdev_linux_wait(void)
488 rtnetlink_link_wait();
489 netdev_linux_miimon_wait();
493 netdev_linux_get_drvinfo(struct netdev_dev_linux *netdev_dev)
498 if (netdev_dev->cache_valid & VALID_DRVINFO) {
502 memset(&netdev_dev->drvinfo, 0, sizeof netdev_dev->drvinfo);
503 error = netdev_linux_do_ethtool(netdev_dev->netdev_dev.name,
504 (struct ethtool_cmd *)&netdev_dev->drvinfo,
508 netdev_dev->cache_valid |= VALID_DRVINFO;
514 netdev_dev_linux_changed(struct netdev_dev_linux *dev,
515 unsigned int ifi_flags,
519 if (!dev->change_seq) {
523 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
524 dev->carrier_resets++;
526 dev->ifi_flags = ifi_flags;
528 dev->cache_valid &= mask;
532 netdev_dev_linux_update(struct netdev_dev_linux *dev,
533 const struct rtnetlink_link_change *change)
535 if (change->nlmsg_type == RTM_NEWLINK) {
537 netdev_dev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
540 dev->mtu = change->mtu;
541 dev->cache_valid |= VALID_MTU;
542 dev->netdev_mtu_error = 0;
545 if (!eth_addr_is_zero(change->addr)) {
546 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
547 dev->cache_valid |= VALID_ETHERADDR;
548 dev->ether_addr_error = 0;
552 netdev_dev_linux_changed(dev, change->ifi_flags, 0);
557 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
558 void *aux OVS_UNUSED)
560 struct netdev_dev_linux *dev;
562 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
564 const struct netdev_class *netdev_class =
565 netdev_dev_get_class(base_dev);
567 if (is_netdev_linux_class(netdev_class)) {
568 dev = netdev_dev_linux_cast(base_dev);
569 netdev_dev_linux_update(dev, change);
573 struct shash device_shash;
574 struct shash_node *node;
576 shash_init(&device_shash);
577 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
578 SHASH_FOR_EACH (node, &device_shash) {
583 get_flags(&dev->netdev_dev, &flags);
584 netdev_dev_linux_changed(dev, flags, 0);
586 shash_destroy(&device_shash);
591 cache_notifier_ref(void)
593 if (!cache_notifier_refcount) {
594 assert(!netdev_linux_cache_notifier);
596 netdev_linux_cache_notifier =
597 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
599 if (!netdev_linux_cache_notifier) {
603 cache_notifier_refcount++;
609 cache_notifier_unref(void)
611 assert(cache_notifier_refcount > 0);
612 if (!--cache_notifier_refcount) {
613 assert(netdev_linux_cache_notifier);
614 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
615 netdev_linux_cache_notifier = NULL;
619 /* Creates system and internal devices. */
621 netdev_linux_create(const struct netdev_class *class, const char *name,
622 struct netdev_dev **netdev_devp)
624 struct netdev_dev_linux *netdev_dev;
627 error = cache_notifier_ref();
632 netdev_dev = xzalloc(sizeof *netdev_dev);
633 netdev_dev->change_seq = 1;
634 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
635 get_flags(&netdev_dev->netdev_dev, &netdev_dev->ifi_flags);
637 *netdev_devp = &netdev_dev->netdev_dev;
641 /* For most types of netdevs we open the device for each call of
642 * netdev_open(). However, this is not the case with tap devices,
643 * since it is only possible to open the device once. In this
644 * situation we share a single file descriptor, and consequently
645 * buffers, across all readers. Therefore once data is read it will
646 * be unavailable to other reads for tap devices. */
648 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
649 const char *name, struct netdev_dev **netdev_devp)
651 struct netdev_dev_linux *netdev_dev;
652 struct tap_state *state;
653 static const char tap_dev[] = "/dev/net/tun";
657 netdev_dev = xzalloc(sizeof *netdev_dev);
658 state = &netdev_dev->state.tap;
660 error = cache_notifier_ref();
665 /* Open tap device. */
666 state->fd = open(tap_dev, O_RDWR);
669 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
670 goto error_unref_notifier;
673 /* Create tap device. */
674 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
675 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
676 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
677 VLOG_WARN("%s: creating tap device failed: %s", name,
680 goto error_unref_notifier;
683 /* Make non-blocking. */
684 error = set_nonblocking(state->fd);
686 goto error_unref_notifier;
689 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
690 *netdev_devp = &netdev_dev->netdev_dev;
693 error_unref_notifier:
694 cache_notifier_unref();
701 destroy_tap(struct netdev_dev_linux *netdev_dev)
703 struct tap_state *state = &netdev_dev->state.tap;
705 if (state->fd >= 0) {
710 /* Destroys the netdev device 'netdev_dev_'. */
712 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
714 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
715 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
717 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
718 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
721 if (class == &netdev_tap_class) {
722 destroy_tap(netdev_dev);
726 cache_notifier_unref();
730 netdev_linux_open(struct netdev_dev *netdev_dev_, struct netdev **netdevp)
732 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
733 struct netdev_linux *netdev;
734 enum netdev_flags flags;
737 /* Allocate network device. */
738 netdev = xzalloc(sizeof *netdev);
740 netdev_init(&netdev->netdev, netdev_dev_);
742 /* Verify that the device really exists, by attempting to read its flags.
743 * (The flags might be cached, in which case this won't actually do an
746 * Don't do this for "internal" netdevs, though, because those have to be
747 * created as netdev objects before they exist in the kernel, because
748 * creating them in the kernel happens by passing a netdev object to
749 * dpif_port_add(). */
750 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
751 error = netdev_get_flags(&netdev->netdev, &flags);
752 if (error == ENODEV) {
757 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
758 !netdev_dev->state.tap.opened) {
760 /* We assume that the first user of the tap device is the primary user
761 * and give them the tap FD. Subsequent users probably just expect
762 * this to be a system device so open it normally to avoid send/receive
763 * directions appearing to be reversed. */
764 netdev->fd = netdev_dev->state.tap.fd;
765 netdev_dev->state.tap.opened = true;
768 *netdevp = &netdev->netdev;
772 netdev_uninit(&netdev->netdev, true);
776 /* Closes and destroys 'netdev'. */
778 netdev_linux_close(struct netdev *netdev_)
780 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* A negative 'fd' is the "no descriptor" marker used by every other fd check
 * in this file, and descriptor 0 is a valid descriptor, so test ">= 0" rather
 * than "> 0".  Devices of type "tap" are excluded: their descriptor is the one
 * stored in the netdev_dev's tap state, which destroy_tap() handles. */
782 if (netdev->fd >= 0 && strcmp(netdev_get_type(netdev_), "tap")) {
789 netdev_linux_listen(struct netdev *netdev_)
791 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
792 struct sockaddr_ll sll;
797 if (netdev->fd >= 0) {
801 /* Create file descriptor. */
802 fd = socket(PF_PACKET, SOCK_RAW, 0);
805 VLOG_ERR("failed to create raw socket (%s)", strerror(error));
809 /* Set non-blocking mode. */
810 error = set_nonblocking(fd);
815 /* Get ethernet device index. */
816 error = get_ifindex(&netdev->netdev, &ifindex);
821 /* Bind to specific ethernet device. */
822 memset(&sll, 0, sizeof sll);
823 sll.sll_family = AF_PACKET;
824 sll.sll_ifindex = ifindex;
825 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
826 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
828 VLOG_ERR("%s: failed to bind raw socket (%s)",
829 netdev_get_name(netdev_), strerror(error));
/* Receives a packet from 'netdev_' into the 'size' bytes at 'data'.
 *
 * Devices of the tap class are read with read(); all others use recv() with
 * MSG_TRUNC, so a packet longer than 'size' is reported as -EMSGSIZE.  On
 * success the number of bytes received is returned.  Failures other than
 * EINTR and EAGAIN are logged through the rate-limited logger. */
844 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
846 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
848 if (netdev->fd < 0) {
849 /* Device is not listening. */
856 retval = (netdev_->netdev_dev->netdev_class == &netdev_tap_class
857 ? read(netdev->fd, data, size)
858 : recv(netdev->fd, data, size, MSG_TRUNC));
860 return retval <= size ? retval : -EMSGSIZE;
861 } else if (errno != EINTR) {
862 if (errno != EAGAIN) {
/* Argument order must follow the format string: device name first, then the
 * error text. */
863 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
864 netdev_get_name(netdev_), strerror(errno));
871 /* Registers with the poll loop to wake up from the next call to poll_block()
872 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
874 netdev_linux_recv_wait(struct netdev *netdev_)
876 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
877 if (netdev->fd >= 0) {
878 poll_fd_wait(netdev->fd, POLLIN);
882 /* Discards all packets waiting to be received from 'netdev'. */
884 netdev_linux_drain(struct netdev *netdev_)
886 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
887 if (netdev->fd < 0) {
889 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
891 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
892 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
896 drain_fd(netdev->fd, ifr.ifr_qlen);
899 return drain_rcvbuf(netdev->fd);
903 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
904 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
905 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
906 * the packet is too big or too small to transmit on the device.
908 * The caller retains ownership of 'buffer' in all cases.
910 * The kernel maintains a packet transmission queue, so the caller is not
911 * expected to do additional queuing of packets. */
913 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
915 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
919 if (netdev->fd < 0) {
920 /* Use our AF_PACKET socket to send to this device. */
921 struct sockaddr_ll sll;
928 sock = af_packet_sock();
933 error = get_ifindex(netdev_, &ifindex);
938 /* We don't bother setting most fields in sockaddr_ll because the
939 * kernel ignores them for SOCK_RAW. */
940 memset(&sll, 0, sizeof sll);
941 sll.sll_family = AF_PACKET;
942 sll.sll_ifindex = ifindex;
944 iov.iov_base = (void *) data;
948 msg.msg_namelen = sizeof sll;
951 msg.msg_control = NULL;
952 msg.msg_controllen = 0;
955 retval = sendmsg(sock, &msg, 0);
957 /* Use the netdev's own fd to send to this device. This is
958 * essential for tap devices, because packets sent to a tap device
959 * with an AF_PACKET socket will loop back to be *received* again
960 * on the tap device. */
961 retval = write(netdev->fd, data, size);
965 /* The Linux AF_PACKET implementation never blocks waiting for room
966 * for packets, instead returning ENOBUFS. Translate this into
967 * EAGAIN for the caller. */
968 if (errno == ENOBUFS) {
970 } else if (errno == EINTR) {
972 } else if (errno != EAGAIN) {
973 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
974 netdev_get_name(netdev_), strerror(errno));
977 } else if (retval != size) {
978 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
979 "%zu) on %s", retval, size, netdev_get_name(netdev_));
987 /* Registers with the poll loop to wake up from the next call to poll_block()
988 * when the packet transmission queue has sufficient room to transmit a packet
989 * with netdev_send().
991 * The kernel maintains a packet transmission queue, so the client is not
992 * expected to do additional queuing of packets. Thus, this function is
993 * unlikely to ever be used. It is included for completeness. */
995 netdev_linux_send_wait(struct netdev *netdev_)
997 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
998 if (netdev->fd < 0) {
1000 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
1001 poll_fd_wait(netdev->fd, POLLOUT);
1003 /* TAP device always accepts packets.*/
1004 poll_immediate_wake();
1008 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1009 * otherwise a positive errno value. */
1011 netdev_linux_set_etheraddr(struct netdev *netdev_,
1012 const uint8_t mac[ETH_ADDR_LEN])
1014 struct netdev_dev_linux *netdev_dev =
1015 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1018 if (netdev_dev->cache_valid & VALID_ETHERADDR) {
1019 if (netdev_dev->ether_addr_error) {
1020 return netdev_dev->ether_addr_error;
1022 if (eth_addr_equals(netdev_dev->etheraddr, mac)) {
1025 netdev_dev->cache_valid &= ~VALID_ETHERADDR;
1028 error = set_etheraddr(netdev_get_name(netdev_), mac);
1029 if (!error || error == ENODEV) {
1030 netdev_dev->ether_addr_error = error;
1031 netdev_dev->cache_valid |= VALID_ETHERADDR;
1033 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
1040 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1042 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1043 uint8_t mac[ETH_ADDR_LEN])
1045 struct netdev_dev_linux *netdev_dev =
1046 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1048 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
1049 int error = get_etheraddr(netdev_get_name(netdev_),
1050 netdev_dev->etheraddr);
1052 netdev_dev->ether_addr_error = error;
1053 netdev_dev->cache_valid |= VALID_ETHERADDR;
1056 if (!netdev_dev->ether_addr_error) {
1057 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
1060 return netdev_dev->ether_addr_error;
1063 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1064 * in bytes, not including the hardware header; thus, this is typically 1500
1065 * bytes for Ethernet devices. */
1067 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1069 struct netdev_dev_linux *netdev_dev =
1070 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1071 if (!(netdev_dev->cache_valid & VALID_MTU)) {
1075 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1076 SIOCGIFMTU, "SIOCGIFMTU");
1078 netdev_dev->netdev_mtu_error = error;
1079 netdev_dev->mtu = ifr.ifr_mtu;
1080 netdev_dev->cache_valid |= VALID_MTU;
1083 if (!netdev_dev->netdev_mtu_error) {
1084 *mtup = netdev_dev->mtu;
1086 return netdev_dev->netdev_mtu_error;
1089 /* Sets the maximum transmission unit (MTU) of the given device using the Linux
1090 * networking ioctl interface.
1093 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1095 struct netdev_dev_linux *netdev_dev =
1096 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1100 if (netdev_dev->cache_valid & VALID_MTU) {
1101 if (netdev_dev->netdev_mtu_error) {
1102 return netdev_dev->netdev_mtu_error;
1104 if (netdev_dev->mtu == mtu) {
1107 netdev_dev->cache_valid &= ~VALID_MTU;
1110 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1111 SIOCSIFMTU, "SIOCSIFMTU");
1112 if (!error || error == ENODEV) {
1113 netdev_dev->netdev_mtu_error = error;
1114 netdev_dev->mtu = ifr.ifr_mtu;
1115 netdev_dev->cache_valid |= VALID_MTU;
1120 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1121 * On failure, returns a negative errno value. */
1123 netdev_linux_get_ifindex(const struct netdev *netdev)
1127 error = get_ifindex(netdev, &ifindex);
1128 return error ? -error : ifindex;
1132 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1134 struct netdev_dev_linux *netdev_dev =
1135 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1137 if (netdev_dev->miimon_interval > 0) {
1138 *carrier = netdev_dev->miimon;
1140 *carrier = (netdev_dev->ifi_flags & IFF_RUNNING) != 0;
1146 static long long int
1147 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1149 return netdev_dev_linux_cast(netdev_get_dev(netdev))->carrier_resets;
1153 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1154 struct mii_ioctl_data *data)
1159 memset(&ifr, 0, sizeof ifr);
1160 memcpy(&ifr.ifr_data, data, sizeof *data);
1161 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1162 memcpy(data, &ifr.ifr_data, sizeof *data);
/* Determines the link status of the device named 'name' and stores it in
 * '*miimon'.
 *
 * The MII registers are consulted first: SIOCGMIIPHY, then a SIOCGMIIREG read
 * of MII_BMSR whose BMSR_LSTATUS bit gives the link state.  ETHTOOL_GLINK is
 * the fallback when MII cannot be queried. */
1168 netdev_linux_get_miimon(const char *name, bool *miimon)
1170 struct mii_ioctl_data data;
1175 memset(&data, 0, sizeof data);
1176 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1178 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1179 data.reg_num = MII_BMSR;
1180 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1184 *miimon = !!(data.val_out & BMSR_LSTATUS);
1186 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1189 struct ethtool_cmd ecmd;
1191 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1194 memset(&ecmd, 0, sizeof ecmd);
1195 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1198 struct ethtool_value eval;
/* The ETHTOOL_GLINK reply is read back as a struct ethtool_value by copying
 * the leading bytes of 'ecmd'. */
1200 memcpy(&eval, &ecmd, sizeof eval);
1201 *miimon = !!eval.data;
1203 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1211 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1212 long long int interval)
1214 struct netdev_dev_linux *netdev_dev;
1216 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
1218 interval = interval > 0 ? MAX(interval, 100) : 0;
1219 if (netdev_dev->miimon_interval != interval) {
1220 netdev_dev->miimon_interval = interval;
1221 timer_set_expired(&netdev_dev->miimon_timer);
/* Performs one round of MII link monitoring over every device of
 * netdev_linux_class.
 *
 * Only devices with monitoring enabled (miimon_interval > 0) and an expired
 * miimon_timer are of interest.  For those, the link state is queried, a
 * change from the stored state is reported through
 * netdev_dev_linux_changed(), and the timer is re-armed for another
 * miimon_interval. */
1228 netdev_linux_miimon_run(void)
1230 struct shash device_shash;
1231 struct shash_node *node;
1233 shash_init(&device_shash);
1234 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1235 SHASH_FOR_EACH (node, &device_shash) {
1236 struct netdev_dev_linux *dev = node->data;
1239 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1243 netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
1244 if (miimon != dev->miimon) {
1245 dev->miimon = miimon;
1246 netdev_dev_linux_changed(dev, dev->ifi_flags, 0);
1249 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1252 shash_destroy(&device_shash);
/* Registers a timer_wait() on the miimon_timer of every netdev_linux_class
 * device that has MII monitoring enabled (miimon_interval > 0). */
1256 netdev_linux_miimon_wait(void)
1258 struct shash device_shash;
1259 struct shash_node *node;
1261 shash_init(&device_shash);
1262 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1263 SHASH_FOR_EACH (node, &device_shash) {
1264 struct netdev_dev_linux *dev = node->data;
1266 if (dev->miimon_interval > 0) {
1267 timer_wait(&dev->miimon_timer);
1270 shash_destroy(&device_shash);
1273 /* Check whether we can we use RTM_GETLINK to get network device statistics.
1274 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1277 check_for_working_netlink_stats(void)
1279 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1280 * preferable, so if that works, we'll use it. */
1281 int ifindex = do_get_ifindex("lo");
1283 VLOG_WARN("failed to get ifindex for lo, "
1284 "obtaining netdev stats from proc");
1287 struct netdev_stats stats;
1288 int error = get_stats_via_netlink(ifindex, &stats);
1290 VLOG_DBG("obtaining netdev stats via rtnetlink");
1293 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1294 "via proc (you are probably running a pre-2.6.19 "
1295 "kernel)", strerror(error));
/* Exchanges the values stored in '*a' and '*b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    const uint64_t old_a = *a;

    *a = *b;
    *b = old_a;
}
/* Fetches statistics for 'netdev_' from the vport layer into '*stats' and
 * records the outcome in the device.
 *
 * The query is issued unless an earlier failure is already cached, that is,
 * unless 'vport_stats_error' is nonzero while VALID_VPORT_STAT_ERROR is set.
 * The result of netdev_vport_get_stats() is stored in 'vport_stats_error',
 * which callers inspect to learn whether '*stats' was filled in. */
1310 get_stats_via_vport(const struct netdev *netdev_,
1311 struct netdev_stats *stats)
1313 struct netdev_dev_linux *netdev_dev =
1314 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1316 if (!netdev_dev->vport_stats_error ||
1317 !(netdev_dev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1320 error = netdev_vport_get_stats(netdev_, stats);
1322 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1323 "(%s)", netdev_get_name(netdev_), strerror(error));
1325 netdev_dev->vport_stats_error = error;
1326 netdev_dev->cache_valid |= VALID_VPORT_STAT_ERROR;
/* Fetches the kernel's own statistics for 'netdev_' into '*stats'.
 *
 * The first call runs check_for_working_netlink_stats() and remembers the
 * answer in the static 'use_netlink_stats'.  After that, statistics come from
 * get_stats_via_netlink() (keyed by ifindex) when that check succeeded, and
 * from get_stats_via_proc() (keyed by device name) otherwise. */
1331 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1332 struct netdev_stats *stats)
1334 static int use_netlink_stats = -1;
1337 if (use_netlink_stats < 0) {
1338 use_netlink_stats = check_for_working_netlink_stats();
1341 if (use_netlink_stats) {
1344 error = get_ifindex(netdev_, &ifindex);
1346 error = get_stats_via_netlink(ifindex, stats);
1349 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1353 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1354 netdev_get_name(netdev_), error);
/* Combines two sources of statistics for 'netdev_': the vport layer, fetched
 * straight into '*stats', and the kernel, fetched into a local 'dev_stats'.
 *
 * If the vport layer had no statistics ('vport_stats_error' is nonzero), the
 * kernel's figures are used.  Otherwise the vport figures are kept and the
 * kernel's error, drop, multicast and collision counters are added to them. */
1360 /* Retrieves current device stats for 'netdev-linux'. */
1362 netdev_linux_get_stats(const struct netdev *netdev_,
1363 struct netdev_stats *stats)
1365 struct netdev_dev_linux *netdev_dev =
1366 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1367 struct netdev_stats dev_stats;
1370 get_stats_via_vport(netdev_, stats);
1372 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1375 if (netdev_dev->vport_stats_error) {
1382 if (netdev_dev->vport_stats_error) {
1383 /* stats not available from OVS then use ioctl stats. */
1386 stats->rx_errors += dev_stats.rx_errors;
1387 stats->tx_errors += dev_stats.tx_errors;
1388 stats->rx_dropped += dev_stats.rx_dropped;
1389 stats->tx_dropped += dev_stats.tx_dropped;
1390 stats->multicast += dev_stats.multicast;
1391 stats->collisions += dev_stats.collisions;
1392 stats->rx_length_errors += dev_stats.rx_length_errors;
1393 stats->rx_over_errors += dev_stats.rx_over_errors;
1394 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1395 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1396 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1397 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1398 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1399 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1400 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1401 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1402 stats->tx_window_errors += dev_stats.tx_window_errors;
1407 /* Retrieves current device stats for 'netdev-tap' netdev or
1408 * netdev-internal. */
1410 netdev_tap_get_stats(const struct netdev *netdev_,
1411 struct netdev_stats *stats)
1413 struct netdev_dev_linux *netdev_dev =
1414 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1415 struct netdev_stats dev_stats;
1418 get_stats_via_vport(netdev_, stats);
1420 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1422 if (netdev_dev->vport_stats_error) {
1429 /* If this port is an internal port then the transmit and receive stats
1430 * will appear to be swapped relative to the other ports since we are the
1431 * one sending the data, not a remote computer. For consistency, we swap
1432 * them back here. This does not apply if we are getting stats from the
1433 * vport layer because it always tracks stats from the perspective of the
1435 if (netdev_dev->vport_stats_error) {
1437 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1438 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1439 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1440 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1441 stats->rx_length_errors = 0;
1442 stats->rx_over_errors = 0;
1443 stats->rx_crc_errors = 0;
1444 stats->rx_frame_errors = 0;
1445 stats->rx_fifo_errors = 0;
1446 stats->rx_missed_errors = 0;
1447 stats->tx_aborted_errors = 0;
1448 stats->tx_carrier_errors = 0;
1449 stats->tx_fifo_errors = 0;
1450 stats->tx_heartbeat_errors = 0;
1451 stats->tx_window_errors = 0;
1453 stats->rx_dropped += dev_stats.tx_dropped;
1454 stats->tx_dropped += dev_stats.rx_dropped;
1456 stats->rx_errors += dev_stats.tx_errors;
1457 stats->tx_errors += dev_stats.rx_errors;
1459 stats->multicast += dev_stats.multicast;
1460 stats->collisions += dev_stats.collisions;
1466 netdev_internal_get_stats(const struct netdev *netdev_,
1467 struct netdev_stats *stats)
1469 struct netdev_dev_linux *netdev_dev =
1470 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1472 get_stats_via_vport(netdev_, stats);
1473 return netdev_dev->vport_stats_error;
1476 /* Stores the features supported by 'netdev' into each of '*current',
1477 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1478 * bitmap of NETDEV_* bits. Returns 0 if successful, otherwise a positive
1481 netdev_linux_get_features(const struct netdev *netdev,
1482 enum netdev_features *current,
1483 enum netdev_features *advertised,
1484 enum netdev_features *supported,
1485 enum netdev_features *peer)
1487 struct ethtool_cmd ecmd;
1491 memset(&ecmd, 0, sizeof ecmd);
1492 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1493 ETHTOOL_GSET, "ETHTOOL_GSET");
1498 /* Supported features. */
1500 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1501 *supported |= NETDEV_F_10MB_HD;
1503 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1504 *supported |= NETDEV_F_10MB_FD;
1506 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1507 *supported |= NETDEV_F_100MB_HD;
1509 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1510 *supported |= NETDEV_F_100MB_FD;
1512 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1513 *supported |= NETDEV_F_1GB_HD;
1515 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1516 *supported |= NETDEV_F_1GB_FD;
1518 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1519 *supported |= NETDEV_F_10GB_FD;
1521 if (ecmd.supported & SUPPORTED_TP) {
1522 *supported |= NETDEV_F_COPPER;
1524 if (ecmd.supported & SUPPORTED_FIBRE) {
1525 *supported |= NETDEV_F_FIBER;
1527 if (ecmd.supported & SUPPORTED_Autoneg) {
1528 *supported |= NETDEV_F_AUTONEG;
1530 if (ecmd.supported & SUPPORTED_Pause) {
1531 *supported |= NETDEV_F_PAUSE;
1533 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1534 *supported |= NETDEV_F_PAUSE_ASYM;
1537 /* Advertised features. */
1539 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1540 *advertised |= NETDEV_F_10MB_HD;
1542 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1543 *advertised |= NETDEV_F_10MB_FD;
1545 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1546 *advertised |= NETDEV_F_100MB_HD;
1548 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1549 *advertised |= NETDEV_F_100MB_FD;
1551 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1552 *advertised |= NETDEV_F_1GB_HD;
1554 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1555 *advertised |= NETDEV_F_1GB_FD;
1557 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1558 *advertised |= NETDEV_F_10GB_FD;
1560 if (ecmd.advertising & ADVERTISED_TP) {
1561 *advertised |= NETDEV_F_COPPER;
1563 if (ecmd.advertising & ADVERTISED_FIBRE) {
1564 *advertised |= NETDEV_F_FIBER;
1566 if (ecmd.advertising & ADVERTISED_Autoneg) {
1567 *advertised |= NETDEV_F_AUTONEG;
1569 if (ecmd.advertising & ADVERTISED_Pause) {
1570 *advertised |= NETDEV_F_PAUSE;
1572 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1573 *advertised |= NETDEV_F_PAUSE_ASYM;
1576 /* Current settings. */
1578 if (speed == SPEED_10) {
1579 *current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1580 } else if (speed == SPEED_100) {
1581 *current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1582 } else if (speed == SPEED_1000) {
1583 *current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1584 } else if (speed == SPEED_10000) {
1585 *current = NETDEV_F_10GB_FD;
1586 } else if (speed == 40000) {
1587 *current = NETDEV_F_40GB_FD;
1588 } else if (speed == 100000) {
1589 *current = NETDEV_F_100GB_FD;
1590 } else if (speed == 1000000) {
1591 *current = NETDEV_F_1TB_FD;
1596 if (ecmd.port == PORT_TP) {
1597 *current |= NETDEV_F_COPPER;
1598 } else if (ecmd.port == PORT_FIBRE) {
1599 *current |= NETDEV_F_FIBER;
1603 *current |= NETDEV_F_AUTONEG;
1606 /* Peer advertisements. */
1607 *peer = 0; /* XXX */
/* Reads the current ethtool settings with ETHTOOL_GSET, replaces the
 * 'advertising' mask with the ADVERTISED_* bits that correspond to the
 * NETDEV_F_* bits present in 'advertise', and writes the settings back with
 * ETHTOOL_SSET.  The value returned by that final call is the result. */
1612 /* Set the features advertised by 'netdev' to 'advertise'. */
1614 netdev_linux_set_advertisements(struct netdev *netdev,
1615 enum netdev_features advertise)
1617 struct ethtool_cmd ecmd;
1620 memset(&ecmd, 0, sizeof ecmd);
1621 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1622 ETHTOOL_GSET, "ETHTOOL_GSET");
/* Start from an empty mask so that only the requested bits are advertised. */
1627 ecmd.advertising = 0;
1628 if (advertise & NETDEV_F_10MB_HD) {
1629 ecmd.advertising |= ADVERTISED_10baseT_Half;
1631 if (advertise & NETDEV_F_10MB_FD) {
1632 ecmd.advertising |= ADVERTISED_10baseT_Full;
1634 if (advertise & NETDEV_F_100MB_HD) {
1635 ecmd.advertising |= ADVERTISED_100baseT_Half;
1637 if (advertise & NETDEV_F_100MB_FD) {
1638 ecmd.advertising |= ADVERTISED_100baseT_Full;
1640 if (advertise & NETDEV_F_1GB_HD) {
1641 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1643 if (advertise & NETDEV_F_1GB_FD) {
1644 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1646 if (advertise & NETDEV_F_10GB_FD) {
1647 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1649 if (advertise & NETDEV_F_COPPER) {
1650 ecmd.advertising |= ADVERTISED_TP;
1652 if (advertise & NETDEV_F_FIBER) {
1653 ecmd.advertising |= ADVERTISED_FIBRE;
1655 if (advertise & NETDEV_F_AUTONEG) {
1656 ecmd.advertising |= ADVERTISED_Autoneg;
1658 if (advertise & NETDEV_F_PAUSE) {
1659 ecmd.advertising |= ADVERTISED_Pause;
1661 if (advertise & NETDEV_F_PAUSE_ASYM) {
1662 ecmd.advertising |= ADVERTISED_Asym_Pause;
1664 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1665 ETHTOOL_SSET, "ETHTOOL_SSET");
/* 'kbits_rate' and 'kbits_burst' describe the policer to install.  The burst
 * is normalized first (see below).  A cached result (VALID_POLICING) is
 * reused when it holds an error or when the requested settings equal the
 * cached ones.
 *
 * Otherwise any existing ingress qdisc is removed, a new ingress qdisc and a
 * policer are added, and the settings are cached.  A result of 0 or ENODEV is
 * remembered in 'netdev_policing_error'. */
1668 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1669 * successful, otherwise a positive errno value. */
1671 netdev_linux_set_policing(struct netdev *netdev,
1672 uint32_t kbits_rate, uint32_t kbits_burst)
1674 struct netdev_dev_linux *netdev_dev =
1675 netdev_dev_linux_cast(netdev_get_dev(netdev));
1676 const char *netdev_name = netdev_get_name(netdev);
1680 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1681 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1682 : kbits_burst); /* Stick with user-specified value. */
1684 if (netdev_dev->cache_valid & VALID_POLICING) {
1685 if (netdev_dev->netdev_policing_error) {
1686 return netdev_dev->netdev_policing_error;
1689 if (netdev_dev->kbits_rate == kbits_rate &&
1690 netdev_dev->kbits_burst == kbits_burst) {
1691 /* Assume that settings haven't changed since we last set them. */
1694 netdev_dev->cache_valid &= ~VALID_POLICING;
1697 COVERAGE_INC(netdev_set_policing);
1698 /* Remove any existing ingress qdisc. */
1699 error = tc_add_del_ingress_qdisc(netdev, false);
1701 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1702 netdev_name, strerror(error));
1707 error = tc_add_del_ingress_qdisc(netdev, true);
1709 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1710 netdev_name, strerror(error));
1714 error = tc_add_policer(netdev, kbits_rate, kbits_burst);
1716 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1717 netdev_name, strerror(error));
1722 netdev_dev->kbits_rate = kbits_rate;
1723 netdev_dev->kbits_burst = kbits_burst;
/* Only a definite outcome (success, or the device being absent) is cached. */
1726 if (!error || error == ENODEV) {
1727 netdev_dev->netdev_policing_error = error;
1728 netdev_dev->cache_valid |= VALID_POLICING;
/* Adds to 'types' the OVS name of every entry in the null-terminated 'tcs'
 * table that can be installed (has a 'tc_install' callback) and has a
 * nonempty 'ovs_name'. */
1734 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1737 const struct tc_ops **opsp;
1739 for (opsp = tcs; *opsp != NULL; opsp++) {
1740 const struct tc_ops *ops = *opsp;
1741 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1742 sset_add(types, ops->ovs_name);
1748 static const struct tc_ops *
1749 tc_lookup_ovs_name(const char *name)
1751 const struct tc_ops **opsp;
1753 for (opsp = tcs; *opsp != NULL; opsp++) {
1754 const struct tc_ops *ops = *opsp;
1755 if (!strcmp(name, ops->ovs_name)) {
1762 static const struct tc_ops *
1763 tc_lookup_linux_name(const char *name)
1765 const struct tc_ops **opsp;
1767 for (opsp = tcs; *opsp != NULL; opsp++) {
1768 const struct tc_ops *ops = *opsp;
1769 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Searches the 'queues' hmap of the traffic-control state attached to
 * 'netdev' for the queue whose 'queue_id' member equals 'queue_id'.  Only the
 * bucket selected by the caller-supplied 'hash' is examined. */
1776 static struct tc_queue *
1777 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1780 struct netdev_dev_linux *netdev_dev =
1781 netdev_dev_linux_cast(netdev_get_dev(netdev));
1782 struct tc_queue *queue;
1784 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1785 if (queue->queue_id == queue_id) {
/* Looks up queue 'queue_id' on 'netdev', hashing the id with
 * hash_int(queue_id, 0) before delegating to tc_find_queue__(). */
1792 static struct tc_queue *
1793 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1795 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Reports the capabilities of the QoS implementation registered under the OVS
 * name 'type': 'caps->n_queues' is set to that implementation's 'n_queues'.
 * 'netdev' itself is not consulted. */
1799 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1801 struct netdev_qos_capabilities *caps)
1803 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1807 caps->n_queues = ops->n_queues;
/* Refreshes the qdisc state of 'netdev' with tc_query_qdisc(), stores the OVS
 * name of the active implementation in '*typep', and, when that
 * implementation provides a 'qdisc_get' callback, uses it to fill in
 * 'details'. */
1812 netdev_linux_get_qos(const struct netdev *netdev,
1813 const char **typep, struct shash *details)
1815 struct netdev_dev_linux *netdev_dev =
1816 netdev_dev_linux_cast(netdev_get_dev(netdev));
1819 error = tc_query_qdisc(netdev);
1824 *typep = netdev_dev->tc->ops->ovs_name;
1825 return (netdev_dev->tc->ops->qdisc_get
1826 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
/* Configures QoS type 'type' with parameters 'details' on 'netdev'.
 *
 * 'type' must name an implementation that has a 'tc_install' callback.  If
 * that implementation is already the active one, it is reconfigured in place
 * through 'qdisc_set' (a missing callback counts as success).  Otherwise the
 * existing qdisc is deleted and the new one installed. */
1831 netdev_linux_set_qos(struct netdev *netdev,
1832 const char *type, const struct shash *details)
1834 struct netdev_dev_linux *netdev_dev =
1835 netdev_dev_linux_cast(netdev_get_dev(netdev));
1836 const struct tc_ops *new_ops;
1839 new_ops = tc_lookup_ovs_name(type);
1840 if (!new_ops || !new_ops->tc_install) {
1844 error = tc_query_qdisc(netdev);
1849 if (new_ops == netdev_dev->tc->ops) {
1850 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1852 /* Delete existing qdisc. */
1853 error = tc_del_qdisc(netdev);
/* A successful delete must leave the device without traffic-control state. */
1857 assert(netdev_dev->tc == NULL);
1859 /* Install new qdisc. */
1860 error = new_ops->tc_install(netdev, details);
/* tc_install() is expected to attach state exactly when it succeeds. */
1861 assert((error == 0) == (netdev_dev->tc != NULL));
/* Retrieves the configuration of queue 'queue_id' on 'netdev' into 'details':
 * refreshes the qdisc state with tc_query_qdisc(), finds the queue with
 * tc_find_queue(), and hands it to the active implementation's 'class_get'
 * callback. */
1868 netdev_linux_get_queue(const struct netdev *netdev,
1869 unsigned int queue_id, struct shash *details)
1871 struct netdev_dev_linux *netdev_dev =
1872 netdev_dev_linux_cast(netdev_get_dev(netdev));
1875 error = tc_query_qdisc(netdev);
1879 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1881 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
/* Configures queue 'queue_id' on 'netdev' from 'details' through the active
 * implementation's 'class_set' callback.  The request is not forwarded when
 * 'queue_id' is outside the implementation's 'n_queues' range or when the
 * implementation has no 'class_set' callback. */
1887 netdev_linux_set_queue(struct netdev *netdev,
1888 unsigned int queue_id, const struct shash *details)
1890 struct netdev_dev_linux *netdev_dev =
1891 netdev_dev_linux_cast(netdev_get_dev(netdev));
1894 error = tc_query_qdisc(netdev);
1897 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1898 || !netdev_dev->tc->ops->class_set) {
1902 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
/* Deletes queue 'queue_id' from 'netdev' through the active implementation's
 * 'class_delete' callback, after refreshing the qdisc state and locating the
 * queue with tc_find_queue().  An implementation without 'class_delete' is
 * handled as a separate case. */
1906 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1908 struct netdev_dev_linux *netdev_dev =
1909 netdev_dev_linux_cast(netdev_get_dev(netdev));
1912 error = tc_query_qdisc(netdev);
1915 } else if (!netdev_dev->tc->ops->class_delete) {
1918 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1920 ? netdev_dev->tc->ops->class_delete(netdev, queue)
/* Retrieves statistics for queue 'queue_id' on 'netdev' into '*stats' through
 * the active implementation's 'class_get_stats' callback, after refreshing
 * the qdisc state and locating the queue with tc_find_queue().  An
 * implementation without 'class_get_stats' is handled as a separate case. */
1926 netdev_linux_get_queue_stats(const struct netdev *netdev,
1927 unsigned int queue_id,
1928 struct netdev_queue_stats *stats)
1930 struct netdev_dev_linux *netdev_dev =
1931 netdev_dev_linux_cast(netdev_get_dev(netdev));
1934 error = tc_query_qdisc(netdev);
1937 } else if (!netdev_dev->tc->ops->class_get_stats) {
1940 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1942 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
/* Starts a Netlink dump of the traffic classes on 'netdev': builds an
 * RTM_GETTCLASS request with 'tcm_parent' set to 0, starts 'dump' with it on
 * 'rtnl_sock', and releases the request buffer. */
1948 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1950 struct ofpbuf request;
1951 struct tcmsg *tcmsg;
1953 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1957 tcmsg->tcm_parent = 0;
1958 nl_dump_start(dump, rtnl_sock, &request);
1959 ofpbuf_uninit(&request);
/* Iterates over every queue in the traffic-control state of 'netdev'.  For
 * each one, 'details' is cleared and refilled by the active implementation's
 * 'class_get' callback, then passed to 'cb' along with the queue id and
 * 'aux'.  An implementation without 'class_get' is handled as a separate
 * case before the loop. */
1964 netdev_linux_dump_queues(const struct netdev *netdev,
1965 netdev_dump_queues_cb *cb, void *aux)
1967 struct netdev_dev_linux *netdev_dev =
1968 netdev_dev_linux_cast(netdev_get_dev(netdev));
1969 struct tc_queue *queue;
1970 struct shash details;
1974 error = tc_query_qdisc(netdev);
1977 } else if (!netdev_dev->tc->ops->class_get) {
1982 shash_init(&details);
1983 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
/* 'details' is reused between queues, so drop the previous queue's entries. */
1984 shash_clear(&details);
1986 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
1988 (*cb)(queue->queue_id, &details, aux);
1993 shash_destroy(&details);
/* Dumps per-queue statistics for 'netdev' from the kernel.  Every message of
 * the class dump started by start_queue_dump() is handed to the active
 * implementation's 'class_dump_stats' callback together with 'cb' and 'aux'.
 *
 * The result is the error from finishing the dump if there is one, otherwise
 * 'last_error'. */
1999 netdev_linux_dump_queue_stats(const struct netdev *netdev,
2000 netdev_dump_queue_stats_cb *cb, void *aux)
2002 struct netdev_dev_linux *netdev_dev =
2003 netdev_dev_linux_cast(netdev_get_dev(netdev));
2004 struct nl_dump dump;
2009 error = tc_query_qdisc(netdev);
2012 } else if (!netdev_dev->tc->ops->class_dump_stats) {
2017 if (!start_queue_dump(netdev, &dump)) {
2020 while (nl_dump_next(&dump, &msg)) {
2021 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
2027 error = nl_dump_done(&dump);
2028 return error ? error : last_error;
/* Stores the IPv4 address and netmask of 'netdev_' in '*address' and
 * '*netmask'.
 *
 * Both values are cached in the device.  When VALID_IN4 is not set they are
 * first read with SIOCGIFADDR and SIOCGIFNETMASK.  Returns EADDRNOTAVAIL when
 * the resulting address is INADDR_ANY, otherwise 0. */
2032 netdev_linux_get_in4(const struct netdev *netdev_,
2033 struct in_addr *address, struct in_addr *netmask)
2035 struct netdev_dev_linux *netdev_dev =
2036 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2038 if (!(netdev_dev->cache_valid & VALID_IN4)) {
2041 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
2042 SIOCGIFADDR, "SIOCGIFADDR");
2047 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
2048 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2053 netdev_dev->cache_valid |= VALID_IN4;
2055 *address = netdev_dev->address;
2056 *netmask = netdev_dev->netmask;
2057 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
2061 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2062 struct in_addr netmask)
2064 struct netdev_dev_linux *netdev_dev =
2065 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2068 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2070 netdev_dev->cache_valid |= VALID_IN4;
2071 netdev_dev->address = address;
2072 netdev_dev->netmask = netmask;
2073 if (address.s_addr != INADDR_ANY) {
2074 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2075 "SIOCSIFNETMASK", netmask);
/* Parses one line of /proc/net/if_inet6.
 *
 * A line consists of 32 hexadecimal digits (the address), four hexadecimal
 * fields that are skipped, and the interface name.  On success stores the 16
 * address bytes in '*in6' and the name (at most 16 characters) in 'ifname',
 * and returns true.  Returns false if 'line' does not have that form, in which
 * case '*in6' and 'ifname' may be partially written. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *s6 = in6->s6_addr;
    int n_fields;

    /* X8 scans exactly two hex digits into one uint8_t. */
#define X8 "%2"SCNx8
    n_fields = sscanf(line,
                      " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                      "%*x %*x %*x %*x %16s\n",
                      &s6[0], &s6[1], &s6[2], &s6[3],
                      &s6[4], &s6[5], &s6[6], &s6[7],
                      &s6[8], &s6[9], &s6[10], &s6[11],
                      &s6[12], &s6[13], &s6[14], &s6[15],
                      ifname);
#undef X8

    /* 16 address bytes plus the interface name. */
    return n_fields == 17;
}
2097 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2098 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2100 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2102 struct netdev_dev_linux *netdev_dev =
2103 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2104 if (!(netdev_dev->cache_valid & VALID_IN6)) {
2108 netdev_dev->in6 = in6addr_any;
2110 file = fopen("/proc/net/if_inet6", "r");
2112 const char *name = netdev_get_name(netdev_);
2113 while (fgets(line, sizeof line, file)) {
2114 struct in6_addr in6_tmp;
2115 char ifname[16 + 1];
2116 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2117 && !strcmp(name, ifname))
2119 netdev_dev->in6 = in6_tmp;
2125 netdev_dev->cache_valid |= VALID_IN6;
2127 *in6 = netdev_dev->in6;
/* Fills '*sa' with an AF_INET socket address for 'addr' with port 0.  Every
 * byte of '*sa' that the IPv4 address structure does not cover is zeroed. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    memset(&in4, 0, sizeof in4);
    in4.sin_addr = addr;
    in4.sin_port = 0;
    in4.sin_family = AF_INET;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &in4, sizeof in4);
}
/* Issues address-setting ioctl 'ioctl_nr' for 'netdev' with IPv4 address
 * 'addr': the request carries the device name and an AF_INET socket address
 * built by make_in4_sockaddr().  'ioctl_name' is passed through to
 * netdev_linux_do_ioctl(), whose result is returned. */
2145 do_set_addr(struct netdev *netdev,
2146 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2149 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2150 make_in4_sockaddr(&ifr.ifr_addr, addr);
2152 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
/* The route is installed with SIOCADDRT on 'af_inet_sock': destination and
 * genmask are both INADDR_ANY (the default route), the gateway is 'router',
 * and the flags are RTF_UP | RTF_GATEWAY.  'netdev' is not used. */
2156 /* Adds 'router' as a default IP gateway. */
2158 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2160 struct in_addr any = { INADDR_ANY };
2164 memset(&rt, 0, sizeof rt);
2165 make_in4_sockaddr(&rt.rt_dst, any);
2166 make_in4_sockaddr(&rt.rt_gateway, router);
2167 make_in4_sockaddr(&rt.rt_genmask, any);
2168 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2169 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2171 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
2177 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2180 static const char fn[] = "/proc/net/route";
2185 *netdev_name = NULL;
2186 stream = fopen(fn, "r");
2187 if (stream == NULL) {
2188 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2193 while (fgets(line, sizeof line, stream)) {
2196 ovs_be32 dest, gateway, mask;
2197 int refcnt, metric, mtu;
2198 unsigned int flags, use, window, irtt;
2201 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2203 iface, &dest, &gateway, &flags, &refcnt,
2204 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2206 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2210 if (!(flags & RTF_UP)) {
2211 /* Skip routes that aren't up. */
2215 /* The output of 'dest', 'mask', and 'gateway' were given in
2216 * network byte order, so we don't need need any endian
2217 * conversions here. */
2218 if ((dest & mask) == (host->s_addr & mask)) {
2220 /* The host is directly reachable. */
2221 next_hop->s_addr = 0;
2223 /* To reach the host, we must go through a gateway. */
2224 next_hop->s_addr = gateway;
2226 *netdev_name = xstrdup(iface);
/* Adds driver information for 'netdev' to 'sh', as obtained through
 * netdev_linux_get_drvinfo(): keys "driver_name", "driver_version" and
 * "firmware_version", each mapped to a freshly allocated copy of the
 * corresponding string. */
2238 netdev_linux_get_status(const struct netdev *netdev, struct shash *sh)
2241 struct netdev_dev_linux *netdev_dev =
2242 netdev_dev_linux_cast(netdev_get_dev(netdev));
2244 error = netdev_linux_get_drvinfo(netdev_dev);
2246 shash_add(sh, "driver_name", xstrdup(netdev_dev->drvinfo.driver));
2247 shash_add(sh, "driver_version", xstrdup(netdev_dev->drvinfo.version));
2248 shash_add(sh, "firmware_version", xstrdup(netdev_dev->drvinfo.fw_version));
/* Status for internal devices: adds the fixed entry "driver_name" =
 * "openvswitch" (a freshly allocated copy) to 'sh'.  'netdev' is not used. */
2254 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED, struct shash *sh)
2256 shash_add(sh, "driver_name", xstrdup("openvswitch"));
2260 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2261 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2262 * returns 0. Otherwise, it returns a positive errno value; in particular,
2263 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
2265 netdev_linux_arp_lookup(const struct netdev *netdev,
2266 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2269 struct sockaddr_in sin;
/* Build the SIOCGARP request: protocol address 'ip', Ethernet hardware type,
 * restricted to the device named after 'netdev'. */
2272 memset(&r, 0, sizeof r);
2273 memset(&sin, 0, sizeof sin);
2274 sin.sin_family = AF_INET;
2275 sin.sin_addr.s_addr = ip;
2277 memcpy(&r.arp_pa, &sin, sizeof sin);
2278 r.arp_ha.sa_family = ARPHRD_ETHER;
2280 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2281 COVERAGE_INC(netdev_arp_lookup);
2282 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2284 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
/* A missing entry (ENXIO) is an ordinary outcome and is not logged. */
2285 } else if (retval != ENXIO) {
2286 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2287 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
2293 nd_to_iff_flags(enum netdev_flags nd)
2296 if (nd & NETDEV_UP) {
2299 if (nd & NETDEV_PROMISC) {
2306 iff_to_nd_flags(int iff)
2308 enum netdev_flags nd = 0;
2312 if (iff & IFF_PROMISC) {
2313 nd |= NETDEV_PROMISC;
/* Turns off the flags in 'off' and turns on the flags in 'on' for 'netdev',
 * storing the previous flags, translated to NETDEV_* bits, in '*old_flagsp'.
 *
 * The starting point is the cached 'ifi_flags'.  The kernel is touched only
 * when the requested flags differ from it; after set_flags(), the cache is
 * refreshed with get_flags(). */
2319 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2320 enum netdev_flags on, enum netdev_flags *old_flagsp)
2322 struct netdev_dev_linux *netdev_dev;
2323 int old_flags, new_flags;
2326 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2327 old_flags = netdev_dev->ifi_flags;
2328 *old_flagsp = iff_to_nd_flags(old_flags);
2329 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2330 if (new_flags != old_flags) {
2331 error = set_flags(netdev, new_flags);
2332 get_flags(&netdev_dev->netdev_dev, &netdev_dev->ifi_flags);
/* Returns the 'change_seq' value stored in the device underlying 'netdev'. */
2338 netdev_linux_change_seq(const struct netdev *netdev)
2340 return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
/* Expands to the initializer of a 'struct netdev_class' for a Linux-backed
 * provider.  Most callbacks are the netdev_linux_*() functions listed below;
 * the macro arguments supply the parts that differ between providers.  The
 * initializer is positional, so the order of entries must follow the member
 * order of struct netdev_class. */
2343 #define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS, \
2348 netdev_linux_init, \
2350 netdev_linux_wait, \
2353 netdev_linux_destroy, \
2354 NULL, /* get_config */ \
2355 NULL, /* set_config */ \
2357 netdev_linux_open, \
2358 netdev_linux_close, \
2360 netdev_linux_listen, \
2361 netdev_linux_recv, \
2362 netdev_linux_recv_wait, \
2363 netdev_linux_drain, \
2365 netdev_linux_send, \
2366 netdev_linux_send_wait, \
2368 netdev_linux_set_etheraddr, \
2369 netdev_linux_get_etheraddr, \
2370 netdev_linux_get_mtu, \
2371 netdev_linux_set_mtu, \
2372 netdev_linux_get_ifindex, \
2373 netdev_linux_get_carrier, \
2374 netdev_linux_get_carrier_resets, \
2375 netdev_linux_set_miimon_interval, \
2379 netdev_linux_get_features, \
2380 netdev_linux_set_advertisements, \
2382 netdev_linux_set_policing, \
2383 netdev_linux_get_qos_types, \
2384 netdev_linux_get_qos_capabilities, \
2385 netdev_linux_get_qos, \
2386 netdev_linux_set_qos, \
2387 netdev_linux_get_queue, \
2388 netdev_linux_set_queue, \
2389 netdev_linux_delete_queue, \
2390 netdev_linux_get_queue_stats, \
2391 netdev_linux_dump_queues, \
2392 netdev_linux_dump_queue_stats, \
2394 netdev_linux_get_in4, \
2395 netdev_linux_set_in4, \
2396 netdev_linux_get_in6, \
2397 netdev_linux_add_router, \
2398 netdev_linux_get_next_hop, \
2400 netdev_linux_arp_lookup, \
2402 netdev_linux_update_flags, \
2404 netdev_linux_change_seq \
/* Provider built from NETDEV_LINUX_CLASS with netdev_linux_create,
 * netdev_linux_get_stats, no set_stats callback, and
 * netdev_linux_get_status. */
2407 const struct netdev_class netdev_linux_class =
2410 netdev_linux_create,
2411 netdev_linux_get_stats,
2412 NULL, /* set_stats */
2413 netdev_linux_get_status);
/* Provider for tap devices, built from NETDEV_LINUX_CLASS with
 * netdev_linux_create_tap, netdev_tap_get_stats (which reports statistics
 * with transmit and receive swapped), no set_stats callback, and
 * netdev_linux_get_status. */
2415 const struct netdev_class netdev_tap_class =
2418 netdev_linux_create_tap,
2419 netdev_tap_get_stats,
2420 NULL, /* set_stats */
2421 netdev_linux_get_status);
/* Provider for internal devices, built from NETDEV_LINUX_CLASS with
 * netdev_linux_create, netdev_internal_get_stats (vport statistics only),
 * netdev_vport_set_stats, and netdev_internal_get_status. */
2423 const struct netdev_class netdev_internal_class =
2426 netdev_linux_create,
2427 netdev_internal_get_stats,
2428 netdev_vport_set_stats,
2429 netdev_internal_get_status);
2431 /* HTB traffic control class. */
/* Largest class minor number that htb_parse_tcmsg__() accepts as a queue:
 * minors 1 through HTB_N_QUEUES map to queue ids 0 through
 * HTB_N_QUEUES - 1. */
2433 #define HTB_N_QUEUES 0xf000
/* Qdisc-level state; htb_install__() stores the configured maximum rate
 * here. */
2437 unsigned int max_rate; /* In bytes/s. */
/* Per-queue (HTB class) configuration.  'tc_queue' is the embedded generic
 * queue structure. */
2441 struct tc_queue tc_queue;
2442 unsigned int min_rate; /* In bytes/s. */
2443 unsigned int max_rate; /* In bytes/s. */
2444 unsigned int burst; /* In bytes. */
2445 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the 'struct htb' that embeds the traffic-control state ('tc')
 * attached to 'netdev'.  The conversion is a plain CONTAINER_OF, so it is
 * only meaningful when that state was installed by htb_install__(). */
2449 htb_get__(const struct netdev *netdev)
2451 struct netdev_dev_linux *netdev_dev =
2452 netdev_dev_linux_cast(netdev_get_dev(netdev));
2453 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
/* Allocates HTB state for 'netdev', initializes its generic part with
 * tc_init() and 'tc_ops_htb', records 'max_rate', and attaches it to the
 * device as its traffic-control state.  Only this process's bookkeeping is
 * touched here.
 *
 * NOTE(review): 'max_rate' arrives as uint64_t; confirm that the field it is
 * stored into is wide enough for the rates callers pass. */
2457 htb_install__(struct netdev *netdev, uint64_t max_rate)
2459 struct netdev_dev_linux *netdev_dev =
2460 netdev_dev_linux_cast(netdev_get_dev(netdev));
2463 htb = xmalloc(sizeof *htb);
2464 tc_init(&htb->tc, &tc_ops_htb);
2465 htb->max_rate = max_rate;
2467 netdev_dev->tc = &htb->tc;
/* Any existing qdisc is deleted first.  The replacement is requested with
 * RTM_NEWQDISC (NLM_F_EXCL | NLM_F_CREATE), handle 1:0 at the root, kind
 * "htb", and a TCA_HTB_INIT option block in which 'rate2quantum' is 10.  The
 * result of the Netlink transaction is returned. */
2470 /* Create an HTB qdisc.
2472 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2474 htb_setup_qdisc__(struct netdev *netdev)
2477 struct tc_htb_glob opt;
2478 struct ofpbuf request;
2479 struct tcmsg *tcmsg;
2481 tc_del_qdisc(netdev);
2483 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2484 NLM_F_EXCL | NLM_F_CREATE, &request);
2488 tcmsg->tcm_handle = tc_make_handle(1, 0);
2489 tcmsg->tcm_parent = TC_H_ROOT;
2491 nl_msg_put_string(&request, TCA_KIND, "htb");
2493 memset(&opt, 0, sizeof opt);
2494 opt.rate2quantum = 10;
2498 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2499 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2500 nl_msg_end_nested(&request, opt_offset);
2502 return tc_transact(&request, NULL);
/* Creates or replaces HTB class 'handle' under 'parent' on 'netdev' with the
 * rates, burst and priority in '*class'.
 *
 * The device MTU is needed to fill in the rate tables and buffer sizes, so a
 * device whose MTU cannot be obtained cannot be configured.  The request is
 * an RTM_NEWTCLASS message (NLM_F_CREATE) of kind "htb" carrying
 * TCA_HTB_PARMS plus the rate and ceil tables.  A failed transaction is
 * logged together with the parameters that were requested. */
2505 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2506 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2508 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2509 unsigned int parent, struct htb_class *class)
2512 struct tc_htb_opt opt;
2513 struct ofpbuf request;
2514 struct tcmsg *tcmsg;
2518 error = netdev_get_mtu(netdev, &mtu);
2520 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2521 netdev_get_name(netdev));
2525 memset(&opt, 0, sizeof opt);
2526 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2527 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2528 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2529 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2530 opt.prio = class->priority;
2532 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2536 tcmsg->tcm_handle = handle;
2537 tcmsg->tcm_parent = parent;
2539 nl_msg_put_string(&request, TCA_KIND, "htb");
2540 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2541 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2542 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2543 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2544 nl_msg_end_nested(&request, opt_offset);
2546 error = tc_transact(&request, NULL);
2548 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2549 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2550 netdev_get_name(netdev),
2551 tc_get_major(handle), tc_get_minor(handle),
2552 tc_get_major(parent), tc_get_minor(parent),
2553 class->min_rate, class->max_rate,
2554 class->burst, class->priority, strerror(error));
2559 /* Parses Netlink attributes in 'nl_options' for HTB parameters and stores a
2560 * description of them into 'class'. The description complies with the
2561 * specification given in the vswitch database documentation for linux-htb
2564 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
/* TCA_HTB_PARMS is mandatory and must be at least as large as
 * struct tc_htb_opt. */
2566 static const struct nl_policy tca_htb_policy[] = {
2567 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2568 .min_len = sizeof(struct tc_htb_opt) },
2571 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2572 const struct tc_htb_opt *htb;
2574 if (!nl_parse_nested(nl_options, tca_htb_policy,
2575 attrs, ARRAY_SIZE(tca_htb_policy))) {
2576 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
/* Rates and priority are copied as-is; the kernel's 'buffer' is in ticks, so
 * it is converted back to bytes at the guaranteed rate. */
2580 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2581 class->min_rate = htb->rate.rate;
2582 class->max_rate = htb->ceil.rate;
2583 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2584 class->priority = htb->prio;
/* Parses class message 'tcmsg' with tc_parse_class().  When 'queue_id' is
 * nonnull and the class handle is 1:N with 1 <= N <= HTB_N_QUEUES, stores
 * N - 1 in '*queue_id'.  When 'options' is nonnull, the HTB options are parsed
 * into it with htb_parse_tca_options__(). */
2589 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2590 struct htb_class *options,
2591 struct netdev_queue_stats *stats)
2593 struct nlattr *nl_options;
2594 unsigned int handle;
2597 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2598 if (!error && queue_id) {
2599 unsigned int major = tc_get_major(handle);
2600 unsigned int minor = tc_get_minor(handle);
2601 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2602 *queue_id = minor - 1;
2607 if (!error && options) {
2608 error = htb_parse_tca_options__(nl_options, options);
/* Fills in 'hc' for the HTB root class from 'details'.  The "max-rate" value
 * is divided by 8 before being stored (htb_qdisc_get() multiplies by 8 when
 * reporting it).  When "max-rate" is absent or yields 0, the rate is derived
 * from the device's current features instead.  'min_rate' is set equal to
 * 'max_rate'. */
2614 htb_parse_qdisc_details__(struct netdev *netdev,
2615 const struct shash *details, struct htb_class *hc)
2617 const char *max_rate_s;
2619 max_rate_s = shash_find_data(details, "max-rate");
2620 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2621 if (!hc->max_rate) {
/* Only the first feature set is requested; the other outputs are not
 * needed. */
2624 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2625 hc->max_rate = netdev_features_to_bps(current) / 8;
2627 hc->min_rate = hc->max_rate;
/* Parses 'details' into 'hc' for a queue class on 'netdev'.  "min-rate",
 * "max-rate" and "burst" are divided by 8 before being stored; "priority" is
 * stored as parsed.  'min_rate' is raised to at least the MTU and then capped
 * at the qdisc's max_rate; 'max_rate' is raised to at least 'min_rate' and
 * capped the same way; 'burst' is raised to at least MTU + 64. */
2633 htb_parse_class_details__(struct netdev *netdev,
2634 const struct shash *details, struct htb_class *hc)
2636 const struct htb *htb = htb_get__(netdev);
2637 const char *min_rate_s = shash_find_data(details, "min-rate");
2638 const char *max_rate_s = shash_find_data(details, "max-rate");
2639 const char *burst_s = shash_find_data(details, "burst");
2640 const char *priority_s = shash_find_data(details, "priority");
2643 error = netdev_get_mtu(netdev, &mtu);
2645 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2646 netdev_get_name(netdev));
2650 /* HTB requires at least an mtu sized min-rate to send any traffic even
2651 * on uncongested links. */
2652 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2653 hc->min_rate = MAX(hc->min_rate, mtu);
2654 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2657 hc->max_rate = (max_rate_s
2658 ? strtoull(max_rate_s, NULL, 10) / 8
2660 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2661 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2665 * According to hints in the documentation that I've read, it is important
2666 * that 'burst' be at least as big as the largest frame that might be
2667 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2668 * but having it a bit too small is a problem. Since netdev_get_mtu()
2669 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2670 * the MTU. We actually add 64, instead of 14, as a guard against
2671 * additional headers getting tacked on somewhere that we're not aware of. */
2672 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2673 hc->burst = MAX(hc->burst, mtu + 64);
2676 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for class 'handle' under 'parent' with tc_query_class()
 * and parses the reply into 'options' and 'stats' with htb_parse_tcmsg__().
 * No queue ID is requested from the parse.  The reply buffer is deleted after
 * parsing. */
2682 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2683 unsigned int parent, struct htb_class *options,
2684 struct netdev_queue_stats *stats)
2686 struct ofpbuf *reply;
2689 error = tc_query_class(netdev, handle, parent, &reply);
2691 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2692 ofpbuf_delete(reply);
/* Installs HTB on 'netdev': sets up the qdisc, creates class 1:fffe under 1:0
 * with parameters parsed from 'details', and records the resulting max_rate
 * with htb_install__(). */
2698 htb_tc_install(struct netdev *netdev, const struct shash *details)
2702 error = htb_setup_qdisc__(netdev);
2704 struct htb_class hc;
2706 htb_parse_qdisc_details__(netdev, details, &hc);
2707 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2708 tc_make_handle(1, 0), &hc);
2710 htb_install__(netdev, hc.max_rate);
/* Returns the struct htb_class that embeds 'queue' as its 'tc_queue'
 * member. */
2716 static struct htb_class *
2717 htb_class_cast__(const struct tc_queue *queue)
2719 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Copies the rate, burst and priority in 'hc' into the class record kept for
 * 'queue_id' in 'netdev''s HTB queue map.
 *
 * NOTE(review): the record is presumably the one returned by
 * tc_find_queue__() when it exists and a newly allocated, newly inserted one
 * otherwise -- confirm against the branch around the xmalloc() below. */
2723 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2724 const struct htb_class *hc)
2726 struct htb *htb = htb_get__(netdev);
2727 size_t hash = hash_int(queue_id, 0);
2728 struct tc_queue *queue;
2729 struct htb_class *hcp;
2731 queue = tc_find_queue__(netdev, queue_id, hash);
2733 hcp = htb_class_cast__(queue);
2735 hcp = xmalloc(sizeof *hcp);
2736 queue = &hcp->tc_queue;
2737 queue->queue_id = queue_id;
2738 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2741 hcp->min_rate = hc->min_rate;
2742 hcp->max_rate = hc->max_rate;
2743 hcp->burst = hc->burst;
2744 hcp->priority = hc->priority;
/* Loads HTB state for 'netdev' from the kernel: queries class 1:fffe for the
 * qdisc-wide max_rate, records it with htb_install__(), then walks a queue
 * dump and stores every class that htb_parse_tcmsg__() accepts.  'nlmsg' is
 * unused. */
2748 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2751 struct nl_dump dump;
2752 struct htb_class hc;
2754 /* Get qdisc options. */
2756 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2757 htb_install__(netdev, hc.max_rate);
2760 if (!start_queue_dump(netdev, &dump)) {
/* 'hc' is reused as scratch space for each dumped class. */
2763 while (nl_dump_next(&dump, &msg)) {
2764 unsigned int queue_id;
2766 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2767 htb_update_queue__(netdev, queue_id, &hc);
2770 nl_dump_done(&dump);
/* Tears down the HTB state that embeds 'tc': every class is removed from the
 * queue map.  The _SAFE iterator is used because entries are removed while
 * iterating. */
2776 htb_tc_destroy(struct tc *tc)
2778 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2779 struct htb_class *hc, *next;
2781 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2782 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Adds "max-rate" to 'details' as a newly allocated decimal string holding
 * 8 times the stored HTB max_rate. */
2790 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2792 const struct htb *htb = htb_get__(netdev);
2793 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
/* Reconfigures class 1:fffe (parent 1:0) from 'details' and stores the new
 * max_rate in 'netdev''s HTB state. */
2798 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2800 struct htb_class hc;
2803 htb_parse_qdisc_details__(netdev, details, &hc);
2804 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2805 tc_make_handle(1, 0), &hc);
2807 htb_get__(netdev)->max_rate = hc.max_rate;
/* Reports the configuration of 'queue' into 'details' as newly allocated
 * strings.  "min-rate", "max-rate" and "burst" are the stored values
 * multiplied by 8; "priority" is reported unscaled.  "max-rate" is added only
 * when it differs from "min-rate".  'netdev' is unused. */
2813 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2814 const struct tc_queue *queue, struct shash *details)
2816 const struct htb_class *hc = htb_class_cast__(queue);
2818 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2819 if (hc->min_rate != hc->max_rate) {
2820 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2822 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2824 shash_add(details, "priority", xasprintf("%u", hc->priority));
/* Configures queue 'queue_id' on 'netdev' from 'details'.  The kernel class
 * handle is 1:(queue_id + 1) with parent 1:fffe; the parsed parameters are
 * also stored locally with htb_update_queue__(). */
2830 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2831 const struct shash *details)
2833 struct htb_class hc;
2836 error = htb_parse_class_details__(netdev, details, &hc);
2841 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2842 tc_make_handle(1, 0xfffe), &hc);
2847 htb_update_queue__(netdev, queue_id, &hc);
/* Deletes kernel class 1:(queue->queue_id + 1) on 'netdev' and removes the
 * corresponding record from the HTB queue map. */
2852 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2854 struct htb_class *hc = htb_class_cast__(queue);
2855 struct htb *htb = htb_get__(netdev);
2858 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2860 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches statistics for 'queue' into 'stats' by querying class
 * 1:(queue->queue_id + 1) under parent 1:fffe.  Class options are not
 * requested. */
2867 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2868 struct netdev_queue_stats *stats)
2870 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2871 tc_make_handle(1, 0xfffe), NULL, stats);
/* Parses the class handle and statistics out of dump message 'nlmsg'.  When
 * the handle is 1:N with 1 <= N <= HTB_N_QUEUES, invokes 'cb' with queue ID
 * N - 1, the parsed statistics, and 'aux'.  'netdev' is unused. */
2875 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2876 const struct ofpbuf *nlmsg,
2877 netdev_dump_queue_stats_cb *cb, void *aux)
2879 struct netdev_queue_stats stats;
2880 unsigned int handle, major, minor;
2883 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2888 major = tc_get_major(handle);
2889 minor = tc_get_minor(handle);
2890 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2891 (*cb)(minor - 1, &stats, aux);
/* Operations table for the Linux "htb" qdisc, exposed as QoS type
 * "linux-htb". */
2896 static const struct tc_ops tc_ops_htb = {
2897 "htb", /* linux_name */
2898 "linux-htb", /* ovs_name */
2899 HTB_N_QUEUES, /* n_queues */
2908 htb_class_get_stats, /* class_get_stats */
2909 htb_class_dump_stats /* class_dump_stats */
2912 /* "linux-hfsc" traffic control class. */
/* Largest class minor number accepted as an HFSC queue; queue IDs are the
 * minor number minus 1.  Also reported as n_queues in tc_ops_hfsc. */
2914 #define HFSC_N_QUEUES 0xf000
2922 struct tc_queue tc_queue;
/* Returns the struct hfsc that embeds 'netdev''s device-level 'tc' pointer as
 * its 'tc' member.
 *
 * NOTE(review): presumably only valid while the device's tc is an HFSC
 * instance -- nothing here checks that. */
2927 static struct hfsc *
2928 hfsc_get__(const struct netdev *netdev)
2930 struct netdev_dev_linux *netdev_dev;
2931 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2932 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
/* Returns the struct hfsc_class that embeds 'queue' as its 'tc_queue'
 * member. */
2935 static struct hfsc_class *
2936 hfsc_class_cast__(const struct tc_queue *queue)
2938 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates a new struct hfsc, initializes its 'tc' with tc_ops_hfsc, records
 * 'max_rate' in it, and makes it the tc of 'netdev''s underlying device. */
2942 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2944 struct netdev_dev_linux * netdev_dev;
2947 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2948 hfsc = xmalloc(sizeof *hfsc);
2949 tc_init(&hfsc->tc, &tc_ops_hfsc);
2950 hfsc->max_rate = max_rate;
2951 netdev_dev->tc = &hfsc->tc;
/* Copies 'hc''s min_rate and max_rate into the class record kept for
 * 'queue_id' in 'netdev''s HFSC queue map.
 *
 * NOTE(review): the record is presumably the one returned by
 * tc_find_queue__() when it exists and a newly allocated, newly inserted one
 * otherwise -- confirm against the branch around the xmalloc() below. */
2955 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2956 const struct hfsc_class *hc)
2960 struct hfsc_class *hcp;
2961 struct tc_queue *queue;
2963 hfsc = hfsc_get__(netdev);
2964 hash = hash_int(queue_id, 0);
2966 queue = tc_find_queue__(netdev, queue_id, hash);
2968 hcp = hfsc_class_cast__(queue);
2970 hcp = xmalloc(sizeof *hcp);
2971 queue = &hcp->tc_queue;
2972 queue->queue_id = queue_id;
2973 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
2976 hcp->min_rate = hc->min_rate;
2977 hcp->max_rate = hc->max_rate;
/* Parses the nested HFSC class options in 'nl_options' into 'class'.
 *
 * Three service curves are read (TCA_HFSC_RSC, TCA_HFSC_FSC, TCA_HFSC_USC).
 * A warning is logged when any curve has a nonzero 'm1' or 'd', when the RSC
 * and FSC 'm2' values differ, or when the RSC 'm2' exceeds the USC 'm2'.
 * Otherwise 'min_rate' is taken from the FSC 'm2' and 'max_rate' from the USC
 * 'm2'. */
2981 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2983 const struct tc_service_curve *rsc, *fsc, *usc;
2984 static const struct nl_policy tca_hfsc_policy[] = {
2986 .type = NL_A_UNSPEC,
2988 .min_len = sizeof(struct tc_service_curve),
2991 .type = NL_A_UNSPEC,
2993 .min_len = sizeof(struct tc_service_curve),
2996 .type = NL_A_UNSPEC,
2998 .min_len = sizeof(struct tc_service_curve),
3001 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3003 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3004 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3005 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3009 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3010 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3011 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
/* Only single-slope curves can be represented as a plain rate. */
3013 if (rsc->m1 != 0 || rsc->d != 0 ||
3014 fsc->m1 != 0 || fsc->d != 0 ||
3015 usc->m1 != 0 || usc->d != 0) {
3016 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3017 "Non-linear service curves are not supported.");
/* hfsc_setup_class__() always sends the same curve for RSC and FSC, so a
 * mismatch was not configured by this code. */
3021 if (rsc->m2 != fsc->m2) {
3022 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3023 "Real-time service curves are not supported ");
3027 if (rsc->m2 > usc->m2) {
3028 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3029 "Min-rate service curve is greater than "
3030 "the max-rate service curve.");
3034 class->min_rate = fsc->m2;
3035 class->max_rate = usc->m2;
/* Parses class message 'tcmsg' with tc_parse_class().  A class handle 1:N
 * with 1 <= N <= HFSC_N_QUEUES maps to queue ID N - 1, stored in '*queue_id'.
 * The class options are parsed into 'options' with
 * hfsc_parse_tca_options__().
 *
 * NOTE(review): callers in this file pass a null 'queue_id' or 'options';
 * presumably each output is filled in only when nonnull -- confirm. */
3040 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3041 struct hfsc_class *options,
3042 struct netdev_queue_stats *stats)
3045 unsigned int handle;
3046 struct nlattr *nl_options;
3048 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3054 unsigned int major, minor;
3056 major = tc_get_major(handle);
3057 minor = tc_get_minor(handle);
3058 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3059 *queue_id = minor - 1;
3066 error = hfsc_parse_tca_options__(nl_options, options);
/* Queries the kernel for class 'handle' under 'parent' with tc_query_class()
 * and parses the reply into 'options' and 'stats' with hfsc_parse_tcmsg__().
 * No queue ID is requested from the parse.  The reply buffer is deleted after
 * parsing. */
3073 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3074 unsigned int parent, struct hfsc_class *options,
3075 struct netdev_queue_stats *stats)
3078 struct ofpbuf *reply;
3080 error = tc_query_class(netdev, handle, parent, &reply);
3085 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3086 ofpbuf_delete(reply);
/* Fills in 'class' for the HFSC root class from 'details'.  The "max-rate"
 * value is divided by 8 (hfsc_qdisc_get() multiplies by 8 when reporting
 * it); a rate derived from the device's current features is the alternative
 * source.  Both 'min_rate' and 'max_rate' are set to the resulting rate.
 *
 * NOTE(review): the feature-based rate is presumably used only when
 * "max-rate" is absent or zero, as in htb_parse_qdisc_details__() --
 * confirm. */
3091 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
3092 struct hfsc_class *class)
3095 const char *max_rate_s;
3097 max_rate_s = shash_find_data(details, "max-rate");
3098 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
/* Only the first feature set is requested; the other outputs are not
 * needed. */
3103 netdev_get_features(netdev, &current, NULL, NULL, NULL);
3104 max_rate = netdev_features_to_bps(current) / 8;
3107 class->min_rate = max_rate;
3108 class->max_rate = max_rate;
/* Parses "min-rate" and "max-rate" from 'details' into 'class' for a queue
 * class on 'netdev'.  Both values are divided by 8.  'min_rate' is raised to
 * at least 1 and then capped at the qdisc's max_rate; 'max_rate' is raised to
 * at least 'min_rate' and capped the same way. */
3112 hfsc_parse_class_details__(struct netdev *netdev,
3113 const struct shash *details,
3114 struct hfsc_class * class)
3116 const struct hfsc *hfsc;
3117 uint32_t min_rate, max_rate;
3118 const char *min_rate_s, *max_rate_s;
3120 hfsc = hfsc_get__(netdev);
3121 min_rate_s = shash_find_data(details, "min-rate");
3122 max_rate_s = shash_find_data(details, "max-rate");
3124 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3125 min_rate = MAX(min_rate, 1);
3126 min_rate = MIN(min_rate, hfsc->max_rate);
3128 max_rate = (max_rate_s
3129 ? strtoull(max_rate_s, NULL, 10) / 8
3131 max_rate = MAX(max_rate, min_rate);
3132 max_rate = MIN(max_rate, hfsc->max_rate);
3134 class->min_rate = min_rate;
3135 class->max_rate = max_rate;
3140 /* Create an HFSC qdisc.
3142 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
/* Any existing root qdisc is deleted first with tc_del_qdisc().  The new
 * qdisc is requested with NLM_F_EXCL | NLM_F_CREATE, handle 1:0, and parent
 * TC_H_ROOT.  Returns the result of tc_transact(). */
3144 hfsc_setup_qdisc__(struct netdev * netdev)
3146 struct tcmsg *tcmsg;
3147 struct ofpbuf request;
3148 struct tc_hfsc_qopt opt;
3150 tc_del_qdisc(netdev);
3152 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3153 NLM_F_EXCL | NLM_F_CREATE, &request);
3159 tcmsg->tcm_handle = tc_make_handle(1, 0);
3160 tcmsg->tcm_parent = TC_H_ROOT;
3162 memset(&opt, 0, sizeof opt);
/* Unlike the class setup, the qdisc options are sent as a flat TCA_OPTIONS
 * attribute, not a nested one. */
3165 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3166 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3168 return tc_transact(&request, NULL);
3171 /* Create an HFSC class.
3173 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3174 * sc rate <min_rate> ul rate <max_rate>" */
/* 'handle' and 'parent' are stored unchanged in the request's tcm_handle and
 * tcm_parent.  The 'min' curve (m2 = class->min_rate) is sent as both the RSC
 * and FSC attributes; the 'max' curve (m2 = class->max_rate) is sent as
 * USC. */
3176 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3177 unsigned int parent, struct hfsc_class *class)
3181 struct tcmsg *tcmsg;
3182 struct ofpbuf request;
3183 struct tc_service_curve min, max;
3185 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3191 tcmsg->tcm_handle = handle;
3192 tcmsg->tcm_parent = parent;
3196 min.m2 = class->min_rate;
3200 max.m2 = class->max_rate;
3202 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3203 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3204 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3205 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3206 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3207 nl_msg_end_nested(&request, opt_offset);
3209 error = tc_transact(&request, NULL);
3211 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3212 "min-rate %ubps, max-rate %ubps (%s)",
3213 netdev_get_name(netdev),
3214 tc_get_major(handle), tc_get_minor(handle),
3215 tc_get_major(parent), tc_get_minor(parent),
3216 class->min_rate, class->max_rate, strerror(error));
/* Installs HFSC on 'netdev': sets up the qdisc, creates class 1:fffe under
 * 1:0 with parameters parsed from 'details', and records the resulting
 * max_rate with hfsc_install__(). */
3223 hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3226 struct hfsc_class class;
3228 error = hfsc_setup_qdisc__(netdev);
3234 hfsc_parse_qdisc_details__(netdev, details, &class);
3235 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3236 tc_make_handle(1, 0), &class);
3242 hfsc_install__(netdev, class.max_rate);
/* Loads HFSC state for 'netdev' from the kernel: queries class 1:fffe for the
 * qdisc-wide max_rate, records it with hfsc_install__(), then walks a queue
 * dump and stores every class that hfsc_parse_tcmsg__() accepts.  'nlmsg' is
 * unused. */
3247 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3250 struct nl_dump dump;
3251 struct hfsc_class hc;
3254 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3255 hfsc_install__(netdev, hc.max_rate);
3257 if (!start_queue_dump(netdev, &dump)) {
/* 'hc' is reused as scratch space for each dumped class. */
3261 while (nl_dump_next(&dump, &msg)) {
3262 unsigned int queue_id;
3264 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3265 hfsc_update_queue__(netdev, queue_id, &hc);
3269 nl_dump_done(&dump);
/* Tears down the HFSC state that embeds 'tc': every class is removed from the
 * queue map.  The _SAFE iterator is used because entries are removed while
 * iterating. */
3274 hfsc_tc_destroy(struct tc *tc)
3277 struct hfsc_class *hc, *next;
3279 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3281 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3282 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Adds "max-rate" to 'details' as a newly allocated decimal string holding
 * 8 times the stored HFSC max_rate. */
3291 hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3293 const struct hfsc *hfsc;
3294 hfsc = hfsc_get__(netdev);
3295 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
/* Reconfigures class 1:fffe (parent 1:0) from 'details' and stores the new
 * max_rate in 'netdev''s HFSC state. */
3300 hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3303 struct hfsc_class class;
3305 hfsc_parse_qdisc_details__(netdev, details, &class);
3306 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3307 tc_make_handle(1, 0), &class);
3310 hfsc_get__(netdev)->max_rate = class.max_rate;
/* Reports the configuration of 'queue' into 'details' as newly allocated
 * strings holding the stored rates multiplied by 8.  "max-rate" is added only
 * when it differs from "min-rate".  'netdev' is unused. */
3317 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3318 const struct tc_queue *queue, struct shash *details)
3320 const struct hfsc_class *hc;
3322 hc = hfsc_class_cast__(queue);
3323 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3324 if (hc->min_rate != hc->max_rate) {
3325 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
/* Configures queue 'queue_id' on 'netdev' from 'details'.  The kernel class
 * handle is 1:(queue_id + 1) with parent 1:fffe; the parsed parameters are
 * also stored locally with hfsc_update_queue__(). */
3331 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3332 const struct shash *details)
3335 struct hfsc_class class;
3337 error = hfsc_parse_class_details__(netdev, details, &class);
3342 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3343 tc_make_handle(1, 0xfffe), &class);
3348 hfsc_update_queue__(netdev, queue_id, &class);
/* Deletes kernel class 1:(queue->queue_id + 1) on 'netdev' and removes the
 * corresponding record from the HFSC queue map. */
3353 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3357 struct hfsc_class *hc;
3359 hc = hfsc_class_cast__(queue);
3360 hfsc = hfsc_get__(netdev);
3362 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3364 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches statistics for 'queue' into 'stats' by querying class
 * 1:(queue->queue_id + 1) under parent 1:fffe.  Class options are not
 * requested. */
3371 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3372 struct netdev_queue_stats *stats)
3374 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3375 tc_make_handle(1, 0xfffe), NULL, stats);
/* Parses the class handle and statistics out of dump message 'nlmsg'.  When
 * the handle is 1:N with 1 <= N <= HFSC_N_QUEUES, invokes 'cb' with queue ID
 * N - 1, the parsed statistics, and 'aux'.  'netdev' is unused. */
3379 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3380 const struct ofpbuf *nlmsg,
3381 netdev_dump_queue_stats_cb *cb, void *aux)
3383 struct netdev_queue_stats stats;
3384 unsigned int handle, major, minor;
3387 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3392 major = tc_get_major(handle);
3393 minor = tc_get_minor(handle);
3394 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3395 (*cb)(minor - 1, &stats, aux);
/* Operations table for the Linux "hfsc" qdisc, exposed as QoS type
 * "linux-hfsc". */
3400 static const struct tc_ops tc_ops_hfsc = {
3401 "hfsc", /* linux_name */
3402 "linux-hfsc", /* ovs_name */
3403 HFSC_N_QUEUES, /* n_queues */
3404 hfsc_tc_install, /* tc_install */
3405 hfsc_tc_load, /* tc_load */
3406 hfsc_tc_destroy, /* tc_destroy */
3407 hfsc_qdisc_get, /* qdisc_get */
3408 hfsc_qdisc_set, /* qdisc_set */
3409 hfsc_class_get, /* class_get */
3410 hfsc_class_set, /* class_set */
3411 hfsc_class_delete, /* class_delete */
3412 hfsc_class_get_stats, /* class_get_stats */
3413 hfsc_class_dump_stats /* class_dump_stats */
3416 /* "linux-default" traffic control class.
3418 * This class represents the default, unnamed Linux qdisc. It corresponds to
3419 * the "" (empty string) QoS type in the OVS database. */
/* Points 'netdev''s underlying device at a struct tc initialized with
 * tc_ops_default.
 *
 * 'tc' has static storage duration, so its value persists across calls.
 * NOTE(review): presumably it is allocated only on the first call and then
 * shared by every device that uses the default qdisc -- confirm against the
 * guard around the xmalloc(). */
3422 default_install__(struct netdev *netdev)
3424 struct netdev_dev_linux *netdev_dev =
3425 netdev_dev_linux_cast(netdev_get_dev(netdev));
3426 static struct tc *tc;
3429 tc = xmalloc(sizeof *tc);
3430 tc_init(tc, &tc_ops_default);
3432 netdev_dev->tc = tc;
/* tc_install hook for the default qdisc: delegates to default_install__().
 * 'details' is unused. */
3436 default_tc_install(struct netdev *netdev,
3437 const struct shash *details OVS_UNUSED)
3439 default_install__(netdev);
/* tc_load hook for the default qdisc: delegates to default_install__().
 * 'nlmsg' is unused. */
3444 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3446 default_install__(netdev);
/* Operations table for the default qdisc.  It has no Linux qdisc name, and
 * the destroy, qdisc and class hooks listed below are all null. */
3450 static const struct tc_ops tc_ops_default = {
3451 NULL, /* linux_name */
3456 NULL, /* tc_destroy */
3457 NULL, /* qdisc_get */
3458 NULL, /* qdisc_set */
3459 NULL, /* class_get */
3460 NULL, /* class_set */
3461 NULL, /* class_delete */
3462 NULL, /* class_get_stats */
3463 NULL /* class_dump_stats */
3466 /* "linux-other" traffic control class.
/* Points 'netdev''s underlying device at a struct tc initialized with
 * tc_ops_other.  'nlmsg' is unused.
 *
 * 'tc' has static storage duration, so its value persists across calls.
 * NOTE(review): presumably it is allocated only on the first call -- confirm
 * against the guard around the xmalloc(). */
3471 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3473 struct netdev_dev_linux *netdev_dev =
3474 netdev_dev_linux_cast(netdev_get_dev(netdev));
3475 static struct tc *tc;
3478 tc = xmalloc(sizeof *tc);
3479 tc_init(tc, &tc_ops_other);
3481 netdev_dev->tc = tc;
/* Operations table for QoS type "linux-other".  It has no Linux qdisc name
 * and no tc_install hook; the destroy, qdisc and class hooks are all null. */
3485 static const struct tc_ops tc_ops_other = {
3486 NULL, /* linux_name */
3487 "linux-other", /* ovs_name */
3489 NULL, /* tc_install */
3491 NULL, /* tc_destroy */
3492 NULL, /* qdisc_get */
3493 NULL, /* qdisc_set */
3494 NULL, /* class_get */
3495 NULL, /* class_set */
3496 NULL, /* class_delete */
3497 NULL, /* class_get_stats */
3498 NULL /* class_dump_stats */
3501 /* Traffic control. */
3503 /* Number of kernel "tc" ticks per second. */
/* Computed from the first three fields of /proc/net/psched as a * c / b. */
3504 static double ticks_per_s;
3506 /* Number of kernel "jiffies" per second. This is used for the purpose of
3507 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3508 * one jiffy's worth of data.
3510 * There are two possibilities here:
3512 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3513 * approximate range of 100 to 1024. That means that we really need to
3514 * make sure that the qdisc can buffer that much data.
3516 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3517 * has finely granular timers and there's no need to fudge additional room
3518 * for buffers. (There's no extra effort needed to implement that: the
3519 * large 'buffer_hz' is used as a divisor, so practically any number will
3520 * come out as 0 in the division. Small integer results in the case of
3521 * really high dividends won't have any real effect anyhow.)
3523 static unsigned int buffer_hz;
3525 /* Returns tc handle 'major':'minor'. */
/* 'major' is shifted into the upper 16 bits before being combined with
 * 'minor' by TC_H_MAKE. */
3527 tc_make_handle(unsigned int major, unsigned int minor)
3529 return TC_H_MAKE(major << 16, minor);
3532 /* Returns the major number from 'handle'. */
/* TC_H_MAJ leaves the major part in place, so it is shifted down by 16 bits;
 * this is the inverse of the shift in tc_make_handle(). */
3534 tc_get_major(unsigned int handle)
3536 return TC_H_MAJ(handle) >> 16;
3539 /* Returns the minor number from 'handle'. */
/* No shift is applied, unlike tc_get_major(). */
3541 tc_get_minor(unsigned int handle)
3543 return TC_H_MIN(handle);
/* Starts a traffic-control Netlink request for 'netdev' in 'request'.
 *
 * 'request' is initialized with an initial size of 512 bytes and receives a
 * Netlink header of the given 'type' with flags NLM_F_REQUEST | 'flags',
 * followed by a zeroed struct tcmsg whose family is AF_UNSPEC and whose
 * ifindex is 'netdev''s.  The caller fills in the handle and parent through
 * the struct tcmsg pointer before sending the request.
 *
 * NOTE(review): callers assign the return value to a struct tcmsg pointer;
 * the behavior when get_ifindex() fails is not visible here -- confirm. */
3546 static struct tcmsg *
3547 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3548 struct ofpbuf *request)
3550 struct tcmsg *tcmsg;
3554 error = get_ifindex(netdev, &ifindex);
3559 ofpbuf_init(request, 512);
3560 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3561 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3562 tcmsg->tcm_family = AF_UNSPEC;
3563 tcmsg->tcm_ifindex = ifindex;
3564 /* Caller should fill in tcmsg->tcm_handle. */
3565 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on 'rtnl_sock' with nl_sock_transact(), passing 'replyp'
 * through for the reply.  'request' is uninitialized right after the
 * transaction, so callers must not reuse it. */
3571 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3573 int error = nl_sock_transact(rtnl_sock, request, replyp);
3574 ofpbuf_uninit(request);
3578 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3579 * policing configuration.
3581 * This function is equivalent to running the following when 'add' is true:
3582 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3584 * This function is equivalent to running the following when 'add' is false:
3585 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3587 * The configuration and stats may be seen with the following command:
3588 * /sbin/tc -s qdisc show dev <devname>
3590 * Returns 0 if successful, otherwise a positive errno value.
3593 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3595 struct ofpbuf request;
3596 struct tcmsg *tcmsg;
/* Adding uses NLM_F_EXCL | NLM_F_CREATE; deleting needs no extra flags. */
3598 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3599 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3601 tcmsg = tc_make_request(netdev, type, flags, &request);
3605 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3606 tcmsg->tcm_parent = TC_H_INGRESS;
/* The ingress qdisc takes no parameters, so TCA_OPTIONS is sent empty. */
3607 nl_msg_put_string(&request, TCA_KIND, "ingress");
3608 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3610 error = tc_transact(&request, NULL);
3612 /* If we're deleting the qdisc, don't worry about some of the
3613 * error conditions. */
3614 if (!add && (error == ENOENT || error == EINVAL)) {
3623 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3626 * This function is equivalent to running:
3627 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3628 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3631 * The configuration and stats may be seen with the following command:
3632 * /sbin/tc -s filter show <devname> eth0 parent ffff:
3634 * Returns 0 if successful, otherwise a positive errno value.
3637 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3639 struct tc_police tc_police;
3640 struct ofpbuf request;
3641 struct tcmsg *tcmsg;
3642 size_t basic_offset;
3643 size_t police_offset;
3647 memset(&tc_police, 0, sizeof tc_police);
3648 tc_police.action = TC_POLICE_SHOT;
3649 tc_police.mtu = mtu;
3650 tc_fill_rate(&tc_police.rate, kbits_rate/8 * 1000, mtu);
3651 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3652 kbits_burst * 1024);
3654 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3655 NLM_F_EXCL | NLM_F_CREATE, &request);
3659 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3660 tcmsg->tcm_info = tc_make_handle(49,
3661 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3663 nl_msg_put_string(&request, TCA_KIND, "basic");
3664 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3665 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3666 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3667 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3668 nl_msg_end_nested(&request, police_offset);
3669 nl_msg_end_nested(&request, basic_offset);
3671 error = tc_transact(&request, NULL);
3682 /* The values in psched are not individually very meaningful, but they are
3683 * important. The tables below show some values seen in the wild.
3687 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3688 * (Before that, there are hints that it was 1000000000.)
3690 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3694 * -----------------------------------
3695 * [1] 000c8000 000f4240 000f4240 00000064
3696 * [2] 000003e8 00000400 000f4240 3b9aca00
3697 * [3] 000003e8 00000400 000f4240 3b9aca00
3698 * [4] 000003e8 00000400 000f4240 00000064
3699 * [5] 000003e8 00000040 000f4240 3b9aca00
3700 * [6] 000003e8 00000040 000f4240 000000f9
3702 * a b c d ticks_per_s buffer_hz
3703 * ------- --------- ---------- ------------- ----------- -------------
3704 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3705 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3706 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3707 * [4] 1,000 1,024 1,000,000 100 976,562 100
3708 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3709 * [6] 1,000 64 1,000,000 249 15,625,000 249
3711 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3712 * [2] 2.6.26-1-686-bigmem from Debian lenny
3713 * [3] 2.6.26-2-sparc64 from Debian lenny
3714 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3715 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3716 * [6] 2.6.34 from kernel.org on KVM
3718 static const char fn[] = "/proc/net/psched";
3719 unsigned int a, b, c, d;
3725 stream = fopen(fn, "r");
3727 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
/* The file holds four hexadecimal fields. */
3731 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3732 VLOG_WARN("%s: read failed", fn);
3736 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3740 VLOG_WARN("%s: invalid scheduler parameters", fn);
/* Computed in floating point, so the product a * c cannot wrap. */
3744 ticks_per_s = (double) a * c / b;
3748 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3751 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3754 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3755 * rate of 'rate' bytes per second. */
3757 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
/* Widen before multiplying: 'rate * ticks' evaluated as unsigned int can
 * wrap around before the division by 'ticks_per_s' is applied. */
3762 return ((unsigned long long int) rate * ticks) / ticks_per_s;
3765 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3766 * rate of 'rate' bytes per second. */
/* Returns 0 when 'rate' is 0, which avoids a division by zero.  The cast
 * converts 'ticks_per_s' to an integer, so the multiplication and division
 * are done in unsigned long long arithmetic. */
3768 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3773 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3776 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3777 * a transmission rate of 'rate' bytes per second. */
/* This is 'rate' divided by 'buffer_hz' with integer division, so a very
 * large 'buffer_hz' yields 0. */
3779 tc_buffer_per_jiffy(unsigned int rate)
3784 return rate / buffer_hz;
3787 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3788 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3789 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3790 * stores NULL into it if it is absent.
3792 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3795 * Returns 0 if successful, otherwise a positive errno value. */
3797 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3798 struct nlattr **options)
/* TCA_KIND is required; TCA_OPTIONS is optional. */
3800 static const struct nl_policy tca_policy[] = {
3801 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3802 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3804 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
/* Attributes start after the Netlink header and the struct tcmsg. */
3806 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3807 tca_policy, ta, ARRAY_SIZE(ta))) {
3808 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3813 *kind = nl_attr_get_string(ta[TCA_KIND]);
3817 *options = ta[TCA_OPTIONS];
3832 /* Given Netlink 'msg' that describes a class, extracts its class handle
3833 * (tcm_handle, unmodified) into '*handlep', its TCA_OPTIONS attribute
3834 * into '*options', and its queue statistics into '*stats'. Any of the output
3835 * arguments may be null.
3837 * Returns 0 if successful, otherwise a positive errno value. */
3839 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3840 struct nlattr **options, struct netdev_queue_stats *stats)
/* Both TCA_OPTIONS and TCA_STATS2 are required in a class message. */
3842 static const struct nl_policy tca_policy[] = {
3843 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3844 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3846 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
/* Attributes start after the Netlink header and the struct tcmsg. */
3848 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3849 tca_policy, ta, ARRAY_SIZE(ta))) {
3850 VLOG_WARN_RL(&rl, "failed to parse class message");
3855 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3856 *handlep = tc->tcm_handle;
3860 *options = ta[TCA_OPTIONS];
3864 const struct gnet_stats_queue *gsq;
3865 struct gnet_stats_basic gsb;
3867 static const struct nl_policy stats_policy[] = {
3868 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3869 .min_len = sizeof gsb },
3870 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3871 .min_len = sizeof *gsq },
3873 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3875 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3876 sa, ARRAY_SIZE(sa))) {
3877 VLOG_WARN_RL(&rl, "failed to parse class stats");
3881 /* Alignment issues screw up the length of struct gnet_stats_basic on
3882 * some arch/bitsize combinations. Newer versions of Linux have a
3883 * struct gnet_stats_basic_packed, but we can't depend on that. The
3884 * easiest thing to do is just to make a copy. */
3885 memset(&gsb, 0, sizeof gsb);
3886 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3887 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3888 stats->tx_bytes = gsb.bytes;
3889 stats->tx_packets = gsb.packets;
/* Queue drops are reported as transmit errors. */
3891 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3892 stats->tx_errors = gsq->drops;
3902 memset(stats, 0, sizeof *stats);
3907 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
/* Sends an RTM_GETTCLASS request (with NLM_F_ECHO) for 'netdev' and passes
 * 'replyp' to tc_transact() to receive the reply.  The warning below reports
 * a failed query. */
3910 tc_query_class(const struct netdev *netdev,
3911 unsigned int handle, unsigned int parent,
3912 struct ofpbuf **replyp)
3914 struct ofpbuf request;
3915 struct tcmsg *tcmsg;
3918 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3922 tcmsg->tcm_handle = handle;
3923 tcmsg->tcm_parent = parent;
3925 error = tc_transact(&request, replyp);
3927 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3928 netdev_get_name(netdev),
3929 tc_get_major(handle), tc_get_minor(handle),
3930 tc_get_major(parent), tc_get_minor(parent),
3936 /* Equivalent to "tc class del dev <name> handle <handle>". */
/* tc_delete_class(): builds an RTM_DELTCLASS request for 'netdev' with
 * tcm_handle set to 'handle' and tcm_parent set to 0, then runs it with
 * tc_transact().  The reply argument is NULL, so no reply is collected.
 *
 * A failed deletion is logged with a rate-limited warning that names the
 * device and prints the class handle as a major:minor pair. */
3938 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3940 struct ofpbuf request;
3941 struct tcmsg *tcmsg;
3944 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3948 tcmsg->tcm_handle = handle;
3949 tcmsg->tcm_parent = 0;
3951 error = tc_transact(&request, NULL);
3953 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3954 netdev_get_name(netdev),
3955 tc_get_major(handle), tc_get_minor(handle),
3961 /* Equivalent to "tc qdisc del dev <name> root". */
/* tc_del_qdisc(): builds an RTM_DELQDISC request for 'netdev' that targets
 * the qdisc with handle 1:0 and parent TC_H_ROOT, then runs it with
 * tc_transact() without collecting a reply.
 *
 * An EINVAL result is special-cased, as explained by the comment in the
 * body.  When the deletion did not fail and the device still has a tc object
 * cached in netdev_dev->tc, that object's tc_destroy() callback is invoked if
 * one is provided, and the cached pointer is reset to NULL. */
3963 tc_del_qdisc(struct netdev *netdev)
3965 struct netdev_dev_linux *netdev_dev =
3966 netdev_dev_linux_cast(netdev_get_dev(netdev));
3967 struct ofpbuf request;
3968 struct tcmsg *tcmsg;
3971 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
3975 tcmsg->tcm_handle = tc_make_handle(1, 0);
3976 tcmsg->tcm_parent = TC_H_ROOT;
3978 error = tc_transact(&request, NULL);
3979 if (error == EINVAL) {
3980 /* EINVAL probably means that the default qdisc was in use, in which
3981 * case we've accomplished our purpose. */
3984 if (!error && netdev_dev->tc) {
3985 if (netdev_dev->tc->ops->tc_destroy) {
3986 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3988 netdev_dev->tc = NULL;
3993 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3994 * kernel to determine what they are. Returns 0 if successful, otherwise a
3995 * positive errno value. */
/* tc_query_qdisc(): picks the 'struct tc_ops' implementation that describes
 * the qdisc currently on 'netdev' and asks it to load the qdisc state.
 *
 * The check on netdev_dev->tc at the top corresponds to the "already known"
 * case (presumably an early return; TODO confirm).  Otherwise an
 * RTM_GETQDISC request for handle 1:0 is sent, and 'ops' is chosen as
 * follows:
 *
 *   - the result of tc_lookup_linux_name() for the kind reported by
 *     tc_parse_qdisc(), with tc_ops_other as the fallback (for example next
 *     to the "unknown qdisc" log message);
 *   - tc_ops_default when the query fails with ENOENT;
 *   - tc_ops_other for any other query failure, which is also logged.
 *
 * Finally ops->tc_load() is called, the reply buffer is deleted, and the
 * query error, if any, is returned in preference to the load error. */
3997 tc_query_qdisc(const struct netdev *netdev)
3999 struct netdev_dev_linux *netdev_dev =
4000 netdev_dev_linux_cast(netdev_get_dev(netdev));
4001 struct ofpbuf request, *qdisc;
4002 const struct tc_ops *ops;
4003 struct tcmsg *tcmsg;
4007 if (netdev_dev->tc) {
4011 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4012 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4013 * 2.6.35 without that fix backported to it.
4015 * To avoid the OOPS, we must not make a request that would attempt to dump
4016 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4017 * few others. There are a few ways that I can see to do this, but most of
4018 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4019 * technique chosen here is to assume that any non-default qdisc that we
4020 * create will have a class with handle 1:0. The built-in qdiscs only have
4021 * a class with handle 0:0.
4023 * We could check for Linux 2.6.35+ and use a more straightforward method
4025 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
4029 tcmsg->tcm_handle = tc_make_handle(1, 0);
4030 tcmsg->tcm_parent = 0;
4032 /* Figure out what tc class to instantiate. */
4033 error = tc_transact(&request, &qdisc);
4037 error = tc_parse_qdisc(qdisc, &kind, NULL);
4039 ops = &tc_ops_other;
4041 ops = tc_lookup_linux_name(kind);
4043 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4044 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4046 ops = &tc_ops_other;
4049 } else if (error == ENOENT) {
4050 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4051 * other entity that doesn't have a handle 1:0. We will assume
4052 * that it's the system default qdisc. */
4053 ops = &tc_ops_default;
4056 /* Who knows? Maybe the device got deleted. */
4057 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4058 netdev_get_name(netdev), strerror(error));
4059 ops = &tc_ops_other;
4062 /* Instantiate it. */
4063 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
/* tc_load() must leave netdev_dev->tc set exactly when it reports success. */
4064 assert((load_error == 0) == (netdev_dev->tc != NULL));
4065 ofpbuf_delete(qdisc);
/* A query error takes precedence over a load error. */
4067 return error ? error : load_error;
4070 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4071 approximate the time to transmit packets of various lengths. For an MTU of
4072 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4073 represents two possible packet lengths; for an MTU of 513 through 1024, four
4074 possible lengths; and so on.
4076 Returns, for the specified 'mtu', the number of bits that packet lengths
4077 need to be shifted right to fit within such a 256-entry table. */
/* tc_calc_cell_log(): 'mtu' may first be replaced by ETH_PAYLOAD_MAX
 * (presumably as the default for an unset MTU; TODO confirm the condition).
 * The Ethernet and VLAN header lengths are then added to it, and 'cell_log'
 * is counted up from zero for as long as the resulting size is 256 or more. */
4079 tc_calc_cell_log(unsigned int mtu)
4084 mtu = ETH_PAYLOAD_MAX;
4086 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
4088 for (cell_log = 0; mtu >= 256; cell_log++) {
4095 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
 * of 'mtu'. */
/* tc_fill_rate(): zeroes '*rate', then sets its cell_log from
 * tc_calc_cell_log(mtu) and its mpu to ETH_TOTAL_MIN.  The 'overhead' and
 * 'cell_align' assignments are commented out, so those members keep the zero
 * written by memset().
 *
 * NOTE(review): the lines shown here do not use 'Bps'; presumably it is
 * stored in rate->rate -- confirm. */
4098 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4100 memset(rate, 0, sizeof *rate);
4101 rate->cell_log = tc_calc_cell_log(mtu);
4102 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4103 /* rate->cell_align = 0; */ /* distro headers. */
4104 rate->mpu = ETH_TOTAL_MIN;
4108 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4109 * attribute of the specified "type".
4111 * See tc_calc_cell_log() above for a description of "rtab"s. */
/* tc_put_rtab(): reserves TC_RTAB_SIZE bytes in 'msg' as an attribute of the
 * given 'type' and fills in every table entry.
 *
 * Entry 'i' is the tc_bytes_to_ticks() value, at rate->rate, for a packet of
 * (i + 1) << rate->cell_log bytes.  A packet size below rate->mpu is raised
 * to rate->mpu before the conversion. */
4113 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
4118 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4119 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4120 unsigned packet_size = (i + 1) << rate->cell_log;
4121 if (packet_size < rate->mpu) {
4122 packet_size = rate->mpu;
4124 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
4128 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
4129 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
4130 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
 * 0 may be passed; the computed minimum burst is then used.) */
/* tc_calc_buffer(): the minimum burst is tc_buffer_per_jiffy(Bps) plus 'mtu'.
 * The larger of 'burst_bytes' and that minimum is converted with
 * tc_bytes_to_ticks() at rate 'Bps', and the result is returned. */
4133 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
4135 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
4136 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
4139 /* Linux-only functions declared in netdev-linux.h */
4141 /* Returns a fd for an AF_INET socket or a negative errno value. */
/* netdev_linux_get_af_inet_sock(): calls netdev_linux_init() first.  If that
 * reports an error, the negated error is returned; otherwise the value of
 * 'af_inet_sock' is returned. */
4143 netdev_linux_get_af_inet_sock(void)
4145 int error = netdev_linux_init();
4146 return error ? -error : af_inet_sock;
4149 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4150 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
/* netdev_linux_ethtool_set_flag(): read-modify-write of the device's ethtool
 * flags, followed by a verification read.
 *
 *   1. ETHTOOL_GFLAGS fetches the current flags into 'evalue'.
 *   2. The 'flag' bits are cleared and, if 'enable' is true, set again; the
 *      result is written back with ETHTOOL_SFLAGS.
 *   3. ETHTOOL_GFLAGS is issued once more, and a rate-limited warning is
 *      logged if the flags read back differ from the value just written.
 *
 * 'flag_name' is used only in that warning. */
4152 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4153 const char *flag_name, bool enable)
4155 const char *netdev_name = netdev_get_name(netdev);
4156 struct ethtool_value evalue;
4160 memset(&evalue, 0, sizeof evalue);
4161 error = netdev_linux_do_ethtool(netdev_name,
4162 (struct ethtool_cmd *)&evalue,
4163 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* 'new_flags' remembers the value written so it can be compared against the
 * read-back below. */
4168 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4169 error = netdev_linux_do_ethtool(netdev_name,
4170 (struct ethtool_cmd *)&evalue,
4171 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
/* Read the flags back to check whether the change took effect. */
4176 memset(&evalue, 0, sizeof evalue);
4177 error = netdev_linux_do_ethtool(netdev_name,
4178 (struct ethtool_cmd *)&evalue,
4179 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4184 if (new_flags != evalue.data) {
4185 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4186 "device %s failed", enable ? "enable" : "disable",
4187 flag_name, netdev_name);
4194 /* Utility functions. */
4196 /* Copies 'src' into 'dst', performing format conversion in the process. */
/* netdev_stats_from_rtnl_link_stats(): assigns each of the 21 counters below
 * from 'src' to the member of the same name in 'dst'.  Because every counter
 * is assigned individually, any difference in member type between the two
 * structures is handled by the ordinary integer conversion of the
 * assignment. */
4198 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4199 const struct rtnl_link_stats *src)
4201 dst->rx_packets = src->rx_packets;
4202 dst->tx_packets = src->tx_packets;
4203 dst->rx_bytes = src->rx_bytes;
4204 dst->tx_bytes = src->tx_bytes;
4205 dst->rx_errors = src->rx_errors;
4206 dst->tx_errors = src->tx_errors;
4207 dst->rx_dropped = src->rx_dropped;
4208 dst->tx_dropped = src->tx_dropped;
4209 dst->multicast = src->multicast;
4210 dst->collisions = src->collisions;
4211 dst->rx_length_errors = src->rx_length_errors;
4212 dst->rx_over_errors = src->rx_over_errors;
4213 dst->rx_crc_errors = src->rx_crc_errors;
4214 dst->rx_frame_errors = src->rx_frame_errors;
4215 dst->rx_fifo_errors = src->rx_fifo_errors;
4216 dst->rx_missed_errors = src->rx_missed_errors;
4217 dst->tx_aborted_errors = src->tx_aborted_errors;
4218 dst->tx_carrier_errors = src->tx_carrier_errors;
4219 dst->tx_fifo_errors = src->tx_fifo_errors;
4220 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4221 dst->tx_window_errors = src->tx_window_errors;
/* get_stats_via_netlink(): obtains interface statistics for the device with
 * index 'ifindex' over the rtnetlink socket 'rtnl_sock'.
 *
 * An RTM_GETLINK request carrying a zeroed struct ifinfomsg (family
 * PF_UNSPEC, index 'ifindex') is sent with nl_sock_transact().  The reply is
 * parsed against 'rtnlgrp_link_policy', in which IFLA_IFNAME is mandatory and
 * IFLA_STATS is optional.  A reply without IFLA_STATS is logged with a
 * rate-limited warning; otherwise the attribute payload is converted into
 * '*stats' by netdev_stats_from_rtnl_link_stats().
 *
 * The request buffer is released right after the transaction, and the reply
 * is deleted on every path visible here. */
4225 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4227 /* Policy for RTNLGRP_LINK messages.
4229 * There are *many* more fields in these messages, but currently we only
4230 * care about these fields. */
4231 static const struct nl_policy rtnlgrp_link_policy[] = {
4232 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4233 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4234 .min_len = sizeof(struct rtnl_link_stats) },
4237 struct ofpbuf request;
4238 struct ofpbuf *reply;
4239 struct ifinfomsg *ifi;
4240 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
4243 ofpbuf_init(&request, 0);
4244 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4245 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4246 ifi->ifi_family = PF_UNSPEC;
4247 ifi->ifi_index = ifindex;
4248 error = nl_sock_transact(rtnl_sock, &request, &reply);
4249 ofpbuf_uninit(&request);
4254 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4255 rtnlgrp_link_policy,
4256 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4257 ofpbuf_delete(reply);
/* IFLA_STATS is optional in the policy, so its presence is checked here. */
4261 if (!attrs[IFLA_STATS]) {
4262 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4263 ofpbuf_delete(reply);
4267 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4269 ofpbuf_delete(reply);
/* get_stats_via_proc(): obtains statistics for 'netdev_name' by scanning
 * /proc/net/dev one line at a time with fgets().
 *
 * A line that does not yield the 15 expected conversions is reported as a
 * parse error.  For the line whose device name equals 'netdev_name', the
 * seven counters assigned at the end of the loop body are set to UINT64_MAX
 * -- presumably because /proc/net/dev does not report them; TODO confirm.
 *
 * Rate-limited warnings cover a failure to open the file, an unparsable line,
 * and the case where no statistics were found for the device. */
4275 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4277 static const char fn[] = "/proc/net/dev";
4282 stream = fopen(fn, "r");
4284 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4289 while (fgets(line, sizeof line, stream)) {
4292 #define X64 "%"SCNu64
4295 X64 X64 X64 X64 X64 X64 X64 "%*u"
4296 X64 X64 X64 X64 X64 X64 X64 "%*u",
4302 &stats->rx_fifo_errors,
4303 &stats->rx_frame_errors,
4309 &stats->tx_fifo_errors,
4311 &stats->tx_carrier_errors) != 15) {
4312 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4313 } else if (!strcmp(devname, netdev_name)) {
4314 stats->rx_length_errors = UINT64_MAX;
4315 stats->rx_over_errors = UINT64_MAX;
4316 stats->rx_crc_errors = UINT64_MAX;
4317 stats->rx_missed_errors = UINT64_MAX;
4318 stats->tx_aborted_errors = UINT64_MAX;
4319 stats->tx_heartbeat_errors = UINT64_MAX;
4320 stats->tx_window_errors = UINT64_MAX;
4326 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* get_flags(): issues SIOCGIFFLAGS for the device named dev->name through
 * netdev_linux_do_ioctl() and stores the resulting ifr_flags in '*flags'. */
4332 get_flags(const struct netdev_dev *dev, unsigned int *flags)
4338 error = netdev_linux_do_ioctl(dev->name, &ifr, SIOCGIFFLAGS,
4341 *flags = ifr.ifr_flags;
/* set_flags(): places 'flags' in ifr_flags and issues SIOCSIFFLAGS for
 * 'netdev' through netdev_linux_do_ioctl(), returning that call's result. */
4347 set_flags(struct netdev *netdev, unsigned int flags)
4351 ifr.ifr_flags = flags;
4352 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
/* do_get_ifindex(): looks up the interface index of 'netdev_name' with the
 * SIOCGIFINDEX ioctl on 'af_inet_sock' and returns ifr_ifindex.  Each call
 * increments the netdev_get_ifindex coverage counter.  A failed ioctl is
 * logged with a rate-limited warning that includes strerror(errno). */
4357 do_get_ifindex(const char *netdev_name)
4361 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4362 COVERAGE_INC(netdev_get_ifindex);
4363 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4364 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4365 netdev_name, strerror(errno));
4368 return ifr.ifr_ifindex;
/* get_ifindex(): cached front end to do_get_ifindex().
 *
 * When the VALID_IFINDEX bit is not set in netdev_dev->cache_valid, the index
 * is looked up with do_get_ifindex(), stored in netdev_dev->ifindex, and the
 * bit is set.  The cached value is then written to '*ifindexp'. */
4372 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4374 struct netdev_dev_linux *netdev_dev =
4375 netdev_dev_linux_cast(netdev_get_dev(netdev_));
4377 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4378 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4382 netdev_dev->cache_valid |= VALID_IFINDEX;
4383 netdev_dev->ifindex = ifindex;
4385 *ifindexp = netdev_dev->ifindex;
/* get_etheraddr(): reads the hardware address of 'netdev_name' with the
 * SIOCGIFHWADDR ioctl on 'af_inet_sock' and copies the first ETH_ADDR_LEN
 * bytes of the returned sa_data into 'ea'.  Each call increments the
 * netdev_get_hwaddr coverage counter.
 *
 * A failed ioctl is logged at INFO level when errno is ENODEV and at ERR
 * level otherwise.  An address family other than AF_UNSPEC or ARPHRD_ETHER
 * produces a warning; in the lines shown the address appears to be copied
 * anyway -- TODO confirm. */
4390 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4395 memset(&ifr, 0, sizeof ifr);
4396 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4397 COVERAGE_INC(netdev_get_hwaddr);
4398 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4399 /* ENODEV probably means that a vif disappeared asynchronously and
4400 * hasn't been removed from the database yet, so reduce the log level
4401 * to INFO for that case. */
4402 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4403 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4404 netdev_name, strerror(errno));
4407 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4408 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4409 VLOG_WARN("%s device has unknown hardware address family %d",
4410 netdev_name, hwaddr_family);
4412 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* set_etheraddr(): sets the hardware address of 'netdev_name' to 'mac'.
 *
 * A zeroed struct ifreq is filled with the device name, the address family
 * ARPHRD_ETHER, and the ETH_ADDR_LEN bytes of 'mac', then passed to the
 * SIOCSIFHWADDR ioctl on 'af_inet_sock'.  Each call increments the
 * netdev_set_hwaddr coverage counter.  A failed ioctl is logged at ERR
 * level with strerror(errno). */
4417 set_etheraddr(const char *netdev_name,
4418 const uint8_t mac[ETH_ADDR_LEN])
4422 memset(&ifr, 0, sizeof ifr);
4423 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4424 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4425 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4426 COVERAGE_INC(netdev_set_hwaddr);
4427 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4428 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4429 netdev_name, strerror(errno));
/* netdev_linux_do_ethtool(): runs an ethtool command on the device 'name'
 * with the SIOCETHTOOL ioctl on 'af_inet_sock'.
 *
 * The zeroed struct ifreq carries the device name in ifr_name and a pointer
 * to 'ecmd' in ifr_data.  Each call increments the netdev_ethtool coverage
 * counter.  A failure is logged with a rate-limited warning that includes
 * 'cmd_name', unless errno is EOPNOTSUPP, which is deliberately not logged.
 *
 * NOTE(review): the lines shown here do not use 'cmd'; presumably it is
 * stored in ecmd->cmd before the ioctl -- confirm. */
4436 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4437 int cmd, const char *cmd_name)
4441 memset(&ifr, 0, sizeof ifr);
4442 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4443 ifr.ifr_data = (caddr_t) ecmd;
4446 COVERAGE_INC(netdev_ethtool);
4447 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4450 if (errno != EOPNOTSUPP) {
4451 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4452 "failed: %s", cmd_name, name, strerror(errno));
4454 /* The device doesn't support this operation. That's pretty
4455 * common, so there's no point in logging anything. */
/* netdev_linux_do_ioctl(): copies 'name' into ifr->ifr_name and issues the
 * ioctl 'cmd' on 'af_inet_sock' with 'ifr' as its argument.  The other
 * members of '*ifr' are passed through as the caller set them.  When ioctl()
 * returns -1, a rate-limited debug message naming the device and 'cmd_name'
 * is logged. */
4462 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4463 const char *cmd_name)
4465 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4466 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4467 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
/* netdev_linux_get_ipv4(): retrieves an IPv4 address of 'netdev' with the
 * ioctl 'cmd' ('cmd_name' is its name, passed on for logging).
 *
 * The request's address family is set to AF_INET before the call to
 * netdev_linux_do_ioctl().  Afterwards ifr_addr is interpreted as a
 * struct sockaddr_in and its sin_addr is stored in '*ip'. */
4475 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4476 int cmd, const char *cmd_name)
4481 ifr.ifr_addr.sa_family = AF_INET;
4482 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4484 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4485 *ip = sin->sin_addr;
4490 /* Returns an AF_PACKET raw socket or a negative errno value. */
4492 af_packet_sock(void)
4494 static int sock = INT_MIN;
4496 if (sock == INT_MIN) {
4497 sock = socket(AF_PACKET, SOCK_RAW, 0);
4499 set_nonblocking(sock);
4502 VLOG_ERR("failed to create packet socket: %s", strerror(errno));