2 * Copyright (c) 2009, 2010, 2011, 2012 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
24 #include <arpa/inet.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <linux/version.h>
37 #include <sys/types.h>
38 #include <sys/ioctl.h>
39 #include <sys/socket.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
52 #include "dpif-linux.h"
53 #include "dynamic-string.h"
54 #include "fatal-signal.h"
57 #include "netdev-provider.h"
58 #include "netdev-vport.h"
60 #include "netlink-notifier.h"
61 #include "netlink-socket.h"
63 #include "openflow/openflow.h"
65 #include "poll-loop.h"
66 #include "rtnetlink-link.h"
67 #include "socket-util.h"
73 VLOG_DEFINE_THIS_MODULE(netdev_linux);
75 COVERAGE_DEFINE(netdev_set_policing);
76 COVERAGE_DEFINE(netdev_arp_lookup);
77 COVERAGE_DEFINE(netdev_get_ifindex);
78 COVERAGE_DEFINE(netdev_get_hwaddr);
79 COVERAGE_DEFINE(netdev_set_hwaddr);
80 COVERAGE_DEFINE(netdev_get_ethtool);
81 COVERAGE_DEFINE(netdev_set_ethtool);
/* These were introduced in Linux 2.6.14, so they might be missing if we have
 * old headers. */
86 #ifndef ADVERTISED_Pause
87 #define ADVERTISED_Pause (1 << 13)
89 #ifndef ADVERTISED_Asym_Pause
90 #define ADVERTISED_Asym_Pause (1 << 14)
93 /* These were introduced in Linux 2.6.24, so they might be missing if we
94 * have old headers. */
95 #ifndef ETHTOOL_GFLAGS
96 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
98 #ifndef ETHTOOL_SFLAGS
99 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
/* This was introduced in Linux 2.6.25, so it might be missing if we have old
 * headers. */
105 #define TC_RTAB_SIZE 1024
108 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
109 static int cache_notifier_refcount;
112 VALID_IFINDEX = 1 << 0,
113 VALID_ETHERADDR = 1 << 1,
117 VALID_POLICING = 1 << 5,
118 VALID_VPORT_STAT_ERROR = 1 << 6,
119 VALID_DRVINFO = 1 << 7,
120 VALID_FEATURES = 1 << 8,
128 /* Traffic control. */
/* An instance of a traffic control class.  Always associated with a particular
 * network device.
 *
 * Each TC implementation subclasses this with whatever additional data it
 * needs. */
136 const struct tc_ops *ops;
137 struct hmap queues; /* Contains "struct tc_queue"s.
138 * Read by generic TC layer.
139 * Written only by TC implementation. */
/* One traffic control queue.
 *
 * Each TC implementation subclasses this with whatever additional data it
 * needs. */
147 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
148 unsigned int queue_id; /* OpenFlow queue ID. */
151 /* A particular kind of traffic control. Each implementation generally maps to
152 * one particular Linux qdisc class.
154 * The functions below return 0 if successful or a positive errno value on
155 * failure, except where otherwise noted. All of them must be provided, except
156 * where otherwise noted. */
158 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
159 * This is null for tc_ops_default and tc_ops_other, for which there are no
160 * appropriate values. */
161 const char *linux_name;
163 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
164 const char *ovs_name;
166 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
167 * queues. The queues are numbered 0 through n_queues - 1. */
168 unsigned int n_queues;
170 /* Called to install this TC class on 'netdev'. The implementation should
171 * make the Netlink calls required to set up 'netdev' with the right qdisc
172 * and configure it according to 'details'. The implementation may assume
173 * that the current qdisc is the default; that is, there is no need for it
174 * to delete the current qdisc before installing itself.
176 * The contents of 'details' should be documented as valid for 'ovs_name'
177 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
178 * (which is built as ovs-vswitchd.conf.db(8)).
180 * This function must return 0 if and only if it sets 'netdev->tc' to an
181 * initialized 'struct tc'.
183 * (This function is null for tc_ops_other, which cannot be installed. For
184 * other TC classes it should always be nonnull.) */
185 int (*tc_install)(struct netdev *netdev, const struct smap *details);
187 /* Called when the netdev code determines (through a Netlink query) that
188 * this TC class's qdisc is installed on 'netdev', but we didn't install
189 * it ourselves and so don't know any of the details.
191 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
192 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
193 * implementation should parse the other attributes of 'nlmsg' as
 * necessary to determine its configuration.  If necessary it should also
 * use Netlink queries to determine the configuration of queues on
 * 'netdev'.
 *
198 * This function must return 0 if and only if it sets 'netdev->tc' to an
199 * initialized 'struct tc'. */
200 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
/* Destroys the data structures allocated by the implementation as part of
 * 'tc'.  (This includes destroying 'tc->queues' by calling
 * tc_destroy(tc).)
 *
206 * The implementation should not need to perform any Netlink calls. If
207 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
208 * (But it may not be desirable.)
210 * This function may be null if 'tc' is trivial. */
211 void (*tc_destroy)(struct tc *tc);
213 /* Retrieves details of 'netdev->tc' configuration into 'details'.
215 * The implementation should not need to perform any Netlink calls, because
216 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
217 * cached the configuration.
219 * The contents of 'details' should be documented as valid for 'ovs_name'
220 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
221 * (which is built as ovs-vswitchd.conf.db(8)).
 * This function may be null if 'tc' is not configurable. */
225 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
227 /* Reconfigures 'netdev->tc' according to 'details', performing any
228 * required Netlink calls to complete the reconfiguration.
230 * The contents of 'details' should be documented as valid for 'ovs_name'
231 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
232 * (which is built as ovs-vswitchd.conf.db(8)).
 * This function may be null if 'tc' is not configurable. */
236 int (*qdisc_set)(struct netdev *, const struct smap *details);
238 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
239 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
241 * The contents of 'details' should be documented as valid for 'ovs_name'
242 * in the "other_config" column in the "Queue" table in
243 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
245 * The implementation should not need to perform any Netlink calls, because
246 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
247 * cached the queue configuration.
 * This function may be null if 'tc' does not have queues ('n_queues' is
 * 0). */
251 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
252 struct smap *details);
/* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
 * 'details', performing any required Netlink calls to complete the
 * reconfiguration.  The caller ensures that 'queue_id' is less than
 * 'n_queues'.
 *
259 * The contents of 'details' should be documented as valid for 'ovs_name'
260 * in the "other_config" column in the "Queue" table in
261 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
263 * This function may be null if 'tc' does not have queues or its queues are
264 * not configurable. */
265 int (*class_set)(struct netdev *, unsigned int queue_id,
266 const struct smap *details);
268 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
269 * tc_queue's within 'netdev->tc->queues'.
271 * This function may be null if 'tc' does not have queues or its queues
272 * cannot be deleted. */
273 int (*class_delete)(struct netdev *, struct tc_queue *queue);
275 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
276 * 'struct tc_queue's within 'netdev->tc->queues'.
278 * On success, initializes '*stats'.
280 * This function may be null if 'tc' does not have queues or if it cannot
281 * report queue statistics. */
282 int (*class_get_stats)(const struct netdev *netdev,
283 const struct tc_queue *queue,
284 struct netdev_queue_stats *stats);
286 /* Extracts queue stats from 'nlmsg', which is a response to a
287 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
289 * This function may be null if 'tc' does not have queues or if it cannot
290 * report queue statistics. */
291 int (*class_dump_stats)(const struct netdev *netdev,
292 const struct ofpbuf *nlmsg,
293 netdev_dump_queue_stats_cb *cb, void *aux);
297 tc_init(struct tc *tc, const struct tc_ops *ops)
300 hmap_init(&tc->queues);
304 tc_destroy(struct tc *tc)
306 hmap_destroy(&tc->queues);
309 static const struct tc_ops tc_ops_htb;
310 static const struct tc_ops tc_ops_hfsc;
311 static const struct tc_ops tc_ops_default;
312 static const struct tc_ops tc_ops_other;
314 static const struct tc_ops *tcs[] = {
315 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
316 &tc_ops_hfsc, /* Hierarchical fair service curve. */
317 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
318 &tc_ops_other, /* Some other qdisc. */
322 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
323 static unsigned int tc_get_major(unsigned int handle);
324 static unsigned int tc_get_minor(unsigned int handle);
326 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
327 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
328 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
330 static struct tcmsg *tc_make_request(const struct netdev *, int type,
331 unsigned int flags, struct ofpbuf *);
332 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
333 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
334 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
337 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
338 struct nlattr **options);
339 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
340 struct nlattr **options,
341 struct netdev_queue_stats *);
342 static int tc_query_class(const struct netdev *,
343 unsigned int handle, unsigned int parent,
344 struct ofpbuf **replyp);
345 static int tc_delete_class(const struct netdev *, unsigned int handle);
347 static int tc_del_qdisc(struct netdev *netdev);
348 static int tc_query_qdisc(const struct netdev *netdev);
350 static int tc_calc_cell_log(unsigned int mtu);
351 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
352 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
353 const struct tc_ratespec *rate);
354 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
356 struct netdev_dev_linux {
357 struct netdev_dev netdev_dev;
359 struct shash_node *shash_node;
360 unsigned int cache_valid;
361 unsigned int change_seq;
363 bool miimon; /* Link status of last poll. */
364 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
365 struct timer miimon_timer;
367 /* The following are figured out "on demand" only. They are only valid
368 * when the corresponding VALID_* bit in 'cache_valid' is set. */
370 uint8_t etheraddr[ETH_ADDR_LEN];
371 struct in_addr address, netmask;
374 unsigned int ifi_flags;
375 long long int carrier_resets;
376 uint32_t kbits_rate; /* Policing data. */
377 uint32_t kbits_burst;
378 int vport_stats_error; /* Cached error code from vport_get_stats().
379 0 or an errno value. */
380 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
381 int ether_addr_error; /* Cached error code from set/get etheraddr. */
382 int netdev_policing_error; /* Cached error code from set policing. */
383 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
384 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
386 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
387 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
388 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
389 enum netdev_features peer; /* Cached from ETHTOOL_GSET. */
391 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
395 struct tap_state tap;
399 struct netdev_linux {
400 struct netdev netdev;
404 /* Sockets used for ioctl operations. */
405 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
407 /* A Netlink routing socket that is not subscribed to any multicast groups. */
408 static struct nl_sock *rtnl_sock;
410 /* This is set pretty low because we probably won't learn anything from the
411 * additional log messages. */
412 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
414 static int netdev_linux_init(void);
416 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
417 int cmd, const char *cmd_name);
418 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
419 const char *cmd_name);
420 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
421 int cmd, const char *cmd_name);
422 static int get_flags(const struct netdev_dev *, unsigned int *flags);
423 static int set_flags(struct netdev *, unsigned int flags);
424 static int do_get_ifindex(const char *netdev_name);
425 static int get_ifindex(const struct netdev *, int *ifindexp);
426 static int do_set_addr(struct netdev *netdev,
427 int ioctl_nr, const char *ioctl_name,
428 struct in_addr addr);
429 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
430 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
431 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
432 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
433 static int af_packet_sock(void);
434 static void netdev_linux_miimon_run(void);
435 static void netdev_linux_miimon_wait(void);
438 is_netdev_linux_class(const struct netdev_class *netdev_class)
440 return netdev_class->init == netdev_linux_init;
443 static struct netdev_dev_linux *
444 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
446 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
447 assert(is_netdev_linux_class(netdev_class));
449 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
452 static struct netdev_linux *
453 netdev_linux_cast(const struct netdev *netdev)
455 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
456 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
457 assert(is_netdev_linux_class(netdev_class));
459 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
463 netdev_linux_init(void)
465 static int status = -1;
467 /* Create AF_INET socket. */
468 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
469 status = af_inet_sock >= 0 ? 0 : errno;
471 VLOG_ERR("failed to create inet socket: %s", strerror(status));
474 /* Create rtnetlink socket. */
476 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
478 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
487 netdev_linux_run(void)
489 rtnetlink_link_run();
490 netdev_linux_miimon_run();
494 netdev_linux_wait(void)
496 rtnetlink_link_wait();
497 netdev_linux_miimon_wait();
501 netdev_linux_get_drvinfo(struct netdev_dev_linux *netdev_dev)
506 if (netdev_dev->cache_valid & VALID_DRVINFO) {
510 COVERAGE_INC(netdev_get_ethtool);
511 memset(&netdev_dev->drvinfo, 0, sizeof netdev_dev->drvinfo);
512 error = netdev_linux_do_ethtool(netdev_dev->netdev_dev.name,
513 (struct ethtool_cmd *)&netdev_dev->drvinfo,
517 netdev_dev->cache_valid |= VALID_DRVINFO;
523 netdev_dev_linux_changed(struct netdev_dev_linux *dev,
524 unsigned int ifi_flags,
528 if (!dev->change_seq) {
532 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
533 dev->carrier_resets++;
535 dev->ifi_flags = ifi_flags;
537 dev->cache_valid &= mask;
541 netdev_dev_linux_update(struct netdev_dev_linux *dev,
542 const struct rtnetlink_link_change *change)
544 if (change->nlmsg_type == RTM_NEWLINK) {
546 netdev_dev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
548 /* Update netdev from rtnl-change msg. */
550 dev->mtu = change->mtu;
551 dev->cache_valid |= VALID_MTU;
552 dev->netdev_mtu_error = 0;
555 if (!eth_addr_is_zero(change->addr)) {
556 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
557 dev->cache_valid |= VALID_ETHERADDR;
558 dev->ether_addr_error = 0;
561 dev->ifindex = change->ifi_index;
562 dev->cache_valid |= VALID_IFINDEX;
563 dev->get_ifindex_error = 0;
566 netdev_dev_linux_changed(dev, change->ifi_flags, 0);
571 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
572 void *aux OVS_UNUSED)
574 struct netdev_dev_linux *dev;
576 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
578 const struct netdev_class *netdev_class =
579 netdev_dev_get_class(base_dev);
581 if (is_netdev_linux_class(netdev_class)) {
582 dev = netdev_dev_linux_cast(base_dev);
583 netdev_dev_linux_update(dev, change);
587 struct shash device_shash;
588 struct shash_node *node;
590 shash_init(&device_shash);
591 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
592 SHASH_FOR_EACH (node, &device_shash) {
597 get_flags(&dev->netdev_dev, &flags);
598 netdev_dev_linux_changed(dev, flags, 0);
600 shash_destroy(&device_shash);
605 cache_notifier_ref(void)
607 if (!cache_notifier_refcount) {
608 assert(!netdev_linux_cache_notifier);
610 netdev_linux_cache_notifier =
611 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
613 if (!netdev_linux_cache_notifier) {
617 cache_notifier_refcount++;
623 cache_notifier_unref(void)
625 assert(cache_notifier_refcount > 0);
626 if (!--cache_notifier_refcount) {
627 assert(netdev_linux_cache_notifier);
628 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
629 netdev_linux_cache_notifier = NULL;
633 /* Creates system and internal devices. */
635 netdev_linux_create(const struct netdev_class *class, const char *name,
636 struct netdev_dev **netdev_devp)
638 struct netdev_dev_linux *netdev_dev;
641 error = cache_notifier_ref();
646 netdev_dev = xzalloc(sizeof *netdev_dev);
647 netdev_dev->change_seq = 1;
648 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
649 get_flags(&netdev_dev->netdev_dev, &netdev_dev->ifi_flags);
651 *netdev_devp = &netdev_dev->netdev_dev;
655 /* For most types of netdevs we open the device for each call of
656 * netdev_open(). However, this is not the case with tap devices,
657 * since it is only possible to open the device once. In this
658 * situation we share a single file descriptor, and consequently
659 * buffers, across all readers. Therefore once data is read it will
660 * be unavailable to other reads for tap devices. */
662 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
663 const char *name, struct netdev_dev **netdev_devp)
665 struct netdev_dev_linux *netdev_dev;
666 struct tap_state *state;
667 static const char tap_dev[] = "/dev/net/tun";
671 netdev_dev = xzalloc(sizeof *netdev_dev);
672 state = &netdev_dev->state.tap;
674 error = cache_notifier_ref();
679 /* Open tap device. */
680 state->fd = open(tap_dev, O_RDWR);
683 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
684 goto error_unref_notifier;
687 /* Create tap device. */
688 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
689 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
690 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
691 VLOG_WARN("%s: creating tap device failed: %s", name,
694 goto error_unref_notifier;
697 /* Make non-blocking. */
698 error = set_nonblocking(state->fd);
700 goto error_unref_notifier;
703 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
704 *netdev_devp = &netdev_dev->netdev_dev;
707 error_unref_notifier:
708 cache_notifier_unref();
715 destroy_tap(struct netdev_dev_linux *netdev_dev)
717 struct tap_state *state = &netdev_dev->state.tap;
719 if (state->fd >= 0) {
724 /* Destroys the netdev device 'netdev_dev_'. */
726 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
728 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
729 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
731 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
732 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
735 if (class == &netdev_tap_class) {
736 destroy_tap(netdev_dev);
740 cache_notifier_unref();
744 netdev_linux_open(struct netdev_dev *netdev_dev_, struct netdev **netdevp)
746 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
747 struct netdev_linux *netdev;
748 enum netdev_flags flags;
751 /* Allocate network device. */
752 netdev = xzalloc(sizeof *netdev);
754 netdev_init(&netdev->netdev, netdev_dev_);
756 /* Verify that the device really exists, by attempting to read its flags.
757 * (The flags might be cached, in which case this won't actually do an
760 * Don't do this for "internal" netdevs, though, because those have to be
761 * created as netdev objects before they exist in the kernel, because
762 * creating them in the kernel happens by passing a netdev object to
763 * dpif_port_add(). */
764 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
765 error = netdev_get_flags(&netdev->netdev, &flags);
766 if (error == ENODEV) {
771 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
772 !netdev_dev->state.tap.opened) {
774 /* We assume that the first user of the tap device is the primary user
775 * and give them the tap FD. Subsequent users probably just expect
776 * this to be a system device so open it normally to avoid send/receive
777 * directions appearing to be reversed. */
778 netdev->fd = netdev_dev->state.tap.fd;
779 netdev_dev->state.tap.opened = true;
782 *netdevp = &netdev->netdev;
786 netdev_uninit(&netdev->netdev, true);
790 /* Closes and destroys 'netdev'. */
792 netdev_linux_close(struct netdev *netdev_)
794 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
796 if (netdev->fd > 0 && strcmp(netdev_get_type(netdev_), "tap")) {
803 netdev_linux_listen(struct netdev *netdev_)
805 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
806 struct sockaddr_ll sll;
811 if (netdev->fd >= 0) {
815 /* Create file descriptor. */
816 fd = socket(PF_PACKET, SOCK_RAW, 0);
819 VLOG_ERR("failed to create raw socket (%s)", strerror(error));
823 /* Set non-blocking mode. */
824 error = set_nonblocking(fd);
829 /* Get ethernet device index. */
830 error = get_ifindex(&netdev->netdev, &ifindex);
835 /* Bind to specific ethernet device. */
836 memset(&sll, 0, sizeof sll);
837 sll.sll_family = AF_PACKET;
838 sll.sll_ifindex = ifindex;
839 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
840 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
842 VLOG_ERR("%s: failed to bind raw socket (%s)",
843 netdev_get_name(netdev_), strerror(error));
858 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
860 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
862 if (netdev->fd < 0) {
863 /* Device is not listening. */
870 retval = (netdev_->netdev_dev->netdev_class == &netdev_tap_class
871 ? read(netdev->fd, data, size)
872 : recv(netdev->fd, data, size, MSG_TRUNC));
874 return retval <= size ? retval : -EMSGSIZE;
875 } else if (errno != EINTR) {
876 if (errno != EAGAIN) {
877 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
878 strerror(errno), netdev_get_name(netdev_));
885 /* Registers with the poll loop to wake up from the next call to poll_block()
886 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
888 netdev_linux_recv_wait(struct netdev *netdev_)
890 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
891 if (netdev->fd >= 0) {
892 poll_fd_wait(netdev->fd, POLLIN);
896 /* Discards all packets waiting to be received from 'netdev'. */
898 netdev_linux_drain(struct netdev *netdev_)
900 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
901 if (netdev->fd < 0) {
903 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
905 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
906 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
910 drain_fd(netdev->fd, ifr.ifr_qlen);
913 return drain_rcvbuf(netdev->fd);
917 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
918 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
919 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
920 * the packet is too big or too small to transmit on the device.
922 * The caller retains ownership of 'buffer' in all cases.
924 * The kernel maintains a packet transmission queue, so the caller is not
925 * expected to do additional queuing of packets. */
927 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
929 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
933 if (netdev->fd < 0) {
934 /* Use our AF_PACKET socket to send to this device. */
935 struct sockaddr_ll sll;
942 sock = af_packet_sock();
947 error = get_ifindex(netdev_, &ifindex);
952 /* We don't bother setting most fields in sockaddr_ll because the
953 * kernel ignores them for SOCK_RAW. */
954 memset(&sll, 0, sizeof sll);
955 sll.sll_family = AF_PACKET;
956 sll.sll_ifindex = ifindex;
958 iov.iov_base = CONST_CAST(void *, data);
962 msg.msg_namelen = sizeof sll;
965 msg.msg_control = NULL;
966 msg.msg_controllen = 0;
969 retval = sendmsg(sock, &msg, 0);
971 /* Use the netdev's own fd to send to this device. This is
972 * essential for tap devices, because packets sent to a tap device
973 * with an AF_PACKET socket will loop back to be *received* again
974 * on the tap device. */
975 retval = write(netdev->fd, data, size);
979 /* The Linux AF_PACKET implementation never blocks waiting for room
980 * for packets, instead returning ENOBUFS. Translate this into
981 * EAGAIN for the caller. */
982 if (errno == ENOBUFS) {
984 } else if (errno == EINTR) {
986 } else if (errno != EAGAIN) {
987 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
988 netdev_get_name(netdev_), strerror(errno));
991 } else if (retval != size) {
992 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
993 "%zu) on %s", retval, size, netdev_get_name(netdev_));
1001 /* Registers with the poll loop to wake up from the next call to poll_block()
1002 * when the packet transmission queue has sufficient room to transmit a packet
1003 * with netdev_send().
1005 * The kernel maintains a packet transmission queue, so the client is not
1006 * expected to do additional queuing of packets. Thus, this function is
1007 * unlikely to ever be used. It is included for completeness. */
1009 netdev_linux_send_wait(struct netdev *netdev_)
1011 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1012 if (netdev->fd < 0) {
1013 /* Nothing to do. */
1014 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
1015 poll_fd_wait(netdev->fd, POLLOUT);
1017 /* TAP device always accepts packets.*/
1018 poll_immediate_wake();
1022 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1023 * otherwise a positive errno value. */
1025 netdev_linux_set_etheraddr(struct netdev *netdev_,
1026 const uint8_t mac[ETH_ADDR_LEN])
1028 struct netdev_dev_linux *netdev_dev =
1029 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1031 bool up_again = false;
1033 if (netdev_dev->cache_valid & VALID_ETHERADDR) {
1034 if (netdev_dev->ether_addr_error) {
1035 return netdev_dev->ether_addr_error;
1037 if (eth_addr_equals(netdev_dev->etheraddr, mac)) {
1040 netdev_dev->cache_valid &= ~VALID_ETHERADDR;
1043 /* Tap devices must be brought down before setting the address. */
1044 if (!strcmp(netdev_get_type(netdev_), "tap")) {
1045 enum netdev_flags flags;
1047 if (!netdev_get_flags(netdev_, &flags) && (flags & NETDEV_UP)) {
1048 netdev_turn_flags_off(netdev_, NETDEV_UP, false);
1052 error = set_etheraddr(netdev_get_name(netdev_), mac);
1053 if (!error || error == ENODEV) {
1054 netdev_dev->ether_addr_error = error;
1055 netdev_dev->cache_valid |= VALID_ETHERADDR;
1057 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
1062 netdev_turn_flags_on(netdev_, NETDEV_UP, false);
1068 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1070 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1071 uint8_t mac[ETH_ADDR_LEN])
1073 struct netdev_dev_linux *netdev_dev =
1074 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1076 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
1077 int error = get_etheraddr(netdev_get_name(netdev_),
1078 netdev_dev->etheraddr);
1080 netdev_dev->ether_addr_error = error;
1081 netdev_dev->cache_valid |= VALID_ETHERADDR;
1084 if (!netdev_dev->ether_addr_error) {
1085 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
1088 return netdev_dev->ether_addr_error;
1091 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1092 * in bytes, not including the hardware header; thus, this is typically 1500
1093 * bytes for Ethernet devices. */
1095 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1097 struct netdev_dev_linux *netdev_dev =
1098 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1099 if (!(netdev_dev->cache_valid & VALID_MTU)) {
1103 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1104 SIOCGIFMTU, "SIOCGIFMTU");
1106 netdev_dev->netdev_mtu_error = error;
1107 netdev_dev->mtu = ifr.ifr_mtu;
1108 netdev_dev->cache_valid |= VALID_MTU;
1111 if (!netdev_dev->netdev_mtu_error) {
1112 *mtup = netdev_dev->mtu;
1114 return netdev_dev->netdev_mtu_error;
/* Sets the maximum transmission unit (MTU) of the given device to 'mtu' using
 * the Linux networking ioctl interface (SIOCSIFMTU). */
1121 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1123 struct netdev_dev_linux *netdev_dev =
1124 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1128 if (netdev_dev->cache_valid & VALID_MTU) {
1129 if (netdev_dev->netdev_mtu_error) {
1130 return netdev_dev->netdev_mtu_error;
1132 if (netdev_dev->mtu == mtu) {
1135 netdev_dev->cache_valid &= ~VALID_MTU;
1138 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1139 SIOCSIFMTU, "SIOCSIFMTU");
1140 if (!error || error == ENODEV) {
1141 netdev_dev->netdev_mtu_error = error;
1142 netdev_dev->mtu = ifr.ifr_mtu;
1143 netdev_dev->cache_valid |= VALID_MTU;
1148 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1149 * On failure, returns a negative errno value. */
1151 netdev_linux_get_ifindex(const struct netdev *netdev)
1155 error = get_ifindex(netdev, &ifindex);
1156 return error ? -error : ifindex;
1160 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1162 struct netdev_dev_linux *netdev_dev =
1163 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1165 if (netdev_dev->miimon_interval > 0) {
1166 *carrier = netdev_dev->miimon;
1168 *carrier = (netdev_dev->ifi_flags & IFF_RUNNING) != 0;
1174 static long long int
1175 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1177 return netdev_dev_linux_cast(netdev_get_dev(netdev))->carrier_resets;
1181 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1182 struct mii_ioctl_data *data)
1187 memset(&ifr, 0, sizeof ifr);
1188 memcpy(&ifr.ifr_data, data, sizeof *data);
1189 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1190 memcpy(data, &ifr.ifr_data, sizeof *data);
1196 netdev_linux_get_miimon(const char *name, bool *miimon)
1198 struct mii_ioctl_data data;
1203 memset(&data, 0, sizeof data);
1204 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1206 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1207 data.reg_num = MII_BMSR;
1208 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1212 *miimon = !!(data.val_out & BMSR_LSTATUS);
1214 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1217 struct ethtool_cmd ecmd;
1219 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1222 COVERAGE_INC(netdev_get_ethtool);
1223 memset(&ecmd, 0, sizeof ecmd);
1224 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1227 struct ethtool_value eval;
1229 memcpy(&eval, &ecmd, sizeof eval);
1230 *miimon = !!eval.data;
1232 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1240 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1241 long long int interval)
1243 struct netdev_dev_linux *netdev_dev;
1245 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
1247 interval = interval > 0 ? MAX(interval, 100) : 0;
1248 if (netdev_dev->miimon_interval != interval) {
1249 netdev_dev->miimon_interval = interval;
1250 timer_set_expired(&netdev_dev->miimon_timer);
1257 netdev_linux_miimon_run(void)
1259 struct shash device_shash;
1260 struct shash_node *node;
1262 shash_init(&device_shash);
1263 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1264 SHASH_FOR_EACH (node, &device_shash) {
1265 struct netdev_dev_linux *dev = node->data;
1268 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1272 netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
1273 if (miimon != dev->miimon) {
1274 dev->miimon = miimon;
1275 netdev_dev_linux_changed(dev, dev->ifi_flags, 0);
1278 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1281 shash_destroy(&device_shash);
1285 netdev_linux_miimon_wait(void)
1287 struct shash device_shash;
1288 struct shash_node *node;
1290 shash_init(&device_shash);
1291 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1292 SHASH_FOR_EACH (node, &device_shash) {
1293 struct netdev_dev_linux *dev = node->data;
1295 if (dev->miimon_interval > 0) {
1296 timer_wait(&dev->miimon_timer);
1299 shash_destroy(&device_shash);
1302 /* Check whether we can we use RTM_GETLINK to get network device statistics.
1303 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1306 check_for_working_netlink_stats(void)
1308 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1309 * preferable, so if that works, we'll use it. */
1310 int ifindex = do_get_ifindex("lo");
1312 VLOG_WARN("failed to get ifindex for lo, "
1313 "obtaining netdev stats from proc");
1316 struct netdev_stats stats;
1317 int error = get_stats_via_netlink(ifindex, &stats);
1319 VLOG_DBG("obtaining netdev stats via rtnetlink");
1322 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1323 "via proc (you are probably running a pre-2.6.19 "
1324 "kernel)", strerror(error));
/* Exchanges the values stored in '*a' and '*b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t saved_a = *a;

    *a = *b;
    *b = saved_a;
}
1339 get_stats_via_vport(const struct netdev *netdev_,
1340 struct netdev_stats *stats)
1342 struct netdev_dev_linux *netdev_dev =
1343 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1345 if (!netdev_dev->vport_stats_error ||
1346 !(netdev_dev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1349 error = netdev_vport_get_stats(netdev_, stats);
1350 if (error && error != ENOENT) {
1351 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1352 "(%s)", netdev_get_name(netdev_), strerror(error));
1354 netdev_dev->vport_stats_error = error;
1355 netdev_dev->cache_valid |= VALID_VPORT_STAT_ERROR;
1360 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1361 struct netdev_stats *stats)
1363 static int use_netlink_stats = -1;
1366 if (use_netlink_stats < 0) {
1367 use_netlink_stats = check_for_working_netlink_stats();
1370 if (use_netlink_stats) {
1373 error = get_ifindex(netdev_, &ifindex);
1375 error = get_stats_via_netlink(ifindex, stats);
1378 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1382 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1383 netdev_get_name(netdev_), error);
1389 /* Retrieves current device stats for 'netdev-linux'. */
1391 netdev_linux_get_stats(const struct netdev *netdev_,
1392 struct netdev_stats *stats)
1394 struct netdev_dev_linux *netdev_dev =
1395 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1396 struct netdev_stats dev_stats;
1399 get_stats_via_vport(netdev_, stats);
1401 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1404 if (netdev_dev->vport_stats_error) {
1411 if (netdev_dev->vport_stats_error) {
1412 /* stats not available from OVS then use ioctl stats. */
1415 stats->rx_errors += dev_stats.rx_errors;
1416 stats->tx_errors += dev_stats.tx_errors;
1417 stats->rx_dropped += dev_stats.rx_dropped;
1418 stats->tx_dropped += dev_stats.tx_dropped;
1419 stats->multicast += dev_stats.multicast;
1420 stats->collisions += dev_stats.collisions;
1421 stats->rx_length_errors += dev_stats.rx_length_errors;
1422 stats->rx_over_errors += dev_stats.rx_over_errors;
1423 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1424 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1425 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1426 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1427 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1428 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1429 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1430 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1431 stats->tx_window_errors += dev_stats.tx_window_errors;
1436 /* Retrieves current device stats for 'netdev-tap' netdev or
1437 * netdev-internal. */
1439 netdev_tap_get_stats(const struct netdev *netdev_,
1440 struct netdev_stats *stats)
1442 struct netdev_dev_linux *netdev_dev =
1443 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1444 struct netdev_stats dev_stats;
1447 get_stats_via_vport(netdev_, stats);
1449 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1451 if (netdev_dev->vport_stats_error) {
1458 /* If this port is an internal port then the transmit and receive stats
1459 * will appear to be swapped relative to the other ports since we are the
1460 * one sending the data, not a remote computer. For consistency, we swap
1461 * them back here. This does not apply if we are getting stats from the
1462 * vport layer because it always tracks stats from the perspective of the
1464 if (netdev_dev->vport_stats_error) {
1466 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1467 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1468 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1469 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1470 stats->rx_length_errors = 0;
1471 stats->rx_over_errors = 0;
1472 stats->rx_crc_errors = 0;
1473 stats->rx_frame_errors = 0;
1474 stats->rx_fifo_errors = 0;
1475 stats->rx_missed_errors = 0;
1476 stats->tx_aborted_errors = 0;
1477 stats->tx_carrier_errors = 0;
1478 stats->tx_fifo_errors = 0;
1479 stats->tx_heartbeat_errors = 0;
1480 stats->tx_window_errors = 0;
1482 stats->rx_dropped += dev_stats.tx_dropped;
1483 stats->tx_dropped += dev_stats.rx_dropped;
1485 stats->rx_errors += dev_stats.tx_errors;
1486 stats->tx_errors += dev_stats.rx_errors;
1488 stats->multicast += dev_stats.multicast;
1489 stats->collisions += dev_stats.collisions;
1495 netdev_internal_get_stats(const struct netdev *netdev_,
1496 struct netdev_stats *stats)
1498 struct netdev_dev_linux *netdev_dev =
1499 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1501 get_stats_via_vport(netdev_, stats);
1502 return netdev_dev->vport_stats_error;
1506 netdev_linux_read_features(struct netdev_dev_linux *netdev_dev)
1508 struct ethtool_cmd ecmd;
1512 if (netdev_dev->cache_valid & VALID_FEATURES) {
1516 COVERAGE_INC(netdev_get_ethtool);
1517 memset(&ecmd, 0, sizeof ecmd);
1518 error = netdev_linux_do_ethtool(netdev_dev->netdev_dev.name, &ecmd,
1519 ETHTOOL_GSET, "ETHTOOL_GSET");
1524 /* Supported features. */
1525 netdev_dev->supported = 0;
1526 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1527 netdev_dev->supported |= NETDEV_F_10MB_HD;
1529 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1530 netdev_dev->supported |= NETDEV_F_10MB_FD;
1532 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1533 netdev_dev->supported |= NETDEV_F_100MB_HD;
1535 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1536 netdev_dev->supported |= NETDEV_F_100MB_FD;
1538 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1539 netdev_dev->supported |= NETDEV_F_1GB_HD;
1541 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1542 netdev_dev->supported |= NETDEV_F_1GB_FD;
1544 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1545 netdev_dev->supported |= NETDEV_F_10GB_FD;
1547 if (ecmd.supported & SUPPORTED_TP) {
1548 netdev_dev->supported |= NETDEV_F_COPPER;
1550 if (ecmd.supported & SUPPORTED_FIBRE) {
1551 netdev_dev->supported |= NETDEV_F_FIBER;
1553 if (ecmd.supported & SUPPORTED_Autoneg) {
1554 netdev_dev->supported |= NETDEV_F_AUTONEG;
1556 if (ecmd.supported & SUPPORTED_Pause) {
1557 netdev_dev->supported |= NETDEV_F_PAUSE;
1559 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1560 netdev_dev->supported |= NETDEV_F_PAUSE_ASYM;
1563 /* Advertised features. */
1564 netdev_dev->advertised = 0;
1565 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1566 netdev_dev->advertised |= NETDEV_F_10MB_HD;
1568 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1569 netdev_dev->advertised |= NETDEV_F_10MB_FD;
1571 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1572 netdev_dev->advertised |= NETDEV_F_100MB_HD;
1574 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1575 netdev_dev->advertised |= NETDEV_F_100MB_FD;
1577 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1578 netdev_dev->advertised |= NETDEV_F_1GB_HD;
1580 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1581 netdev_dev->advertised |= NETDEV_F_1GB_FD;
1583 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1584 netdev_dev->advertised |= NETDEV_F_10GB_FD;
1586 if (ecmd.advertising & ADVERTISED_TP) {
1587 netdev_dev->advertised |= NETDEV_F_COPPER;
1589 if (ecmd.advertising & ADVERTISED_FIBRE) {
1590 netdev_dev->advertised |= NETDEV_F_FIBER;
1592 if (ecmd.advertising & ADVERTISED_Autoneg) {
1593 netdev_dev->advertised |= NETDEV_F_AUTONEG;
1595 if (ecmd.advertising & ADVERTISED_Pause) {
1596 netdev_dev->advertised |= NETDEV_F_PAUSE;
1598 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1599 netdev_dev->advertised |= NETDEV_F_PAUSE_ASYM;
1602 /* Current settings. */
1604 if (speed == SPEED_10) {
1605 netdev_dev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1606 } else if (speed == SPEED_100) {
1607 netdev_dev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1608 } else if (speed == SPEED_1000) {
1609 netdev_dev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1610 } else if (speed == SPEED_10000) {
1611 netdev_dev->current = NETDEV_F_10GB_FD;
1612 } else if (speed == 40000) {
1613 netdev_dev->current = NETDEV_F_40GB_FD;
1614 } else if (speed == 100000) {
1615 netdev_dev->current = NETDEV_F_100GB_FD;
1616 } else if (speed == 1000000) {
1617 netdev_dev->current = NETDEV_F_1TB_FD;
1619 netdev_dev->current = 0;
1622 if (ecmd.port == PORT_TP) {
1623 netdev_dev->current |= NETDEV_F_COPPER;
1624 } else if (ecmd.port == PORT_FIBRE) {
1625 netdev_dev->current |= NETDEV_F_FIBER;
1629 netdev_dev->current |= NETDEV_F_AUTONEG;
1632 /* Peer advertisements. */
1633 netdev_dev->peer = 0; /* XXX */
1636 netdev_dev->cache_valid |= VALID_FEATURES;
1637 netdev_dev->get_features_error = error;
1640 /* Stores the features supported by 'netdev' into each of '*current',
1641 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1642 * bitmap of NETDEV_* bits. Returns 0 if successful, otherwise a positive
1645 netdev_linux_get_features(const struct netdev *netdev_,
1646 enum netdev_features *current,
1647 enum netdev_features *advertised,
1648 enum netdev_features *supported,
1649 enum netdev_features *peer)
1651 struct netdev_dev_linux *netdev_dev =
1652 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1654 netdev_linux_read_features(netdev_dev);
1656 if (!netdev_dev->get_features_error) {
1657 *current = netdev_dev->current;
1658 *advertised = netdev_dev->advertised;
1659 *supported = netdev_dev->supported;
1660 *peer = netdev_dev->peer;
1662 return netdev_dev->get_features_error;
1665 /* Set the features advertised by 'netdev' to 'advertise'. */
1667 netdev_linux_set_advertisements(struct netdev *netdev,
1668 enum netdev_features advertise)
1670 struct ethtool_cmd ecmd;
1673 COVERAGE_INC(netdev_get_ethtool);
1674 memset(&ecmd, 0, sizeof ecmd);
1675 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1676 ETHTOOL_GSET, "ETHTOOL_GSET");
1681 ecmd.advertising = 0;
1682 if (advertise & NETDEV_F_10MB_HD) {
1683 ecmd.advertising |= ADVERTISED_10baseT_Half;
1685 if (advertise & NETDEV_F_10MB_FD) {
1686 ecmd.advertising |= ADVERTISED_10baseT_Full;
1688 if (advertise & NETDEV_F_100MB_HD) {
1689 ecmd.advertising |= ADVERTISED_100baseT_Half;
1691 if (advertise & NETDEV_F_100MB_FD) {
1692 ecmd.advertising |= ADVERTISED_100baseT_Full;
1694 if (advertise & NETDEV_F_1GB_HD) {
1695 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1697 if (advertise & NETDEV_F_1GB_FD) {
1698 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1700 if (advertise & NETDEV_F_10GB_FD) {
1701 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1703 if (advertise & NETDEV_F_COPPER) {
1704 ecmd.advertising |= ADVERTISED_TP;
1706 if (advertise & NETDEV_F_FIBER) {
1707 ecmd.advertising |= ADVERTISED_FIBRE;
1709 if (advertise & NETDEV_F_AUTONEG) {
1710 ecmd.advertising |= ADVERTISED_Autoneg;
1712 if (advertise & NETDEV_F_PAUSE) {
1713 ecmd.advertising |= ADVERTISED_Pause;
1715 if (advertise & NETDEV_F_PAUSE_ASYM) {
1716 ecmd.advertising |= ADVERTISED_Asym_Pause;
1718 COVERAGE_INC(netdev_set_ethtool);
1719 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1720 ETHTOOL_SSET, "ETHTOOL_SSET");
1723 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1724 * successful, otherwise a positive errno value. */
1726 netdev_linux_set_policing(struct netdev *netdev,
1727 uint32_t kbits_rate, uint32_t kbits_burst)
1729 struct netdev_dev_linux *netdev_dev =
1730 netdev_dev_linux_cast(netdev_get_dev(netdev));
1731 const char *netdev_name = netdev_get_name(netdev);
1735 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1736 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1737 : kbits_burst); /* Stick with user-specified value. */
1739 if (netdev_dev->cache_valid & VALID_POLICING) {
1740 if (netdev_dev->netdev_policing_error) {
1741 return netdev_dev->netdev_policing_error;
1744 if (netdev_dev->kbits_rate == kbits_rate &&
1745 netdev_dev->kbits_burst == kbits_burst) {
1746 /* Assume that settings haven't changed since we last set them. */
1749 netdev_dev->cache_valid &= ~VALID_POLICING;
1752 COVERAGE_INC(netdev_set_policing);
1753 /* Remove any existing ingress qdisc. */
1754 error = tc_add_del_ingress_qdisc(netdev, false);
1756 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1757 netdev_name, strerror(error));
1762 error = tc_add_del_ingress_qdisc(netdev, true);
1764 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1765 netdev_name, strerror(error));
1769 error = tc_add_policer(netdev, kbits_rate, kbits_burst);
1771 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1772 netdev_name, strerror(error));
1777 netdev_dev->kbits_rate = kbits_rate;
1778 netdev_dev->kbits_burst = kbits_burst;
1781 if (!error || error == ENODEV) {
1782 netdev_dev->netdev_policing_error = error;
1783 netdev_dev->cache_valid |= VALID_POLICING;
1789 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1792 const struct tc_ops **opsp;
1794 for (opsp = tcs; *opsp != NULL; opsp++) {
1795 const struct tc_ops *ops = *opsp;
1796 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1797 sset_add(types, ops->ovs_name);
1803 static const struct tc_ops *
1804 tc_lookup_ovs_name(const char *name)
1806 const struct tc_ops **opsp;
1808 for (opsp = tcs; *opsp != NULL; opsp++) {
1809 const struct tc_ops *ops = *opsp;
1810 if (!strcmp(name, ops->ovs_name)) {
1817 static const struct tc_ops *
1818 tc_lookup_linux_name(const char *name)
1820 const struct tc_ops **opsp;
1822 for (opsp = tcs; *opsp != NULL; opsp++) {
1823 const struct tc_ops *ops = *opsp;
1824 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1831 static struct tc_queue *
1832 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1835 struct netdev_dev_linux *netdev_dev =
1836 netdev_dev_linux_cast(netdev_get_dev(netdev));
1837 struct tc_queue *queue;
1839 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1840 if (queue->queue_id == queue_id) {
/* Looks up queue 'queue_id' on 'netdev', hashing the id with hash_int() and
 * a basis of 0.  Returns whatever tc_find_queue__() returns. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    size_t hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, hash);
}
1854 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1856 struct netdev_qos_capabilities *caps)
1858 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1862 caps->n_queues = ops->n_queues;
1867 netdev_linux_get_qos(const struct netdev *netdev,
1868 const char **typep, struct smap *details)
1870 struct netdev_dev_linux *netdev_dev =
1871 netdev_dev_linux_cast(netdev_get_dev(netdev));
1874 error = tc_query_qdisc(netdev);
1879 *typep = netdev_dev->tc->ops->ovs_name;
1880 return (netdev_dev->tc->ops->qdisc_get
1881 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
1886 netdev_linux_set_qos(struct netdev *netdev,
1887 const char *type, const struct smap *details)
1889 struct netdev_dev_linux *netdev_dev =
1890 netdev_dev_linux_cast(netdev_get_dev(netdev));
1891 const struct tc_ops *new_ops;
1894 new_ops = tc_lookup_ovs_name(type);
1895 if (!new_ops || !new_ops->tc_install) {
1899 error = tc_query_qdisc(netdev);
1904 if (new_ops == netdev_dev->tc->ops) {
1905 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1907 /* Delete existing qdisc. */
1908 error = tc_del_qdisc(netdev);
1912 assert(netdev_dev->tc == NULL);
1914 /* Install new qdisc. */
1915 error = new_ops->tc_install(netdev, details);
1916 assert((error == 0) == (netdev_dev->tc != NULL));
1923 netdev_linux_get_queue(const struct netdev *netdev,
1924 unsigned int queue_id, struct smap *details)
1926 struct netdev_dev_linux *netdev_dev =
1927 netdev_dev_linux_cast(netdev_get_dev(netdev));
1930 error = tc_query_qdisc(netdev);
1934 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1936 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
1942 netdev_linux_set_queue(struct netdev *netdev,
1943 unsigned int queue_id, const struct smap *details)
1945 struct netdev_dev_linux *netdev_dev =
1946 netdev_dev_linux_cast(netdev_get_dev(netdev));
1949 error = tc_query_qdisc(netdev);
1952 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1953 || !netdev_dev->tc->ops->class_set) {
1957 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
1961 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1963 struct netdev_dev_linux *netdev_dev =
1964 netdev_dev_linux_cast(netdev_get_dev(netdev));
1967 error = tc_query_qdisc(netdev);
1970 } else if (!netdev_dev->tc->ops->class_delete) {
1973 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1975 ? netdev_dev->tc->ops->class_delete(netdev, queue)
1981 netdev_linux_get_queue_stats(const struct netdev *netdev,
1982 unsigned int queue_id,
1983 struct netdev_queue_stats *stats)
1985 struct netdev_dev_linux *netdev_dev =
1986 netdev_dev_linux_cast(netdev_get_dev(netdev));
1989 error = tc_query_qdisc(netdev);
1992 } else if (!netdev_dev->tc->ops->class_get_stats) {
1995 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1997 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
2003 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
2005 struct ofpbuf request;
2006 struct tcmsg *tcmsg;
2008 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
2012 tcmsg->tcm_parent = 0;
2013 nl_dump_start(dump, rtnl_sock, &request);
2014 ofpbuf_uninit(&request);
2019 netdev_linux_dump_queues(const struct netdev *netdev,
2020 netdev_dump_queues_cb *cb, void *aux)
2022 struct netdev_dev_linux *netdev_dev =
2023 netdev_dev_linux_cast(netdev_get_dev(netdev));
2024 struct tc_queue *queue, *next_queue;
2025 struct smap details;
2029 error = tc_query_qdisc(netdev);
2032 } else if (!netdev_dev->tc->ops->class_get) {
2037 smap_init(&details);
2038 HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node,
2039 &netdev_dev->tc->queues) {
2040 smap_clear(&details);
2042 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
2044 (*cb)(queue->queue_id, &details, aux);
2049 smap_destroy(&details);
2055 netdev_linux_dump_queue_stats(const struct netdev *netdev,
2056 netdev_dump_queue_stats_cb *cb, void *aux)
2058 struct netdev_dev_linux *netdev_dev =
2059 netdev_dev_linux_cast(netdev_get_dev(netdev));
2060 struct nl_dump dump;
2065 error = tc_query_qdisc(netdev);
2068 } else if (!netdev_dev->tc->ops->class_dump_stats) {
2073 if (!start_queue_dump(netdev, &dump)) {
2076 while (nl_dump_next(&dump, &msg)) {
2077 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
2083 error = nl_dump_done(&dump);
2084 return error ? error : last_error;
2088 netdev_linux_get_in4(const struct netdev *netdev_,
2089 struct in_addr *address, struct in_addr *netmask)
2091 struct netdev_dev_linux *netdev_dev =
2092 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2094 if (!(netdev_dev->cache_valid & VALID_IN4)) {
2097 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
2098 SIOCGIFADDR, "SIOCGIFADDR");
2103 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
2104 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2109 netdev_dev->cache_valid |= VALID_IN4;
2111 *address = netdev_dev->address;
2112 *netmask = netdev_dev->netmask;
2113 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
2117 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2118 struct in_addr netmask)
2120 struct netdev_dev_linux *netdev_dev =
2121 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2124 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2126 netdev_dev->cache_valid |= VALID_IN4;
2127 netdev_dev->address = address;
2128 netdev_dev->netmask = netmask;
2129 if (address.s_addr != INADDR_ANY) {
2130 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2131 "SIOCSIFNETMASK", netmask);
/* Parses 'line', formatted as 32 hex digits of IPv6 address followed by four
 * hex fields (which are ignored) and an interface name of up to 16
 * characters.  On success stores the address in '*in6' and the name in
 * 'ifname' and returns true.  Returns false if any field is missing, in which
 * case '*in6' and 'ifname' may have been partially written. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *octet = in6->s6_addr;
    int n_converted;

#define X8 "%2"SCNx8
    n_converted = sscanf(line,
                         " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                         "%*x %*x %*x %*x %16s\n",
                         &octet[0], &octet[1], &octet[2], &octet[3],
                         &octet[4], &octet[5], &octet[6], &octet[7],
                         &octet[8], &octet[9], &octet[10], &octet[11],
                         &octet[12], &octet[13], &octet[14], &octet[15],
                         ifname);

    /* Sixteen address bytes plus the interface name. */
    return n_converted == 17;
}
2153 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2154 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2156 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2158 struct netdev_dev_linux *netdev_dev =
2159 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2160 if (!(netdev_dev->cache_valid & VALID_IN6)) {
2164 netdev_dev->in6 = in6addr_any;
2166 file = fopen("/proc/net/if_inet6", "r");
2168 const char *name = netdev_get_name(netdev_);
2169 while (fgets(line, sizeof line, file)) {
2170 struct in6_addr in6_tmp;
2171 char ifname[16 + 1];
2172 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2173 && !strcmp(name, ifname))
2175 netdev_dev->in6 = in6_tmp;
2181 netdev_dev->cache_valid |= VALID_IN6;
2183 *in6 = netdev_dev->in6;
/* Initializes '*sa' as an AF_INET socket address holding 'addr' with port 0.
 * All other bytes of '*sa' are zeroed. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    memset(&in4, 0, sizeof in4);
    in4.sin_family = AF_INET;
    in4.sin_port = 0;
    in4.sin_addr = addr;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &in4, sizeof in4);
}
2201 do_set_addr(struct netdev *netdev,
2202 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2205 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2206 make_in4_sockaddr(&ifr.ifr_addr, addr);
2208 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2212 /* Adds 'router' as a default IP gateway. */
2214 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2216 struct in_addr any = { INADDR_ANY };
2220 memset(&rt, 0, sizeof rt);
2221 make_in4_sockaddr(&rt.rt_dst, any);
2222 make_in4_sockaddr(&rt.rt_gateway, router);
2223 make_in4_sockaddr(&rt.rt_genmask, any);
2224 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2225 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2227 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
2233 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2236 static const char fn[] = "/proc/net/route";
2241 *netdev_name = NULL;
2242 stream = fopen(fn, "r");
2243 if (stream == NULL) {
2244 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2249 while (fgets(line, sizeof line, stream)) {
2252 ovs_be32 dest, gateway, mask;
2253 int refcnt, metric, mtu;
2254 unsigned int flags, use, window, irtt;
2257 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2259 iface, &dest, &gateway, &flags, &refcnt,
2260 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2262 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2266 if (!(flags & RTF_UP)) {
2267 /* Skip routes that aren't up. */
2271 /* The output of 'dest', 'mask', and 'gateway' were given in
2272 * network byte order, so we don't need need any endian
2273 * conversions here. */
2274 if ((dest & mask) == (host->s_addr & mask)) {
2276 /* The host is directly reachable. */
2277 next_hop->s_addr = 0;
2279 /* To reach the host, we must go through a gateway. */
2280 next_hop->s_addr = gateway;
2282 *netdev_name = xstrdup(iface);
2294 netdev_linux_get_drv_info(const struct netdev *netdev, struct smap *smap)
2297 struct netdev_dev_linux *netdev_dev =
2298 netdev_dev_linux_cast(netdev_get_dev(netdev));
2300 error = netdev_linux_get_drvinfo(netdev_dev);
2302 smap_add(smap, "driver_name", netdev_dev->drvinfo.driver);
2303 smap_add(smap, "driver_version", netdev_dev->drvinfo.version);
2304 smap_add(smap, "firmware_version", netdev_dev->drvinfo.fw_version);
2310 netdev_internal_get_drv_info(const struct netdev *netdev OVS_UNUSED,
2313 smap_add(smap, "driver_name", "openvswitch");
2317 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2318 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2319 * returns 0. Otherwise, it returns a positive errno value; in particular,
2320 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2322 netdev_linux_arp_lookup(const struct netdev *netdev,
2323 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2326 struct sockaddr_in sin;
2329 memset(&r, 0, sizeof r);
2330 memset(&sin, 0, sizeof sin);
2331 sin.sin_family = AF_INET;
2332 sin.sin_addr.s_addr = ip;
2334 memcpy(&r.arp_pa, &sin, sizeof sin);
2335 r.arp_ha.sa_family = ARPHRD_ETHER;
2337 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2338 COVERAGE_INC(netdev_arp_lookup);
2339 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2341 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2342 } else if (retval != ENXIO) {
2343 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2344 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
2350 nd_to_iff_flags(enum netdev_flags nd)
2353 if (nd & NETDEV_UP) {
2356 if (nd & NETDEV_PROMISC) {
2363 iff_to_nd_flags(int iff)
2365 enum netdev_flags nd = 0;
2369 if (iff & IFF_PROMISC) {
2370 nd |= NETDEV_PROMISC;
2376 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2377 enum netdev_flags on, enum netdev_flags *old_flagsp)
2379 struct netdev_dev_linux *netdev_dev;
2380 int old_flags, new_flags;
2383 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2384 old_flags = netdev_dev->ifi_flags;
2385 *old_flagsp = iff_to_nd_flags(old_flags);
2386 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2387 if (new_flags != old_flags) {
2388 error = set_flags(netdev, new_flags);
2389 get_flags(&netdev_dev->netdev_dev, &netdev_dev->ifi_flags);
2395 netdev_linux_change_seq(const struct netdev *netdev)
2397 return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
/* Expands to a positional initializer for a 'struct netdev_class'.
 *
 * All classes built with this macro share the netdev_linux_* callbacks listed
 * below; only the class name (NAME), the creation function (CREATE), the
 * stats getter/setter (GET_STATS, SET_STATS), the feature getter
 * (GET_FEATURES) and the status getter (GET_STATUS) vary.  Because the
 * initializer is positional, the order of the entries must not be changed
 * independently of the structure definition. */
#define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS,  \
                           GET_FEATURES, GET_STATUS)            \
{                                                               \
    NAME,                                                       \
                                                                \
    netdev_linux_init,                                          \
    netdev_linux_run,                                           \
    netdev_linux_wait,                                          \
                                                                \
    CREATE,                                                     \
    netdev_linux_destroy,                                       \
    NULL,                       /* get_config */                \
    NULL,                       /* set_config */                \
                                                                \
    netdev_linux_open,                                          \
    netdev_linux_close,                                         \
                                                                \
    netdev_linux_listen,                                        \
    netdev_linux_recv,                                          \
    netdev_linux_recv_wait,                                     \
    netdev_linux_drain,                                         \
                                                                \
    netdev_linux_send,                                          \
    netdev_linux_send_wait,                                     \
                                                                \
    netdev_linux_set_etheraddr,                                 \
    netdev_linux_get_etheraddr,                                 \
    netdev_linux_get_mtu,                                       \
    netdev_linux_set_mtu,                                       \
    netdev_linux_get_ifindex,                                   \
    netdev_linux_get_carrier,                                   \
    netdev_linux_get_carrier_resets,                            \
    netdev_linux_set_miimon_interval,                           \
    GET_STATS,                                                  \
    SET_STATS,                                                  \
                                                                \
    GET_FEATURES,                                               \
    netdev_linux_set_advertisements,                            \
                                                                \
    netdev_linux_set_policing,                                  \
    netdev_linux_get_qos_types,                                 \
    netdev_linux_get_qos_capabilities,                          \
    netdev_linux_get_qos,                                       \
    netdev_linux_set_qos,                                       \
    netdev_linux_get_queue,                                     \
    netdev_linux_set_queue,                                     \
    netdev_linux_delete_queue,                                  \
    netdev_linux_get_queue_stats,                               \
    netdev_linux_dump_queues,                                   \
    netdev_linux_dump_queue_stats,                              \
                                                                \
    netdev_linux_get_in4,                                       \
    netdev_linux_set_in4,                                       \
    netdev_linux_get_in6,                                       \
    netdev_linux_add_router,                                    \
    netdev_linux_get_next_hop,                                  \
    GET_STATUS,                                                 \
    netdev_linux_arp_lookup,                                    \
                                                                \
    netdev_linux_update_flags,                                  \
                                                                \
    netdev_linux_change_seq                                     \
}
/* Class for ordinary Linux network devices: kernel plus vport stats via
 * netdev_linux_get_stats(), no stats setter, ethtool-based features and
 * driver info. */
const struct netdev_class netdev_linux_class =
    NETDEV_LINUX_CLASS(
        "system",
        netdev_linux_create,
        netdev_linux_get_stats,
        NULL,                    /* set_stats */
        netdev_linux_get_features,
        netdev_linux_get_drv_info);
/* Class for tap devices: created with netdev_linux_create_tap() and using
 * netdev_tap_get_stats(), which swaps the rx and tx directions of the kernel
 * counters. */
const struct netdev_class netdev_tap_class =
    NETDEV_LINUX_CLASS(
        "tap",
        netdev_linux_create_tap,
        netdev_tap_get_stats,
        NULL,                   /* set_stats */
        netdev_linux_get_features,
        netdev_linux_get_drv_info);
/* Class for internal devices: stats come only from the vport layer and can be
 * set through netdev_vport_set_stats(); no feature query is provided and the
 * driver is reported as "openvswitch". */
const struct netdev_class netdev_internal_class =
    NETDEV_LINUX_CLASS(
        "internal",
        netdev_linux_create,
        netdev_internal_get_stats,
        netdev_vport_set_stats,
        NULL,                  /* get_features */
        netdev_internal_get_drv_info);
/* HTB traffic control class. */

/* NOTE(review): presumably the number of queues supported by the HTB
 * implementation; its use is outside this part of the file -- confirm. */
#define HTB_N_QUEUES 0xf000

/* Per-device HTB qdisc state.  htb_get__() recovers this structure from the
 * device's 'tc' pointer with CONTAINER_OF, so 'tc' must stay a member. */
struct htb {
    struct tc tc;
    unsigned int max_rate;      /* In bytes/s. */
};

/* Configuration of one HTB class (queue). */
struct htb_class {
    struct tc_queue tc_queue;
    unsigned int min_rate;      /* In bytes/s. */
    unsigned int max_rate;      /* In bytes/s. */
    unsigned int burst;         /* In bytes. */
    unsigned int priority;      /* Lower values are higher priorities. */
};
2509 htb_get__(const struct netdev *netdev)
2511 struct netdev_dev_linux *netdev_dev =
2512 netdev_dev_linux_cast(netdev_get_dev(netdev));
2513 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
/* Allocates a new HTB tc object with xmalloc(), initializes its embedded
 * 'tc' with tc_ops_htb, records 'max_rate' in it, and makes it the tc of
 * 'netdev''s device.
 *
 * NOTE(review): 'max_rate' is uint64_t here while the max_rate member shown
 * above is unsigned int, so a larger value would be truncated on
 * assignment -- confirm whether that can occur. */
2517 htb_install__(struct netdev *netdev, uint64_t max_rate)
2519 struct netdev_dev_linux *netdev_dev =
2520 netdev_dev_linux_cast(netdev_get_dev(netdev));
2523 htb = xmalloc(sizeof *htb);
2524 tc_init(&htb->tc, &tc_ops_htb);
2525 htb->max_rate = max_rate;
2527 netdev_dev->tc = &htb->tc;
2530 /* Create an HTB qdisc.
2532 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2534 htb_setup_qdisc__(struct netdev *netdev)
2537 struct tc_htb_glob opt;
2538 struct ofpbuf request;
2539 struct tcmsg *tcmsg;
/* NOTE(review): presumably clears any existing qdisc so that the NLM_F_EXCL
 * create below cannot collide with it -- confirm against tc_del_qdisc(). */
2541 tc_del_qdisc(netdev);
2543 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2544 NLM_F_EXCL | NLM_F_CREATE, &request);
/* The new qdisc gets handle 1:0 and is attached at the device root. */
2548 tcmsg->tcm_handle = tc_make_handle(1, 0);
2549 tcmsg->tcm_parent = TC_H_ROOT;
2551 nl_msg_put_string(&request, TCA_KIND, "htb");
2553 memset(&opt, 0, sizeof opt);
2554 opt.rate2quantum = 10;
/* The HTB global options travel as TCA_HTB_INIT nested inside TCA_OPTIONS. */
2558 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2559 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2560 nl_msg_end_nested(&request, opt_offset);
/* tc_transact() uninitializes 'request' after sending it; its result is
 * this function's return value. */
2562 return tc_transact(&request, NULL);
2565 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2566 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2568 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2569 unsigned int parent, struct htb_class *class)
2572 struct tc_htb_opt opt;
2573 struct ofpbuf request;
2574 struct tcmsg *tcmsg;
/* The MTU is an input to both rate specs and both buffer sizes below, so a
 * device without one cannot be configured. */
2578 error = netdev_get_mtu(netdev, &mtu);
2580 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2581 netdev_get_name(netdev));
/* 'rate' carries the class's min_rate and 'ceil' its max_rate; the same
 * 'burst' is used to size both the rate buffer and the ceil buffer. */
2585 memset(&opt, 0, sizeof opt);
2586 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2587 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2588 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2589 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2590 opt.prio = class->priority;
/* NLM_F_CREATE without NLM_F_EXCL: matches the "replace" in the header
 * comment. */
2592 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2596 tcmsg->tcm_handle = handle;
2597 tcmsg->tcm_parent = parent;
/* TCA_OPTIONS nests the parameters plus one rate table each for 'rate' and
 * 'ceil'. */
2599 nl_msg_put_string(&request, TCA_KIND, "htb");
2600 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2601 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2602 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2603 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2604 nl_msg_end_nested(&request, opt_offset);
/* A failed transaction is reported with a rate-limited warning that names
 * the class, its parent, and every requested parameter. */
2606 error = tc_transact(&request, NULL);
2608 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2609 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2610 netdev_get_name(netdev),
2611 tc_get_major(handle), tc_get_minor(handle),
2612 tc_get_major(parent), tc_get_minor(parent),
2613 class->min_rate, class->max_rate,
2614 class->burst, class->priority, strerror(error));
/* Decodes the TCA_HTB_PARMS attribute nested in 'nl_options' into 'class'
 * (the parameter that the original header text calls 'details'):
 *
 *   - min_rate from the 'rate' spec and max_rate from the 'ceil' spec,
 *   - burst by converting the 'buffer' tick count back to bytes at the
 *     'rate' rate with tc_ticks_to_bytes(),
 *   - priority from 'prio'.
 *
 * TCA_HTB_PARMS is mandatory and must be at least sizeof(struct tc_htb_opt)
 * bytes; a parse failure is logged with a rate-limited warning. */
2619 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2620 * description of them into 'details'. The description complies with the
2621 * specification given in the vswitch database documentation for linux-htb
2624 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2626 static const struct nl_policy tca_htb_policy[] = {
2627 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2628 .min_len = sizeof(struct tc_htb_opt) },
2631 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2632 const struct tc_htb_opt *htb;
2634 if (!nl_parse_nested(nl_options, tca_htb_policy,
2635 attrs, ARRAY_SIZE(tca_htb_policy))) {
2636 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2640 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2641 class->min_rate = htb->rate.rate;
2642 class->max_rate = htb->ceil.rate;
2643 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2644 class->priority = htb->prio;
/* Parses the class message 'tcmsg' with tc_parse_class(), which also fills
 * 'stats'.
 *
 * If 'queue_id' is nonnull and the class handle is 1:<minor> with
 * 0 < minor <= HTB_N_QUEUES, stores minor - 1 into '*queue_id'.  (What
 * happens for other handles is not visible in this listing.)
 *
 * If 'options' is nonnull and no error has occurred so far, the message's
 * options are decoded into it by htb_parse_tca_options__(). */
2649 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2650 struct htb_class *options,
2651 struct netdev_queue_stats *stats)
2653 struct nlattr *nl_options;
2654 unsigned int handle;
2657 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2658 if (!error && queue_id) {
2659 unsigned int major = tc_get_major(handle);
2660 unsigned int minor = tc_get_minor(handle);
2661 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2662 *queue_id = minor - 1;
2667 if (!error && options) {
2668 error = htb_parse_tca_options__(nl_options, options);
/* Fills 'hc' with the parameters for the qdisc-wide HTB class from
 * 'details'.
 *
 * The "max-rate" value is given in bits/s and stored in bytes/s (hence the
 * division by 8).  If it is absent or works out to 0, the rate is derived
 * from the device's current link features instead, passing
 * 100 * 1000 * 1000 to netdev_features_to_bps() as its second argument
 * (NOTE(review): presumably the fallback speed in bits/s -- confirm).
 * min_rate is set equal to max_rate. */
2674 htb_parse_qdisc_details__(struct netdev *netdev,
2675 const struct smap *details, struct htb_class *hc)
2677 const char *max_rate_s;
2679 max_rate_s = smap_get(details, "max-rate");
2680 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2681 if (!hc->max_rate) {
2682 enum netdev_features current;
/* Pass the address of 'current' so the features can be stored into it. */
2684 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2685 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
2687 hc->min_rate = hc->max_rate;
/* Fills 'hc' from the "min-rate", "max-rate", "burst", and "priority" keys
 * of 'details', for a queue on 'netdev'.
 *
 * The three rate/size values are divided by 8 on input (bits to bytes); a
 * missing min-rate or burst counts as 0.  Clamping, in the order applied:
 *
 *   - min_rate is raised to at least the MTU, then capped at the qdisc's
 *     max_rate;
 *   - max_rate is raised to at least min_rate, then capped at the qdisc's
 *     max_rate;
 *   - burst is raised to at least MTU + 64.
 *
 * priority is parsed as-is and defaults to 0.  The device must have an MTU;
 * otherwise a rate-limited warning is logged. */
2693 htb_parse_class_details__(struct netdev *netdev,
2694 const struct smap *details, struct htb_class *hc)
2696 const struct htb *htb = htb_get__(netdev);
2697 const char *min_rate_s = smap_get(details, "min-rate");
2698 const char *max_rate_s = smap_get(details, "max-rate");
2699 const char *burst_s = smap_get(details, "burst");
2700 const char *priority_s = smap_get(details, "priority");
2703 error = netdev_get_mtu(netdev, &mtu);
2705 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2706 netdev_get_name(netdev));
2710 /* HTB requires at least an mtu sized min-rate to send any traffic even
2711 * on uncongested links. */
2712 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2713 hc->min_rate = MAX(hc->min_rate, mtu);
2714 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2717 hc->max_rate = (max_rate_s
2718 ? strtoull(max_rate_s, NULL, 10) / 8
2720 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2721 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2725 * According to hints in the documentation that I've read, it is important
2726 * that 'burst' be at least as big as the largest frame that might be
2727 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2728 * but having it a bit too small is a problem. Since netdev_get_mtu()
2729 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2730 * the MTU. We actually add 64, instead of 14, as a guard against
2731 * additional headers get tacked on somewhere that we're not aware of. */
2732 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2733 hc->burst = MAX(hc->burst, mtu + 64);
2736 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Asks the kernel about class 'handle' under 'parent' on 'netdev' with
 * tc_query_class(), decodes the reply into 'options' and 'stats' with
 * htb_parse_tcmsg__() (no queue ID is requested), and frees the reply. */
2742 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2743 unsigned int parent, struct htb_class *options,
2744 struct netdev_queue_stats *stats)
2746 struct ofpbuf *reply;
2749 error = tc_query_class(netdev, handle, parent, &reply);
2751 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2752 ofpbuf_delete(reply);
/* Installs HTB on 'netdev' in three steps: create the root qdisc, create
 * class 1:fffe under 1:0 using the qdisc-wide parameters parsed from
 * 'details', and attach an in-memory HTB tc object that remembers the
 * resulting max_rate.  (The conditions that gate each step are not visible
 * in this listing.) */
2758 htb_tc_install(struct netdev *netdev, const struct smap *details)
2762 error = htb_setup_qdisc__(netdev);
2764 struct htb_class hc;
2766 htb_parse_qdisc_details__(netdev, details, &hc);
2767 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2768 tc_make_handle(1, 0), &hc);
2770 htb_install__(netdev, hc.max_rate);
/* Returns the struct htb_class that embeds 'queue' as its 'tc_queue'
 * member. */
2776 static struct htb_class *
2777 htb_class_cast__(const struct tc_queue *queue)
2779 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Records the parameters in 'hc' as the cached state of queue 'queue_id' on
 * 'netdev'.
 *
 * The queue is looked up with tc_find_queue__(); the visible lines show one
 * path that reuses an existing entry and another that allocates a new
 * struct htb_class and inserts it into the tc's 'queues' map under
 * hash_int(queue_id, 0).  Either way the four parameters are then copied
 * from 'hc'. */
2783 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2784 const struct htb_class *hc)
2786 struct htb *htb = htb_get__(netdev);
2787 size_t hash = hash_int(queue_id, 0);
2788 struct tc_queue *queue;
2789 struct htb_class *hcp;
2791 queue = tc_find_queue__(netdev, queue_id, hash);
2793 hcp = htb_class_cast__(queue);
2795 hcp = xmalloc(sizeof *hcp);
2796 queue = &hcp->tc_queue;
2797 queue->queue_id = queue_id;
2798 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2801 hcp->min_rate = hc->min_rate;
2802 hcp->max_rate = hc->max_rate;
2803 hcp->burst = hc->burst;
2804 hcp->priority = hc->priority;
/* Rebuilds the in-memory HTB state for 'netdev' from what the kernel already
 * has configured.  'nlmsg' is unused.
 *
 * First class 1:fffe is queried for the qdisc-wide max_rate and an HTB tc
 * object is attached with it.  Then every class returned by the queue dump
 * that htb_parse_tcmsg__() accepts is recorded with htb_update_queue__();
 * classes it rejects are skipped. */
2808 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2811 struct nl_dump dump;
2812 struct htb_class hc;
2814 /* Get qdisc options. */
2816 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2817 htb_install__(netdev, hc.max_rate);
2820 if (!start_queue_dump(netdev, &dump)) {
2823 while (nl_dump_next(&dump, &msg)) {
2824 unsigned int queue_id;
2826 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2827 htb_update_queue__(netdev, queue_id, &hc);
2830 nl_dump_done(&dump);
/* Tears down the HTB object that embeds 'tc': walks its 'queues' map with the
 * removal-safe iterator and unlinks every class from it.  (Any freeing of
 * the classes or of the object itself is not visible in this listing.) */
2836 htb_tc_destroy(struct tc *tc)
2838 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2839 struct htb_class *hc, *next;
2841 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2842 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Reports the qdisc-wide configuration of 'netdev' in 'details': adds
 * "max-rate" as 8 times the cached max_rate, i.e. converted from bytes/s
 * back to bits/s. */
2850 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
2852 const struct htb *htb = htb_get__(netdev);
2853 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
/* Reconfigures the qdisc-wide HTB parameters of 'netdev' from 'details':
 * re-parses them, replaces class 1:fffe under 1:0 in the kernel, and updates
 * the cached max_rate.  (The condition guarding the cache update is not
 * visible in this listing.) */
2858 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
2860 struct htb_class hc;
2863 htb_parse_qdisc_details__(netdev, details, &hc);
2864 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2865 tc_make_handle(1, 0), &hc);
2867 htb_get__(netdev)->max_rate = hc.max_rate;
/* Reports the cached parameters of 'queue' in 'details'.  'netdev' is unused.
 *
 * Rates and burst are multiplied by 8 (bytes back to bits).  "max-rate" is
 * only added when it differs from "min-rate".  NOTE(review): lines are
 * missing from this listing around "burst" and "priority", so whether those
 * two are added unconditionally cannot be told from here. */
2873 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2874 const struct tc_queue *queue, struct smap *details)
2876 const struct htb_class *hc = htb_class_cast__(queue);
2878 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
2879 if (hc->min_rate != hc->max_rate) {
2880 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
2882 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
2884 smap_add_format(details, "priority", "%u", hc->priority);
/* Configures queue 'queue_id' on 'netdev' from 'details': parses and clamps
 * the parameters, replaces kernel class 1:<queue_id + 1> under parent
 * 1:fffe, and records the result in the in-memory queue table.  (The error
 * checks between the steps are not visible in this listing.) */
2890 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2891 const struct smap *details)
2893 struct htb_class hc;
2896 error = htb_parse_class_details__(netdev, details, &hc);
2901 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2902 tc_make_handle(1, 0xfffe), &hc);
2907 htb_update_queue__(netdev, queue_id, &hc);
/* Deletes 'queue' from 'netdev': removes kernel class 1:<queue_id + 1> with
 * tc_delete_class() and unlinks the class from the in-memory queue table.
 * (The condition on the unlink and any freeing are not visible in this
 * listing.) */
2912 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2914 struct htb_class *hc = htb_class_cast__(queue);
2915 struct htb *htb = htb_get__(netdev);
2918 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2920 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches the kernel statistics of 'queue' into 'stats' by querying class
 * 1:<queue_id + 1> under parent 1:fffe.  No class options are requested.
 * Returns whatever htb_query_class__() returns. */
2927 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2928 struct netdev_queue_stats *stats)
2930 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2931 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one class message 'nlmsg' from a statistics dump.  'netdev' is
 * unused.
 *
 * The handle and statistics are extracted with tc_parse_class().  If the
 * handle is 1:<minor> with 0 < minor <= HTB_N_QUEUES, 'cb' is invoked with
 * queue ID minor - 1, the statistics, and 'aux'; other classes produce no
 * callback. */
2935 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2936 const struct ofpbuf *nlmsg,
2937 netdev_dump_queue_stats_cb *cb, void *aux)
2939 struct netdev_queue_stats stats;
2940 unsigned int handle, major, minor;
2943 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2948 major = tc_get_major(handle);
2949 minor = tc_get_minor(handle);
2950 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2951 (*cb)(minor - 1, &stats, aux);
/* Operations table for the HTB implementation: kernel qdisc name "htb",
 * OVS QoS type name "linux-htb", and HTB_N_QUEUES queues.  (Most of the
 * function-pointer entries are missing from this listing.) */
2956 static const struct tc_ops tc_ops_htb = {
2957 "htb", /* linux_name */
2958 "linux-htb", /* ovs_name */
2959 HTB_N_QUEUES, /* n_queues */
2968 htb_class_get_stats,
2969 htb_class_dump_stats
2972 /* "linux-hfsc" traffic control class. */
/* Largest class minor number that the HFSC code maps to a queue: class
 * 1:<minor> corresponds to queue <minor> - 1 (see hfsc_parse_tcmsg__()).
 * Also reported as n_queues in tc_ops_hfsc. */
2974 #define HFSC_N_QUEUES 0xf000
/* Member that hfsc_class_cast__() uses to recover the containing
 * struct hfsc_class.  (The rest of the structure definitions are not visible
 * in this listing.) */
2982 struct tc_queue tc_queue;
/* Returns the struct hfsc that contains the 'tc' currently attached to
 * 'netdev''s device.  Nothing visible here checks that the attached tc is
 * an HFSC one; NOTE(review): callers presumably guarantee that -- confirm. */
2987 static struct hfsc *
2988 hfsc_get__(const struct netdev *netdev)
2990 struct netdev_dev_linux *netdev_dev;
2991 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2992 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
/* Returns the struct hfsc_class that embeds 'queue' as its 'tc_queue'
 * member. */
2995 static struct hfsc_class *
2996 hfsc_class_cast__(const struct tc_queue *queue)
2998 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates a new HFSC tc object with xmalloc(), initializes its embedded
 * 'tc' with tc_ops_hfsc, records 'max_rate' in it, and makes it the tc of
 * 'netdev''s device. */
3002 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
3004 struct netdev_dev_linux * netdev_dev;
3007 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
3008 hfsc = xmalloc(sizeof *hfsc);
3009 tc_init(&hfsc->tc, &tc_ops_hfsc);
3010 hfsc->max_rate = max_rate;
3011 netdev_dev->tc = &hfsc->tc;
/* Records the rates in 'hc' as the cached state of queue 'queue_id' on
 * 'netdev'.
 *
 * The queue is looked up with tc_find_queue__(); the visible lines show one
 * path that reuses an existing entry and another that allocates a new
 * struct hfsc_class and inserts it into the tc's 'queues' map under
 * hash_int(queue_id, 0).  Either way min_rate and max_rate are then copied
 * from 'hc'. */
3015 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3016 const struct hfsc_class *hc)
3020 struct hfsc_class *hcp;
3021 struct tc_queue *queue;
3023 hfsc = hfsc_get__(netdev);
3024 hash = hash_int(queue_id, 0);
3026 queue = tc_find_queue__(netdev, queue_id, hash);
3028 hcp = hfsc_class_cast__(queue);
3030 hcp = xmalloc(sizeof *hcp);
3031 queue = &hcp->tc_queue;
3032 queue->queue_id = queue_id;
3033 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3036 hcp->min_rate = hc->min_rate;
3037 hcp->max_rate = hc->max_rate;
/* Decodes the HFSC service curves nested in 'nl_options' into 'class'.
 *
 * Three curves are read (TCA_HFSC_RSC, TCA_HFSC_FSC, TCA_HFSC_USC), each at
 * least sizeof(struct tc_service_curve) bytes.  Only the configuration that
 * this file itself produces is accepted; anything else is logged with a
 * rate-limited warning:
 *
 *   - every curve must have m1 == 0 and d == 0,
 *   - the RSC and FSC curves must have the same m2,
 *   - the RSC m2 must not exceed the USC m2.
 *
 * On success min_rate is the FSC m2 and max_rate is the USC m2. */
3041 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3043 const struct tc_service_curve *rsc, *fsc, *usc;
3044 static const struct nl_policy tca_hfsc_policy[] = {
3046 .type = NL_A_UNSPEC,
3048 .min_len = sizeof(struct tc_service_curve),
3051 .type = NL_A_UNSPEC,
3053 .min_len = sizeof(struct tc_service_curve),
3056 .type = NL_A_UNSPEC,
3058 .min_len = sizeof(struct tc_service_curve),
3061 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3063 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3064 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3065 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3069 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3070 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3071 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
/* A nonzero m1 or d on any curve is rejected as non-linear. */
3073 if (rsc->m1 != 0 || rsc->d != 0 ||
3074 fsc->m1 != 0 || fsc->d != 0 ||
3075 usc->m1 != 0 || usc->d != 0) {
3076 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3077 "Non-linear service curves are not supported.");
/* hfsc_setup_class__() sends one curve as both RSC and FSC, so a mismatch
 * means the class was configured by something else. */
3081 if (rsc->m2 != fsc->m2) {
3082 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3083 "Real-time service curves are not supported ");
3087 if (rsc->m2 > usc->m2) {
3088 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3089 "Min-rate service curve is greater than "
3090 "the max-rate service curve.");
3094 class->min_rate = fsc->m2;
3095 class->max_rate = usc->m2;
/* Parses the class message 'tcmsg' with tc_parse_class(), which also fills
 * 'stats'.
 *
 * A class handle 1:<minor> with 0 < minor <= HFSC_N_QUEUES is translated to
 * queue ID minor - 1, and the message's options are decoded into 'options'
 * by hfsc_parse_tca_options__().  (The conditions guarding those two steps,
 * including any null checks on 'queue_id' and 'options', are not visible in
 * this listing.) */
3100 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3101 struct hfsc_class *options,
3102 struct netdev_queue_stats *stats)
3105 unsigned int handle;
3106 struct nlattr *nl_options;
3108 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3114 unsigned int major, minor;
3116 major = tc_get_major(handle);
3117 minor = tc_get_minor(handle);
3118 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3119 *queue_id = minor - 1;
3126 error = hfsc_parse_tca_options__(nl_options, options);
/* Asks the kernel about class 'handle' under 'parent' on 'netdev' with
 * tc_query_class(), decodes the reply into 'options' and 'stats' with
 * hfsc_parse_tcmsg__() (no queue ID is requested), and frees the reply. */
3133 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3134 unsigned int parent, struct hfsc_class *options,
3135 struct netdev_queue_stats *stats)
3138 struct ofpbuf *reply;
3140 error = tc_query_class(netdev, handle, parent, &reply);
3145 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3146 ofpbuf_delete(reply);
/* Fills 'class' with the parameters for the qdisc-wide HFSC class from
 * 'details'.
 *
 * The "max-rate" value is given in bits/s and stored in bytes/s (hence the
 * division by 8).  The visible fallback derives the rate from the device's
 * current link features, passing 100 * 1000 * 1000 to
 * netdev_features_to_bps() as its second argument (NOTE(review): presumably
 * the fallback speed in bits/s, used when "max-rate" is absent or 0 -- the
 * guarding condition is not visible in this listing).  Both min_rate and
 * max_rate are set to the result. */
3151 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct smap *details,
3152 struct hfsc_class *class)
3155 const char *max_rate_s;
3157 max_rate_s = smap_get(details, "max-rate");
3158 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3161 enum netdev_features current;
/* Pass the address of 'current' so the features can be stored into it. */
3163 netdev_get_features(netdev, &current, NULL, NULL, NULL);
3164 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
3167 class->min_rate = max_rate;
3168 class->max_rate = max_rate;
/* Fills 'class' from the "min-rate" and "max-rate" keys of 'details', for a
 * queue on 'netdev'.
 *
 * Both values are divided by 8 on input (bits to bytes); a missing min-rate
 * counts as 0.  Clamping, in the order applied:
 *
 *   - min_rate is raised to at least 1, then capped at the qdisc's max_rate;
 *   - max_rate is raised to at least min_rate, then capped at the qdisc's
 *     max_rate. */
3172 hfsc_parse_class_details__(struct netdev *netdev,
3173 const struct smap *details,
3174 struct hfsc_class * class)
3176 const struct hfsc *hfsc;
3177 uint32_t min_rate, max_rate;
3178 const char *min_rate_s, *max_rate_s;
3180 hfsc = hfsc_get__(netdev);
3181 min_rate_s = smap_get(details, "min-rate");
3182 max_rate_s = smap_get(details, "max-rate");
3184 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3185 min_rate = MAX(min_rate, 1);
3186 min_rate = MIN(min_rate, hfsc->max_rate);
3188 max_rate = (max_rate_s
3189 ? strtoull(max_rate_s, NULL, 10) / 8
3191 max_rate = MAX(max_rate, min_rate);
3192 max_rate = MIN(max_rate, hfsc->max_rate);
3194 class->min_rate = min_rate;
3195 class->max_rate = max_rate;
3200 /* Create an HFSC qdisc.
3202 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3204 hfsc_setup_qdisc__(struct netdev * netdev)
3206 struct tcmsg *tcmsg;
3207 struct ofpbuf request;
3208 struct tc_hfsc_qopt opt;
/* NOTE(review): presumably clears any existing qdisc so that the NLM_F_EXCL
 * create below cannot collide with it -- confirm against tc_del_qdisc(). */
3210 tc_del_qdisc(netdev);
3212 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3213 NLM_F_EXCL | NLM_F_CREATE, &request);
/* The new qdisc gets handle 1:0 and is attached at the device root. */
3219 tcmsg->tcm_handle = tc_make_handle(1, 0);
3220 tcmsg->tcm_parent = TC_H_ROOT;
3222 memset(&opt, 0, sizeof opt);
/* Unlike HTB, the HFSC qdisc options are placed directly in TCA_OPTIONS
 * rather than in a nested attribute. */
3225 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3226 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
/* tc_transact() uninitializes 'request' after sending it; its result is
 * this function's return value. */
3228 return tc_transact(&request, NULL);
3231 /* Create an HFSC class.
3233 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3234 * sc rate <min_rate> ul rate <max_rate>" */
3236 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3237 unsigned int parent, struct hfsc_class *class)
3241 struct tcmsg *tcmsg;
3242 struct ofpbuf request;
3243 struct tc_service_curve min, max;
3245 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3251 tcmsg->tcm_handle = handle;
3252 tcmsg->tcm_parent = parent;
/* Only the m2 member of each curve is visibly set from the class; how the
 * other members are initialized is not visible in this listing. */
3256 min.m2 = class->min_rate;
3260 max.m2 = class->max_rate;
/* The 'min' curve is sent twice, as TCA_HFSC_RSC and as TCA_HFSC_FSC, and
 * the 'max' curve as TCA_HFSC_USC.  hfsc_parse_tca_options__() relies on
 * this shape when reading classes back. */
3262 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3263 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3264 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3265 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3266 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3267 nl_msg_end_nested(&request, opt_offset);
/* A failed transaction is reported with a rate-limited warning that names
 * the class, its parent, and both requested rates. */
3269 error = tc_transact(&request, NULL);
3271 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3272 "min-rate %ubps, max-rate %ubps (%s)",
3273 netdev_get_name(netdev),
3274 tc_get_major(handle), tc_get_minor(handle),
3275 tc_get_major(parent), tc_get_minor(parent),
3276 class->min_rate, class->max_rate, strerror(error));
/* Installs HFSC on 'netdev' in three steps: create the root qdisc, create
 * class 1:fffe under 1:0 using the qdisc-wide parameters parsed from
 * 'details', and attach an in-memory HFSC tc object that remembers the
 * resulting max_rate.  (The error checks between the steps are not visible
 * in this listing.) */
3283 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3286 struct hfsc_class class;
3288 error = hfsc_setup_qdisc__(netdev);
3294 hfsc_parse_qdisc_details__(netdev, details, &class);
3295 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3296 tc_make_handle(1, 0), &class);
3302 hfsc_install__(netdev, class.max_rate);
/* Rebuilds the in-memory HFSC state for 'netdev' from what the kernel
 * already has configured.  'nlmsg' is unused.
 *
 * First class 1:fffe is queried for the qdisc-wide max_rate and an HFSC tc
 * object is attached with it.  Then every class returned by the queue dump
 * that hfsc_parse_tcmsg__() accepts is recorded with hfsc_update_queue__();
 * classes it rejects are skipped. */
3307 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3310 struct nl_dump dump;
3311 struct hfsc_class hc;
3314 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3315 hfsc_install__(netdev, hc.max_rate);
3317 if (!start_queue_dump(netdev, &dump)) {
3321 while (nl_dump_next(&dump, &msg)) {
3322 unsigned int queue_id;
3324 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3325 hfsc_update_queue__(netdev, queue_id, &hc);
3329 nl_dump_done(&dump);
/* Tears down the HFSC object that embeds 'tc': walks its 'queues' map with
 * the removal-safe iterator and unlinks every class from it.  (Any freeing
 * of the classes or of the object itself is not visible in this listing.) */
3334 hfsc_tc_destroy(struct tc *tc)
3337 struct hfsc_class *hc, *next;
3339 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3341 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3342 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Reports the qdisc-wide configuration of 'netdev' in 'details': adds
 * "max-rate" as 8 times the cached max_rate, i.e. converted from bytes/s
 * back to bits/s. */
3351 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3353 const struct hfsc *hfsc;
3354 hfsc = hfsc_get__(netdev);
3355 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
/* Reconfigures the qdisc-wide HFSC parameters of 'netdev' from 'details':
 * re-parses them, replaces class 1:fffe under 1:0 in the kernel, and updates
 * the cached max_rate.  (The condition guarding the cache update is not
 * visible in this listing.) */
3360 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3363 struct hfsc_class class;
3365 hfsc_parse_qdisc_details__(netdev, details, &class);
3366 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3367 tc_make_handle(1, 0), &class);
3370 hfsc_get__(netdev)->max_rate = class.max_rate;
/* Reports the cached rates of 'queue' in 'details'.  'netdev' is unused.
 *
 * Rates are multiplied by 8 (bytes/s back to bits/s).  "max-rate" is only
 * added when it differs from "min-rate". */
3377 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3378 const struct tc_queue *queue, struct smap *details)
3380 const struct hfsc_class *hc;
3382 hc = hfsc_class_cast__(queue);
3383 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3384 if (hc->min_rate != hc->max_rate) {
3385 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
/* Configures queue 'queue_id' on 'netdev' from 'details': parses and clamps
 * the rates, replaces kernel class 1:<queue_id + 1> under parent 1:fffe, and
 * records the result in the in-memory queue table.  (The error checks
 * between the steps are not visible in this listing.) */
3391 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3392 const struct smap *details)
3395 struct hfsc_class class;
3397 error = hfsc_parse_class_details__(netdev, details, &class);
3402 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3403 tc_make_handle(1, 0xfffe), &class);
3408 hfsc_update_queue__(netdev, queue_id, &class);
/* Deletes 'queue' from 'netdev': removes kernel class 1:<queue_id + 1> with
 * tc_delete_class() and unlinks the class from the in-memory queue table.
 * (The condition on the unlink and any freeing are not visible in this
 * listing.) */
3413 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3417 struct hfsc_class *hc;
3419 hc = hfsc_class_cast__(queue);
3420 hfsc = hfsc_get__(netdev);
3422 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3424 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches the kernel statistics of 'queue' into 'stats' by querying class
 * 1:<queue_id + 1> under parent 1:fffe.  No class options are requested.
 * Returns whatever hfsc_query_class__() returns. */
3431 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3432 struct netdev_queue_stats *stats)
3434 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3435 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one class message 'nlmsg' from a statistics dump.  'netdev' is
 * unused.
 *
 * The handle and statistics are extracted with tc_parse_class().  If the
 * handle is 1:<minor> with 0 < minor <= HFSC_N_QUEUES, 'cb' is invoked with
 * queue ID minor - 1, the statistics, and 'aux'; other classes produce no
 * callback. */
3439 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3440 const struct ofpbuf *nlmsg,
3441 netdev_dump_queue_stats_cb *cb, void *aux)
3443 struct netdev_queue_stats stats;
3444 unsigned int handle, major, minor;
3447 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3452 major = tc_get_major(handle);
3453 minor = tc_get_minor(handle);
3454 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3455 (*cb)(minor - 1, &stats, aux);
/* Operations table for the HFSC implementation: kernel qdisc name "hfsc",
 * OVS QoS type name "linux-hfsc", HFSC_N_QUEUES queues, and one hfsc_*
 * function for every operation slot. */
3460 static const struct tc_ops tc_ops_hfsc = {
3461 "hfsc", /* linux_name */
3462 "linux-hfsc", /* ovs_name */
3463 HFSC_N_QUEUES, /* n_queues */
3464 hfsc_tc_install, /* tc_install */
3465 hfsc_tc_load, /* tc_load */
3466 hfsc_tc_destroy, /* tc_destroy */
3467 hfsc_qdisc_get, /* qdisc_get */
3468 hfsc_qdisc_set, /* qdisc_set */
3469 hfsc_class_get, /* class_get */
3470 hfsc_class_set, /* class_set */
3471 hfsc_class_delete, /* class_delete */
3472 hfsc_class_get_stats, /* class_get_stats */
3473 hfsc_class_dump_stats /* class_dump_stats */
3476 /* "linux-default" traffic control class.
3478 * This class represents the default, unnamed Linux qdisc. It corresponds to
3479 * the "" (empty string) QoS type in the OVS database. */
/* Points the tc of 'netdev''s device at a 'struct tc' initialized with
 * tc_ops_default.  The pointer is kept in a function-static variable, so
 * the object can outlive the call and be shared between devices.
 * NOTE(review): the guard around the allocation is not visible in this
 * listing -- confirm it allocates only once. */
3482 default_install__(struct netdev *netdev)
3484 struct netdev_dev_linux *netdev_dev =
3485 netdev_dev_linux_cast(netdev_get_dev(netdev));
3486 static struct tc *tc;
3489 tc = xmalloc(sizeof *tc);
3490 tc_init(tc, &tc_ops_default);
3492 netdev_dev->tc = tc;
/* tc_install hook for the default class: there are no parameters to apply
 * ('details' is unused), so it only attaches the default tc object to
 * 'netdev'. */
3496 default_tc_install(struct netdev *netdev,
3497 const struct smap *details OVS_UNUSED)
3499 default_install__(netdev);
/* tc_load hook for the default class: nothing is read from the kernel
 * message ('nlmsg' is unused); it only attaches the default tc object to
 * 'netdev'. */
3504 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3506 default_install__(netdev);
/* Operations table for the default class.  It has no kernel qdisc name and
 * leaves every operation from tc_destroy onward unimplemented (NULL).  (The
 * entries between linux_name and tc_destroy are missing from this
 * listing.) */
3510 static const struct tc_ops tc_ops_default = {
3511 NULL, /* linux_name */
3516 NULL, /* tc_destroy */
3517 NULL, /* qdisc_get */
3518 NULL, /* qdisc_set */
3519 NULL, /* class_get */
3520 NULL, /* class_set */
3521 NULL, /* class_delete */
3522 NULL, /* class_get_stats */
3523 NULL /* class_dump_stats */
/* other_tc_load(): tc_load hook for the "linux-other" class.  Nothing is read
 * from the kernel message ('nlmsg' is unused).  It points the tc of
 * 'netdev''s device at a 'struct tc' initialized with tc_ops_other; the
 * pointer is kept in a function-static variable, so the object can be shared
 * between devices.  NOTE(review): the guard around the allocation is not
 * visible in this listing -- confirm it allocates only once. */
3526 /* "linux-other" traffic control class.
3531 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3533 struct netdev_dev_linux *netdev_dev =
3534 netdev_dev_linux_cast(netdev_get_dev(netdev));
3535 static struct tc *tc;
3538 tc = xmalloc(sizeof *tc);
3539 tc_init(tc, &tc_ops_other);
3541 netdev_dev->tc = tc;
/* Operations table for the "linux-other" class.  It has no kernel qdisc name
 * and cannot be installed (tc_install is NULL); every operation from
 * tc_destroy onward is unimplemented as well.  (The n_queues and tc_load
 * entries are missing from this listing.) */
3545 static const struct tc_ops tc_ops_other = {
3546 NULL, /* linux_name */
3547 "linux-other", /* ovs_name */
3549 NULL, /* tc_install */
3551 NULL, /* tc_destroy */
3552 NULL, /* qdisc_get */
3553 NULL, /* qdisc_set */
3554 NULL, /* class_get */
3555 NULL, /* class_set */
3556 NULL, /* class_delete */
3557 NULL, /* class_get_stats */
3558 NULL /* class_dump_stats */
3561 /* Traffic control. */
3563 /* Number of kernel "tc" ticks per second. */
/* Computed from the first three /proc/net/psched values as a * c / b by the
 * psched-reading code later in this file. */
3564 static double ticks_per_s;
3566 /* Number of kernel "jiffies" per second. This is used for the purpose of
3567 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3568 * one jiffy's worth of data.
3570 * There are two possibilities here:
3572 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3573 * approximate range of 100 to 1024. That means that we really need to
3574 * make sure that the qdisc can buffer that much data.
3576 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3577 * has finely granular timers and there's no need to fudge additional room
3578 * for buffers. (There's no extra effort needed to implement that: the
3579 * large 'buffer_hz' is used as a divisor, so practically any number will
3580 * come out as 0 in the division. Small integer results in the case of
3581 * really high dividends won't have any real effect anyhow.)
3583 static unsigned int buffer_hz;
3585 /* Returns tc handle 'major':'minor'. */
/* 'major' is shifted into the upper 16 bits before being combined with
 * 'minor' by TC_H_MAKE. */
3587 tc_make_handle(unsigned int major, unsigned int minor)
3589 return TC_H_MAKE(major << 16, minor);
3592 /* Returns the major number from 'handle'. */
/* Inverse of the shift in tc_make_handle(): the major part selected by
 * TC_H_MAJ is shifted back down by 16 bits. */
3594 tc_get_major(unsigned int handle)
3596 return TC_H_MAJ(handle) >> 16;
3599 /* Returns the minor number from 'handle'. */
/* The minor part needs no shift; TC_H_MIN is applied directly. */
3601 tc_get_minor(unsigned int handle)
3603 return TC_H_MIN(handle);
/* Starts a tc Netlink request of the given 'type' for 'netdev' in
 * '*request'.
 *
 * The buffer is initialized with an initial size of 512 bytes, given a
 * Netlink header with NLM_F_REQUEST | 'flags', and followed by a zeroed
 * struct tcmsg whose family is AF_UNSPEC and whose ifindex is 'netdev''s.
 * The tcm_handle and tcm_parent fields are left for the caller.  The return
 * type is a pointer to struct tcmsg.
 *
 * NOTE(review): the handling of a get_ifindex() failure is not visible in
 * this listing. */
3606 static struct tcmsg *
3607 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3608 struct ofpbuf *request)
3610 struct tcmsg *tcmsg;
3614 error = get_ifindex(netdev, &ifindex);
3619 ofpbuf_init(request, 512);
3620 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3621 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3622 tcmsg->tcm_family = AF_UNSPEC;
3623 tcmsg->tcm_ifindex = ifindex;
3624 /* Caller should fill in tcmsg->tcm_handle. */
3625 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on rtnl_sock with nl_sock_transact(), passing 'replyp'
 * through for the reply, and then uninitializes 'request' whether or not the
 * transaction succeeded.  The caller therefore must not reuse or uninitialize
 * 'request' afterward. */
3631 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3633 int error = nl_sock_transact(rtnl_sock, request, replyp);
3634 ofpbuf_uninit(request);
/* Implementation notes for tc_add_del_ingress_qdisc(): the request uses
 * RTM_NEWQDISC with NLM_F_EXCL | NLM_F_CREATE when adding and RTM_DELQDISC
 * with no extra flags when deleting.  In both cases it names handle ffff:0,
 * parent TC_H_INGRESS, kind "ingress", and carries an empty TCA_OPTIONS
 * attribute.  When deleting, ENOENT and EINVAL results are singled out for
 * lenient treatment. */
3638 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3639 * policing configuration.
3641 * This function is equivalent to running the following when 'add' is true:
3642 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3644 * This function is equivalent to running the following when 'add' is false:
3645 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3647 * The configuration and stats may be seen with the following command:
3648 * /sbin/tc -s qdisc show dev <devname>
3650 * Returns 0 if successful, otherwise a positive errno value.
3653 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3655 struct ofpbuf request;
3656 struct tcmsg *tcmsg;
3658 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3659 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3661 tcmsg = tc_make_request(netdev, type, flags, &request);
3665 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3666 tcmsg->tcm_parent = TC_H_INGRESS;
3667 nl_msg_put_string(&request, TCA_KIND, "ingress");
3668 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3670 error = tc_transact(&request, NULL);
3672 /* If we're deleting the qdisc, don't worry about some of the
3673 * error conditions. */
3674 if (!add && (error == ENOENT || error == EINVAL)) {
3683 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3686 * This function is equivalent to running:
3687 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3688 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3691 * The configuration and stats may be seen with the following command:
3692 * /sbin/tc -s filter show <devname> eth0 parent ffff:
3694 * Returns 0 if successful, otherwise a positive errno value.
3697 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3699 struct tc_police tc_police;
3700 struct ofpbuf request;
3701 struct tcmsg *tcmsg;
3702 size_t basic_offset;
3703 size_t police_offset;
3707 memset(&tc_police, 0, sizeof tc_police);
3708 tc_police.action = TC_POLICE_SHOT;
3709 tc_police.mtu = mtu;
3710 tc_fill_rate(&tc_police.rate, kbits_rate/8 * 1000, mtu);
3711 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3712 kbits_burst * 1024);
3714 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3715 NLM_F_EXCL | NLM_F_CREATE, &request);
3719 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3720 tcmsg->tcm_info = tc_make_handle(49,
3721 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3723 nl_msg_put_string(&request, TCA_KIND, "basic");
3724 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3725 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3726 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3727 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3728 nl_msg_end_nested(&request, police_offset);
3729 nl_msg_end_nested(&request, basic_offset);
3731 error = tc_transact(&request, NULL);
/* Reads the four hexadecimal values "a b c d" from /proc/net/psched and
 * derives the file-level 'ticks_per_s' from them as a * c / b.  Open and read
 * failures, as well as parameter combinations that the code considers
 * invalid or unexpected, are logged with VLOG_WARN; the raw parameters and
 * the derived values are logged at debug level.
 *
 * NOTE(review): the enclosing function's signature, the validity checks
 * themselves, the assignment of 'buffer_hz', and the closing of 'stream' are
 * not visible in this listing. */
3742 /* The values in psched are not individually very meaningful, but they are
3743 * important. The tables below show some values seen in the wild.
3747 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3748 * (Before that, there are hints that it was 1000000000.)
3750 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3754 * -----------------------------------
3755 * [1] 000c8000 000f4240 000f4240 00000064
3756 * [2] 000003e8 00000400 000f4240 3b9aca00
3757 * [3] 000003e8 00000400 000f4240 3b9aca00
3758 * [4] 000003e8 00000400 000f4240 00000064
3759 * [5] 000003e8 00000040 000f4240 3b9aca00
3760 * [6] 000003e8 00000040 000f4240 000000f9
3762 * a b c d ticks_per_s buffer_hz
3763 * ------- --------- ---------- ------------- ----------- -------------
3764 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3765 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3766 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3767 * [4] 1,000 1,024 1,000,000 100 976,562 100
3768 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3769 * [6] 1,000 64 1,000,000 249 15,625,000 249
3771 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3772 * [2] 2.6.26-1-686-bigmem from Debian lenny
3773 * [3] 2.6.26-2-sparc64 from Debian lenny
3774 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3775 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3776 * [6] 2.6.34 from kernel.org on KVM
3778 static const char fn[] = "/proc/net/psched";
3779 unsigned int a, b, c, d;
3785 stream = fopen(fn, "r");
3787 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3791 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3792 VLOG_WARN("%s: read failed", fn);
3796 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3800 VLOG_WARN("%s: invalid scheduler parameters", fn);
3804 ticks_per_s = (double) a * c / b;
3808 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3811 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3814 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3815 * rate of 'rate' bytes per second. */
3817 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
/* Widen before multiplying: the product of two 'unsigned int' values wraps
 * modulo 2**32 at ordinary link rates (e.g. 125,000,000 bytes/s times a few
 * dozen ticks), which would yield a far too small byte count.  The same
 * widening is used in tc_bytes_to_ticks(). */
3822 return ((unsigned long long int) rate * ticks) / ticks_per_s;
3825 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3826 * rate of 'rate' bytes per second. */
/* A 'rate' of 0 yields 0 instead of dividing by zero.  'ticks_per_s' is
 * converted to an unsigned 64-bit integer first (dropping any fractional
 * part), so the multiplication by 'size' is done in 64 bits. */
3828 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3833 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3836 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3837 * a transmission rate of 'rate' bytes per second. */
/* Integer division: with a very large 'buffer_hz' the result is 0, which is
 * the intended outcome described in the comment on 'buffer_hz'.
 * NOTE(review): nothing visible here guards against 'buffer_hz' being 0 --
 * confirm that it is always initialized to a nonzero value first. */
3839 tc_buffer_per_jiffy(unsigned int rate)
3844 return rate / buffer_hz;
3847 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3848 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3849 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3850 * stores NULL into it if it is absent.
3852 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3855 * Returns 0 if successful, otherwise a positive errno value. */
3857 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3858 struct nlattr **options)
/* TCA_KIND is mandatory; TCA_OPTIONS is optional and must be a nested
 * attribute when present. */
3860 static const struct nl_policy tca_policy[] = {
3861 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3862 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3864 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
/* Attributes start after the Netlink header and the struct tcmsg. */
3866 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3867 tca_policy, ta, ARRAY_SIZE(ta))) {
3868 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3873 *kind = nl_attr_get_string(ta[TCA_KIND]);
3877 *options = ta[TCA_OPTIONS];
3892 /* Given Netlink 'msg' that describes a class, extracts the class handle (the
3893 * tcm_handle field of its struct tcmsg, stored unmodified) into '*handlep', its TCA_OPTIONS attribute
3894 * into '*options', and its queue statistics into '*stats'. Any of the output
3895 * arguments may be null.
 *
 * '*stats' receives tx_bytes and tx_packets from TCA_STATS_BASIC and
 * tx_errors from the drop counter in TCA_STATS_QUEUE.
 *
3897 * Returns 0 if successful, otherwise a positive errno value. */
3899 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3900 struct nlattr **options, struct netdev_queue_stats *stats)
/* Both attributes are mandatory here: a class message that lacks either
 * TCA_OPTIONS or TCA_STATS2 fails to parse. */
3902 static const struct nl_policy tca_policy[] = {
3903 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3904 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3906 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3908 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3909 tca_policy, ta, ARRAY_SIZE(ta))) {
3910 VLOG_WARN_RL(&rl, "failed to parse class message");
/* The struct tcmsg sits immediately after the Netlink header. */
3915 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3916 *handlep = tc->tcm_handle;
3920 *options = ta[TCA_OPTIONS];
3924 const struct gnet_stats_queue *gsq;
3925 struct gnet_stats_basic gsb;
/* Policy for the attributes nested inside TCA_STATS2; both are required and
 * must be at least as large as the structures they are read into. */
3927 static const struct nl_policy stats_policy[] = {
3928 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3929 .min_len = sizeof gsb },
3930 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3931 .min_len = sizeof *gsq },
3933 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3935 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3936 sa, ARRAY_SIZE(sa))) {
3937 VLOG_WARN_RL(&rl, "failed to parse class stats");
3941 /* Alignment issues screw up the length of struct gnet_stats_basic on
3942 * some arch/bitsize combinations. Newer versions of Linux have a
3943 * struct gnet_stats_basic_packed, but we can't depend on that. The
3944 * easiest thing to do is just to make a copy. */
/* 'gsb' is zeroed first and at most sizeof gsb bytes are copied, so the copy
 * never overruns 'gsb' whatever the attribute's actual length. */
3945 memset(&gsb, 0, sizeof gsb);
3946 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3947 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3948 stats->tx_bytes = gsb.bytes;
3949 stats->tx_packets = gsb.packets;
/* The queue's drop count is what gets reported as tx_errors. */
3951 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3952 stats->tx_errors = gsq->drops;
/* Zero-fills '*stats'; presumably reached on the failure path so that callers
 * never see partially filled statistics -- TODO confirm. */
3962 memset(stats, 0, sizeof *stats);
3967 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
 * on 'netdev' using an RTM_GETTCLASS request.  'replyp' is handed to
 * tc_transact() to receive the reply.  The result of tc_transact() is kept in
 * 'error'; the rate-limited warning below reports a failed query together
 * with the major:minor parts of both handles. */
3970 tc_query_class(const struct netdev *netdev,
3971 unsigned int handle, unsigned int parent,
3972 struct ofpbuf **replyp)
3974 struct ofpbuf request;
3975 struct tcmsg *tcmsg;
3978 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3982 tcmsg->tcm_handle = handle;
3983 tcmsg->tcm_parent = parent;
3985 error = tc_transact(&request, replyp);
3987 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3988 netdev_get_name(netdev),
3989 tc_get_major(handle), tc_get_minor(handle),
3990 tc_get_major(parent), tc_get_minor(parent),
3996 /* Equivalent to "tc class del dev <name> handle <handle>". */
/* Builds an RTM_DELTCLASS request for 'handle' with a zero parent and sends it
 * through tc_transact() without asking for a reply (NULL reply pointer).  The
 * rate-limited warning below reports a failed deletion together with the
 * major:minor parts of 'handle'. */
3998 tc_delete_class(const struct netdev *netdev, unsigned int handle)
4000 struct ofpbuf request;
4001 struct tcmsg *tcmsg;
4004 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
4008 tcmsg->tcm_handle = handle;
4009 tcmsg->tcm_parent = 0;
4011 error = tc_transact(&request, NULL);
4013 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4014 netdev_get_name(netdev),
4015 tc_get_major(handle), tc_get_minor(handle),
4021 /* Equivalent to "tc qdisc del dev <name> root". */
/* Besides sending the RTM_DELQDISC request, this drops the traffic-control
 * state cached in the device ('netdev_dev->tc') once the deletion has
 * succeeded, so the cached state no longer describes a qdisc that is gone. */
4023 tc_del_qdisc(struct netdev *netdev)
4025 struct netdev_dev_linux *netdev_dev =
4026 netdev_dev_linux_cast(netdev_get_dev(netdev));
4027 struct ofpbuf request;
4028 struct tcmsg *tcmsg;
4031 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
/* Address the qdisc with handle 1:0 attached at the root. */
4035 tcmsg->tcm_handle = tc_make_handle(1, 0);
4036 tcmsg->tcm_parent = TC_H_ROOT;
4038 error = tc_transact(&request, NULL);
4039 if (error == EINVAL) {
4040 /* EINVAL probably means that the default qdisc was in use, in which
4041 * case we've accomplished our purpose. */
/* The tc_destroy hook is optional, hence the null check before calling it. */
4044 if (!error && netdev_dev->tc) {
4045 if (netdev_dev->tc->ops->tc_destroy) {
4046 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
4048 netdev_dev->tc = NULL;
4053 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4054 * kernel to determine what they are. Returns 0 if successful, otherwise a
4055 * positive errno value. */
/* The tc implementation ('ops') is chosen from the outcome of an RTM_GETQDISC
 * query: a parsed qdisc kind is looked up with tc_lookup_linux_name(), ENOENT
 * selects 'tc_ops_default', and the remaining branches fall back to
 * 'tc_ops_other'.  The chosen implementation's tc_load() hook is then run on
 * the reply. */
4057 tc_query_qdisc(const struct netdev *netdev)
4059 struct netdev_dev_linux *netdev_dev =
4060 netdev_dev_linux_cast(netdev_get_dev(netdev));
4061 struct ofpbuf request, *qdisc;
4062 const struct tc_ops *ops;
4063 struct tcmsg *tcmsg;
/* A non-null 'tc' means the qdisc state has already been loaded. */
4067 if (netdev_dev->tc) {
4071 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4072 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4073 * 2.6.35 without that fix backported to it.
4075 * To avoid the OOPS, we must not make a request that would attempt to dump
4076 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4077 * few others. There are a few ways that I can see to do this, but most of
4078 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4079 * technique chosen here is to assume that any non-default qdisc that we
4080 * create will have a class with handle 1:0. The built-in qdiscs only have
4081 * a class with handle 0:0.
4083 * We could check for Linux 2.6.35+ and use a more straightforward method
 * on those kernels. */
4085 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
4089 tcmsg->tcm_handle = tc_make_handle(1, 0);
4090 tcmsg->tcm_parent = 0;
4092 /* Figure out what tc class to instantiate. */
4093 error = tc_transact(&request, &qdisc);
4097 error = tc_parse_qdisc(qdisc, &kind, NULL);
4099 ops = &tc_ops_other;
4101 ops = tc_lookup_linux_name(kind);
/* 'rl2' is a rate limiter private to the "unknown qdisc" message. */
4103 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4104 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4106 ops = &tc_ops_other;
4109 } else if (error == ENOENT) {
4110 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4111 * other entity that doesn't have a handle 1:0. We will assume
4112 * that it's the system default qdisc. */
4113 ops = &tc_ops_default;
4116 /* Who knows? Maybe the device got deleted. */
4117 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4118 netdev_get_name(netdev), strerror(error));
4119 ops = &tc_ops_other;
4122 /* Instantiate it. */
/* tc_load() is expected to set 'netdev_dev->tc' exactly when it succeeds; the
 * assertion enforces that contract.  The reply is freed afterwards, and a
 * query error takes precedence over a load error in the return value. */
4123 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev), qdisc);
4124 assert((load_error == 0) == (netdev_dev->tc != NULL));
4125 ofpbuf_delete(qdisc);
4127 return error ? error : load_error;
4130 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4131 approximate the time to transmit packets of various lengths. For an MTU of
4132 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4133 represents two possible packet lengths; for a MTU of 513 through 1024, four
4134 possible lengths; and so on.
4136 Returns, for the specified 'mtu', the number of bits that packet lengths
4137 need to be shifted right to fit within such a 256-entry table. */
4139 tc_calc_cell_log(unsigned int mtu)
/* Fallback MTU; presumably applied when the caller passed no MTU -- TODO
 * confirm the guarding condition. */
4144 mtu = ETH_PAYLOAD_MAX;
/* Size the table for the largest frame, not just its payload: add the
 * Ethernet and VLAN header lengths. */
4146 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
/* Count one shift for every pass made while the frame size is still 256 or
 * more. */
4148 for (cell_log = 0; mtu >= 256; cell_log++) {
4155 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
 * of 'mtu'.  '*rate' is zeroed first, so every field that is not assigned
 * explicitly below ends up as 0; 'cell_log' comes from tc_calc_cell_log() and
 * the minimum packet unit ('mpu') is set to ETH_TOTAL_MIN. */
4158 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4160 memset(rate, 0, sizeof *rate);
4161 rate->cell_log = tc_calc_cell_log(mtu);
4162 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4163 /* rate->cell_align = 0; */ /* distro headers. */
4164 rate->mpu = ETH_TOTAL_MIN;
4168 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4169 * attribute of the specified "type".
4171 * See tc_calc_cell_log() above for a description of "rtab"s. */
4173 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
/* Reserve TC_RTAB_SIZE uninitialized bytes in 'msg'; the loop below fills in
 * every entry. */
4178 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4179 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
/* Entry 'i' is computed for a packet of (i + 1) << cell_log bytes, raised to
 * 'mpu' when smaller, and holds that packet's transmission time in ticks. */
4180 unsigned packet_size = (i + 1) << rate->cell_log;
4181 if (packet_size < rate->mpu) {
4182 packet_size = rate->mpu;
4184 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
4188 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
4189 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
4190 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
 * 0 is fine: the minimum computed below is used instead.)
 *
 * The burst is never smaller than tc_buffer_per_jiffy(Bps) plus one 'mtu', and
 * the result is expressed in ticks via tc_bytes_to_ticks().
 *
 * NOTE(review): MAX() yields a 64-bit value here, but tc_bytes_to_ticks()
 * takes its size as unsigned int, so a 'burst_bytes' above UINT_MAX is
 * silently narrowed -- TODO confirm whether callers can pass such values. */
4193 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
4195 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
4196 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
4199 /* Linux-only functions declared in netdev-linux.h */
4201 /* Returns a fd for an AF_INET socket or a negative errno value. */
/* netdev_linux_init() is run first; a nonzero result from it is negated and
 * returned in place of 'af_inet_sock'. */
4203 netdev_linux_get_af_inet_sock(void)
4205 int error = netdev_linux_init();
4206 return error ? -error : af_inet_sock;
4209 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4210 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
/* Works in three ethtool round trips: read the current flags word
 * (ETHTOOL_GFLAGS), write it back with 'flag' set or cleared
 * (ETHTOOL_SFLAGS), then read it again and warn when the value read back
 * differs from the one requested.  In the code shown, 'flag_name' is used
 * only for that warning. */
4212 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4213 const char *flag_name, bool enable)
4215 const char *netdev_name = netdev_get_name(netdev);
4216 struct ethtool_value evalue;
/* Step 1: fetch the current flags into 'evalue.data'. */
4220 COVERAGE_INC(netdev_get_ethtool);
4221 memset(&evalue, 0, sizeof evalue);
4222 error = netdev_linux_do_ethtool(netdev_name,
4223 (struct ethtool_cmd *)&evalue,
4224 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* Step 2: change only the 'flag' bit and remember the requested value in
 * 'new_flags'. */
4229 COVERAGE_INC(netdev_set_ethtool);
4230 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4231 error = netdev_linux_do_ethtool(netdev_name,
4232 (struct ethtool_cmd *)&evalue,
4233 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
/* Step 3: read the flags back into a freshly zeroed 'evalue'. */
4238 COVERAGE_INC(netdev_get_ethtool);
4239 memset(&evalue, 0, sizeof evalue);
4240 error = netdev_linux_do_ethtool(netdev_name,
4241 (struct ethtool_cmd *)&evalue,
4242 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* A mismatch means the change did not stick. */
4247 if (new_flags != evalue.data) {
4248 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4249 "device %s failed", enable ? "enable" : "disable",
4250 flag_name, netdev_name);
4257 /* Utility functions. */
4259 /* Copies 'src' into 'dst', performing format conversion in the process. */
/* Every counter is assigned individually, field by field, to the member of
 * the same name in 'dst'; any difference in integer width between the two
 * structures is handled by the implicit conversion of each assignment.
 * (Presumably the kernel structure's counters are narrower than those of
 * struct netdev_stats -- TODO confirm against linux/if_link.h.) */
4261 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4262 const struct rtnl_link_stats *src)
4264 dst->rx_packets = src->rx_packets;
4265 dst->tx_packets = src->tx_packets;
4266 dst->rx_bytes = src->rx_bytes;
4267 dst->tx_bytes = src->tx_bytes;
4268 dst->rx_errors = src->rx_errors;
4269 dst->tx_errors = src->tx_errors;
4270 dst->rx_dropped = src->rx_dropped;
4271 dst->tx_dropped = src->tx_dropped;
4272 dst->multicast = src->multicast;
4273 dst->collisions = src->collisions;
4274 dst->rx_length_errors = src->rx_length_errors;
4275 dst->rx_over_errors = src->rx_over_errors;
4276 dst->rx_crc_errors = src->rx_crc_errors;
4277 dst->rx_frame_errors = src->rx_frame_errors;
4278 dst->rx_fifo_errors = src->rx_fifo_errors;
4279 dst->rx_missed_errors = src->rx_missed_errors;
4280 dst->tx_aborted_errors = src->tx_aborted_errors;
4281 dst->tx_carrier_errors = src->tx_carrier_errors;
4282 dst->tx_fifo_errors = src->tx_fifo_errors;
4283 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4284 dst->tx_window_errors = src->tx_window_errors;
/* Retrieves link statistics for the interface with index 'ifindex' by sending
 * an RTM_GETLINK request over 'rtnl_sock' and converting the IFLA_STATS
 * attribute of the reply into '*stats'.  The reply buffer is released on each
 * of the paths shown: after a parse failure, when the statistics attribute is
 * missing, and after a successful conversion. */
4288 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4290 /* Policy for RTNLGRP_LINK messages.
4292 * There are *many* more fields in these messages, but currently we only
4293 * care about these fields. */
4294 static const struct nl_policy rtnlgrp_link_policy[] = {
4295 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4296 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4297 .min_len = sizeof(struct rtnl_link_stats) },
4300 struct ofpbuf request;
4301 struct ofpbuf *reply;
4302 struct ifinfomsg *ifi;
4303 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
/* Build the request: a Netlink header followed by a zeroed struct ifinfomsg
 * that names the interface by index. */
4306 ofpbuf_init(&request, 0);
4307 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4308 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4309 ifi->ifi_family = PF_UNSPEC;
4310 ifi->ifi_index = ifindex;
/* The request buffer is no longer needed once the transaction has run. */
4311 error = nl_sock_transact(rtnl_sock, &request, &reply);
4312 ofpbuf_uninit(&request);
4317 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4318 rtnlgrp_link_policy,
4319 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4320 ofpbuf_delete(reply);
/* IFLA_STATS is optional in the policy, so its presence is checked
 * separately. */
4324 if (!attrs[IFLA_STATS]) {
4325 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4326 ofpbuf_delete(reply);
4330 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4332 ofpbuf_delete(reply);
/* Retrieves statistics for the device named 'netdev_name' by scanning
 * "/proc/net/dev" one line at a time.  Each parsed line stores its counters
 * straight into '*stats'; when the line's device name equals 'netdev_name',
 * the counters listed at the end are additionally set to UINT64_MAX
 * (presumably because that file has no column for them -- TODO confirm).
 *
 * NOTE(review): because every successfully parsed line writes into '*stats',
 * the structure holds another device's counters whenever no line matches
 * 'netdev_name'; callers should rely on '*stats' only after success. */
4338 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4340 static const char fn[] = "/proc/net/dev";
4345 stream = fopen(fn, "r");
4347 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4352 while (fgets(line, sizeof line, stream)) {
/* X64 is a scanf conversion for one uint64_t field; "%*u" reads an unsigned
 * field and discards it. */
4355 #define X64 "%"SCNu64
4358 X64 X64 X64 X64 X64 X64 X64 "%*u"
4359 X64 X64 X64 X64 X64 X64 X64 "%*u",
4365 &stats->rx_fifo_errors,
4366 &stats->rx_frame_errors,
4372 &stats->tx_fifo_errors,
4374 &stats->tx_carrier_errors) != 15) {
4375 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4376 } else if (!strcmp(devname, netdev_name)) {
4377 stats->rx_length_errors = UINT64_MAX;
4378 stats->rx_over_errors = UINT64_MAX;
4379 stats->rx_crc_errors = UINT64_MAX;
4380 stats->rx_missed_errors = UINT64_MAX;
4381 stats->tx_aborted_errors = UINT64_MAX;
4382 stats->tx_heartbeat_errors = UINT64_MAX;
4383 stats->tx_window_errors = UINT64_MAX;
4389 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Fetches the interface flags of 'dev' with a SIOCGIFFLAGS ioctl issued on
 * 'dev->name' and stores the resulting 'ifr_flags' value into '*flags'.  The
 * result of netdev_linux_do_ioctl() is kept in 'error'. */
4395 get_flags(const struct netdev_dev *dev, unsigned int *flags)
4401 error = netdev_linux_do_ioctl(dev->name, &ifr, SIOCGIFFLAGS,
4404 *flags = ifr.ifr_flags;
/* Sets the interface flags of 'netdev' to 'flags' with a SIOCSIFFLAGS ioctl
 * and returns whatever netdev_linux_do_ioctl() returns. */
4410 set_flags(struct netdev *netdev, unsigned int flags)
4414 ifr.ifr_flags = flags;
4415 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
/* Looks up the interface index of the device named 'netdev_name' with a
 * SIOCGIFINDEX ioctl on 'af_inet_sock', without any caching.  Returns the
 * index reported by the kernel on success.  A failing ioctl is logged with a
 * rate-limited warning; get_ifindex() treats a negative result from this
 * function as a negated errno value. */
4420 do_get_ifindex(const char *netdev_name)
4424 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4425 COVERAGE_INC(netdev_get_ifindex);
4426 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4427 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4428 netdev_name, strerror(errno));
4431 return ifr.ifr_ifindex;
/* Stores the interface index of 'netdev_' into '*ifindexp' and returns the
 * error recorded for the lookup (0 when it succeeded).
 *
 * The lookup result is cached in the device: while the VALID_IFINDEX bit is
 * set in 'cache_valid', do_get_ifindex() is not called again.  Both outcomes
 * are cached -- on failure the negated result is kept in 'get_ifindex_error'
 * and the cached index is 0. */
4435 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4437 struct netdev_dev_linux *netdev_dev =
4438 netdev_dev_linux_cast(netdev_get_dev(netdev_));
4440 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4441 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4444 netdev_dev->get_ifindex_error = -ifindex;
4445 netdev_dev->ifindex = 0;
4447 netdev_dev->get_ifindex_error = 0;
4448 netdev_dev->ifindex = ifindex;
4450 netdev_dev->cache_valid |= VALID_IFINDEX;
4453 *ifindexp = netdev_dev->ifindex;
4454 return netdev_dev->get_ifindex_error;
/* Reads the hardware address of the device named 'netdev_name' with a
 * SIOCGIFHWADDR ioctl on 'af_inet_sock' and copies its first ETH_ADDR_LEN
 * bytes into 'ea'.
 *
 * An address family other than AF_UNSPEC or ARPHRD_ETHER is reported with a
 * warning.  NOTE(review): the copy into 'ea' appears to happen regardless of
 * that warning -- TODO confirm this is intended. */
4458 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4463 memset(&ifr, 0, sizeof ifr);
4464 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4465 COVERAGE_INC(netdev_get_hwaddr);
4466 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4467 /* ENODEV probably means that a vif disappeared asynchronously and
4468 * hasn't been removed from the database yet, so reduce the log level
4469 * to INFO for that case. */
4470 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4471 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4472 netdev_name, strerror(errno));
4475 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4476 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4477 VLOG_WARN("%s device has unknown hardware address family %d",
4478 netdev_name, hwaddr_family);
4480 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware address of the device named 'netdev_name' to 'mac' with a
 * SIOCSIFHWADDR ioctl on 'af_inet_sock'.  The request is built in a zeroed
 * struct ifreq with the address family set to ARPHRD_ETHER; a failing ioctl
 * is logged at error level. */
4485 set_etheraddr(const char *netdev_name,
4486 const uint8_t mac[ETH_ADDR_LEN])
4490 memset(&ifr, 0, sizeof ifr);
4491 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4492 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4493 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4494 COVERAGE_INC(netdev_set_hwaddr);
4495 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4496 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4497 netdev_name, strerror(errno));
/* Issues a SIOCETHTOOL ioctl on 'af_inet_sock' for the device named 'name',
 * passing 'ecmd' as the request's data pointer.  'cmd_name' is the
 * human-readable command name used in the log message.
 *
 * Failures are logged with a rate-limited warning, except for EOPNOTSUPP,
 * which is left unlogged because unsupported operations are common. */
4504 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4505 int cmd, const char *cmd_name)
4509 memset(&ifr, 0, sizeof ifr);
4510 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4511 ifr.ifr_data = (caddr_t) ecmd;
4514 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4517 if (errno != EOPNOTSUPP) {
4518 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4519 "failed: %s", cmd_name, name, strerror(errno));
4521 /* The device doesn't support this operation. That's pretty
4522 * common, so there's no point in logging anything. */
/* Copies 'name' into 'ifr->ifr_name' and issues the ioctl 'cmd' on
 * 'af_inet_sock' with 'ifr' as its argument; the caller fills in any other
 * request fields beforehand.  'cmd_name' is the human-readable command name
 * used in the rate-limited debug message logged when the ioctl fails. */
4529 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4530 const char *cmd_name)
4532 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4533 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4534 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
/* Runs the address-returning ioctl 'cmd' (named 'cmd_name' for logging) on
 * 'netdev' through netdev_linux_do_ioctl(), with the request's address family
 * set to AF_INET, and copies the IPv4 address from the returned 'ifr_addr'
 * into '*ip'.  The address is read by viewing 'ifr_addr' as a struct
 * sockaddr_in. */
4542 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4543 int cmd, const char *cmd_name)
4548 ifr.ifr_addr.sa_family = AF_INET;
4549 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4551 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4552 *ip = sin->sin_addr;
4557 /* Returns an AF_PACKET raw socket or a negative errno value. */
4559 af_packet_sock(void)
4561 static int sock = INT_MIN;
4563 if (sock == INT_MIN) {
4564 sock = socket(AF_PACKET, SOCK_RAW, 0);
4566 set_nonblocking(sock);
4569 VLOG_ERR("failed to create packet socket: %s", strerror(errno));