2 * Copyright (c) 2009, 2010, 2011, 2012 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
24 #include <arpa/inet.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <linux/version.h>
37 #include <sys/types.h>
38 #include <sys/ioctl.h>
39 #include <sys/socket.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
52 #include "dpif-linux.h"
53 #include "dynamic-string.h"
54 #include "fatal-signal.h"
57 #include "netdev-provider.h"
58 #include "netdev-vport.h"
60 #include "netlink-notifier.h"
61 #include "netlink-socket.h"
63 #include "openflow/openflow.h"
65 #include "poll-loop.h"
66 #include "rtnetlink-link.h"
67 #include "socket-util.h"
73 VLOG_DEFINE_THIS_MODULE(netdev_linux);
75 COVERAGE_DEFINE(netdev_set_policing);
76 COVERAGE_DEFINE(netdev_arp_lookup);
77 COVERAGE_DEFINE(netdev_get_ifindex);
78 COVERAGE_DEFINE(netdev_get_hwaddr);
79 COVERAGE_DEFINE(netdev_set_hwaddr);
80 COVERAGE_DEFINE(netdev_get_ethtool);
81 COVERAGE_DEFINE(netdev_set_ethtool);
84 /* These were introduced in Linux 2.6.14, so they might be missing if we have
85 * old headers. */
86 #ifndef ADVERTISED_Pause
87 #define ADVERTISED_Pause (1 << 13)
89 #ifndef ADVERTISED_Asym_Pause
90 #define ADVERTISED_Asym_Pause (1 << 14)
93 /* These were introduced in Linux 2.6.24, so they might be missing if we
94 * have old headers. */
95 #ifndef ETHTOOL_GFLAGS
96 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
98 #ifndef ETHTOOL_SFLAGS
99 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
102 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
103 * headers. */
105 #define TC_RTAB_SIZE 1024
108 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
109 static int cache_notifier_refcount;
112 VALID_IFINDEX = 1 << 0,
113 VALID_ETHERADDR = 1 << 1,
117 VALID_POLICING = 1 << 5,
118 VALID_VPORT_STAT_ERROR = 1 << 6,
119 VALID_DRVINFO = 1 << 7,
120 VALID_FEATURES = 1 << 8,
128 /* Traffic control. */
130 /* An instance of a traffic control class. Always associated with a particular
131 * network device.
132 *
133 * Each TC implementation subclasses this with whatever additional data it
134 * needs. */
136 const struct tc_ops *ops;
137 struct hmap queues; /* Contains "struct tc_queue"s.
138 * Read by generic TC layer.
139 * Written only by TC implementation. */
142 /* One traffic control queue.
143 *
144 * Each TC implementation subclasses this with whatever additional data it
145 * needs. */
147 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
148 unsigned int queue_id; /* OpenFlow queue ID. */
151 /* A particular kind of traffic control. Each implementation generally maps to
152 * one particular Linux qdisc class.
154 * The functions below return 0 if successful or a positive errno value on
155 * failure, except where otherwise noted. All of them must be provided, except
156 * where otherwise noted. */
158 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
159 * This is null for tc_ops_default and tc_ops_other, for which there are no
160 * appropriate values. */
161 const char *linux_name;
163 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
164 const char *ovs_name;
166 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
167 * queues. The queues are numbered 0 through n_queues - 1. */
168 unsigned int n_queues;
170 /* Called to install this TC class on 'netdev'. The implementation should
171 * make the Netlink calls required to set up 'netdev' with the right qdisc
172 * and configure it according to 'details'. The implementation may assume
173 * that the current qdisc is the default; that is, there is no need for it
174 * to delete the current qdisc before installing itself.
176 * The contents of 'details' should be documented as valid for 'ovs_name'
177 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
178 * (which is built as ovs-vswitchd.conf.db(8)).
180 * This function must return 0 if and only if it sets 'netdev->tc' to an
181 * initialized 'struct tc'.
183 * (This function is null for tc_ops_other, which cannot be installed. For
184 * other TC classes it should always be nonnull.) */
185 int (*tc_install)(struct netdev *netdev, const struct smap *details);
187 /* Called when the netdev code determines (through a Netlink query) that
188 * this TC class's qdisc is installed on 'netdev', but we didn't install
189 * it ourselves and so don't know any of the details.
191 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
192 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
193 * implementation should parse the other attributes of 'nlmsg' as
194 * necessary to determine its configuration. If necessary it should also
195 * use Netlink queries to determine the configuration of queues on
196 * 'netdev'.
198 * This function must return 0 if and only if it sets 'netdev->tc' to an
199 * initialized 'struct tc'. */
200 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
202 /* Destroys the data structures allocated by the implementation as part of
203 * 'tc'. (This includes destroying 'tc->queues' by calling
204 * tc_destroy(tc).)
206 * The implementation should not need to perform any Netlink calls. If
207 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
208 * (But it may not be desirable.)
210 * This function may be null if 'tc' is trivial. */
211 void (*tc_destroy)(struct tc *tc);
213 /* Retrieves details of 'netdev->tc' configuration into 'details'.
215 * The implementation should not need to perform any Netlink calls, because
216 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
217 * cached the configuration.
219 * The contents of 'details' should be documented as valid for 'ovs_name'
220 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
221 * (which is built as ovs-vswitchd.conf.db(8)).
223 * This function may be null if 'tc' is not configurable.
224 */
225 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
227 /* Reconfigures 'netdev->tc' according to 'details', performing any
228 * required Netlink calls to complete the reconfiguration.
230 * The contents of 'details' should be documented as valid for 'ovs_name'
231 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
232 * (which is built as ovs-vswitchd.conf.db(8)).
234 * This function may be null if 'tc' is not configurable.
235 */
236 int (*qdisc_set)(struct netdev *, const struct smap *details);
238 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
239 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
241 * The contents of 'details' should be documented as valid for 'ovs_name'
242 * in the "other_config" column in the "Queue" table in
243 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
245 * The implementation should not need to perform any Netlink calls, because
246 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
247 * cached the queue configuration.
249 * This function may be null if 'tc' does not have queues ('n_queues' is
250 * 0). */
251 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
252 struct smap *details);
254 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
255 * 'details', performing any required Netlink calls to complete the
256 * reconfiguration. The caller ensures that 'queue_id' is less than
257 * 'n_queues'.
259 * The contents of 'details' should be documented as valid for 'ovs_name'
260 * in the "other_config" column in the "Queue" table in
261 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
263 * This function may be null if 'tc' does not have queues or its queues are
264 * not configurable. */
265 int (*class_set)(struct netdev *, unsigned int queue_id,
266 const struct smap *details);
268 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
269 * tc_queue's within 'netdev->tc->queues'.
271 * This function may be null if 'tc' does not have queues or its queues
272 * cannot be deleted. */
273 int (*class_delete)(struct netdev *, struct tc_queue *queue);
275 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
276 * 'struct tc_queue's within 'netdev->tc->queues'.
278 * On success, initializes '*stats'.
280 * This function may be null if 'tc' does not have queues or if it cannot
281 * report queue statistics. */
282 int (*class_get_stats)(const struct netdev *netdev,
283 const struct tc_queue *queue,
284 struct netdev_queue_stats *stats);
286 /* Extracts queue stats from 'nlmsg', which is a response to a
287 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
289 * This function may be null if 'tc' does not have queues or if it cannot
290 * report queue statistics. */
291 int (*class_dump_stats)(const struct netdev *netdev,
292 const struct ofpbuf *nlmsg,
293 netdev_dump_queue_stats_cb *cb, void *aux);
/* Prepares 'tc' for use by setting up 'tc->queues' as an empty hash map.
 * NOTE(review): 'ops' is presumably stored into 'tc' on a line that is not
 * visible here -- confirm. */
297 tc_init(struct tc *tc, const struct tc_ops *ops)
300 hmap_init(&tc->queues);
/* Releases the storage owned by the 'tc->queues' hash map. */
304 tc_destroy(struct tc *tc)
306 hmap_destroy(&tc->queues);
309 static const struct tc_ops tc_ops_htb;
310 static const struct tc_ops tc_ops_hfsc;
311 static const struct tc_ops tc_ops_default;
312 static const struct tc_ops tc_ops_other;
314 static const struct tc_ops *tcs[] = {
315 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
316 &tc_ops_hfsc, /* Hierarchical fair service curve. */
317 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
318 &tc_ops_other, /* Some other qdisc. */
322 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
323 static unsigned int tc_get_major(unsigned int handle);
324 static unsigned int tc_get_minor(unsigned int handle);
326 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
327 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
328 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
330 static struct tcmsg *tc_make_request(const struct netdev *, int type,
331 unsigned int flags, struct ofpbuf *);
332 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
333 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
334 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
337 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
338 struct nlattr **options);
339 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
340 struct nlattr **options,
341 struct netdev_queue_stats *);
342 static int tc_query_class(const struct netdev *,
343 unsigned int handle, unsigned int parent,
344 struct ofpbuf **replyp);
345 static int tc_delete_class(const struct netdev *, unsigned int handle);
347 static int tc_del_qdisc(struct netdev *netdev);
348 static int tc_query_qdisc(const struct netdev *netdev);
350 static int tc_calc_cell_log(unsigned int mtu);
351 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
352 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
353 const struct tc_ratespec *rate);
354 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
/* Linux-specific per-device state.  Embeds the generic 'struct netdev_dev'
 * so that netdev_dev_linux_cast() can recover this structure from it. */
356 struct netdev_dev_linux {
357 struct netdev_dev netdev_dev;
359 struct shash_node *shash_node;
360 unsigned int cache_valid; /* Bitmap of VALID_* bits for the cached fields. */
361 unsigned int change_seq;
363 bool miimon; /* Link status of last poll. */
364 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
365 struct timer miimon_timer; /* Its expiry gates the next MII poll. */
367 /* The following are figured out "on demand" only. They are only valid
368 * when the corresponding VALID_* bit in 'cache_valid' is set. */
370 uint8_t etheraddr[ETH_ADDR_LEN];
371 struct in_addr address, netmask;
374 unsigned int ifi_flags; /* Last seen interface flags (IFF_RUNNING etc.). */
375 long long int carrier_resets; /* Bumped when IFF_RUNNING changes. */
376 uint32_t kbits_rate; /* Policing data. */
377 uint32_t kbits_burst;
378 int vport_stats_error; /* Cached error code from vport_get_stats().
379 0 or an errno value. */
380 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
381 int ether_addr_error; /* Cached error code from set/get etheraddr. */
382 int netdev_policing_error; /* Cached error code from set policing. */
383 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
384 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
386 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
387 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
388 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
389 enum netdev_features peer; /* Cached from ETHTOOL_GSET. */
391 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
395 struct tap_state tap;
399 struct netdev_linux {
400 struct netdev netdev;
404 /* Sockets used for ioctl operations. */
405 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
407 /* A Netlink routing socket that is not subscribed to any multicast groups. */
408 static struct nl_sock *rtnl_sock;
410 /* This is set pretty low because we probably won't learn anything from the
411 * additional log messages. */
412 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
414 static int netdev_linux_init(void);
416 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
417 int cmd, const char *cmd_name);
418 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
419 const char *cmd_name);
420 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
421 int cmd, const char *cmd_name);
422 static int get_flags(const struct netdev_dev *, unsigned int *flags);
423 static int set_flags(struct netdev *, unsigned int flags);
424 static int do_get_ifindex(const char *netdev_name);
425 static int get_ifindex(const struct netdev *, int *ifindexp);
426 static int do_set_addr(struct netdev *netdev,
427 int ioctl_nr, const char *ioctl_name,
428 struct in_addr addr);
429 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
430 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
431 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
432 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
433 static int af_packet_sock(void);
434 static void netdev_linux_miimon_run(void);
435 static void netdev_linux_miimon_wait(void);
/* Returns true if 'netdev_class' uses netdev_linux_init() as its 'init'
 * callback, which this file uses as the test for "is a Linux netdev class". */
438 is_netdev_linux_class(const struct netdev_class *netdev_class)
440 return netdev_class->init == netdev_linux_init;
/* Converts generic 'netdev_dev' into the 'struct netdev_dev_linux' that
 * embeds it.  Asserts that the device's class is a Linux netdev class, so a
 * device of any other class aborts here instead of being misinterpreted. */
443 static struct netdev_dev_linux *
444 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
446 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
447 assert(is_netdev_linux_class(netdev_class));
449 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
/* Converts generic 'netdev' into the 'struct netdev_linux' that embeds it.
 * The class check is made on the netdev's underlying device. */
452 static struct netdev_linux *
453 netdev_linux_cast(const struct netdev *netdev)
455 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
456 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
457 assert(is_netdev_linux_class(netdev_class));
459 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
/* Creates the module-wide sockets: the AF_INET datagram socket used for
 * ioctls ('af_inet_sock') and the rtnetlink socket ('rtnl_sock').  A failure
 * to create either one is logged.
 *
 * 'status' is static, so its value persists between calls.
 * NOTE(review): presumably that lets later calls reuse the first result; the
 * guarding test is on a line not visible here -- confirm. */
463 netdev_linux_init(void)
465 static int status = -1;
467 /* Create AF_INET socket. */
468 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
469 status = af_inet_sock >= 0 ? 0 : errno; /* 0 on success, else errno. */
471 VLOG_ERR("failed to create inet socket: %s", strerror(status));
474 /* Create rtnetlink socket. */
476 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
478 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
/* Periodic work for this module: runs the rtnetlink link machinery and then
 * the MII link-status monitor. */
487 netdev_linux_run(void)
489 rtnetlink_link_run();
490 netdev_linux_miimon_run();
/* Counterpart of netdev_linux_run(): calls the wait function of the
 * rtnetlink link machinery and of the MII link-status monitor. */
494 netdev_linux_wait(void)
496 rtnetlink_link_wait();
497 netdev_linux_miimon_wait();
/* Fills 'netdev_dev->drvinfo' through the ethtool helper and marks it valid
 * in 'cache_valid'.  The request code passed to the helper is on a line not
 * visible here. */
501 netdev_linux_get_drvinfo(struct netdev_dev_linux *netdev_dev)
506 if (netdev_dev->cache_valid & VALID_DRVINFO) { /* Already cached. */
510 COVERAGE_INC(netdev_get_ethtool);
511 memset(&netdev_dev->drvinfo, 0, sizeof netdev_dev->drvinfo);
/* The helper's prototype takes a 'struct ethtool_cmd *', hence the cast of
 * the drvinfo buffer. */
512 error = netdev_linux_do_ethtool(netdev_dev->netdev_dev.name,
513 (struct ethtool_cmd *)&netdev_dev->drvinfo,
517 netdev_dev->cache_valid |= VALID_DRVINFO;
/* Records a change to 'dev': counts a carrier reset when IFF_RUNNING differs
 * between the stored flags and 'ifi_flags', stores 'ifi_flags', and clears
 * every VALID_* bit of 'cache_valid' that is not also set in 'mask'. */
523 netdev_dev_linux_changed(struct netdev_dev_linux *dev,
524 unsigned int ifi_flags,
528 if (!dev->change_seq) {
/* XOR leaves only the flag bits that differ between old and new. */
532 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
533 dev->carrier_resets++;
535 dev->ifi_flags = ifi_flags;
537 dev->cache_valid &= mask;
/* Applies the rtnetlink link notification 'change' to 'dev'.
 *
 * For RTM_NEWLINK the cache is reduced to the driver info and then refilled
 * with the MTU, Ethernet address and ifindex carried by the message, clearing
 * the matching cached error codes.
 * NOTE(review): the final call, which drops the whole cache (mask 0), is
 * presumably the branch for other message types; the 'else' is not visible. */
541 netdev_dev_linux_update(struct netdev_dev_linux *dev,
542 const struct rtnetlink_link_change *change)
544 if (change->nlmsg_type == RTM_NEWLINK) {
/* Keep only VALID_DRVINFO; everything else is re-derived below or on
 * demand. */
546 netdev_dev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
548 /* Update netdev from rtnl-change msg. */
550 dev->mtu = change->mtu;
551 dev->cache_valid |= VALID_MTU;
552 dev->netdev_mtu_error = 0;
/* An all-zero address in the message is not cached. */
555 if (!eth_addr_is_zero(change->addr)) {
556 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
557 dev->cache_valid |= VALID_ETHERADDR;
558 dev->ether_addr_error = 0;
561 dev->ifindex = change->ifi_index;
562 dev->cache_valid |= VALID_IFINDEX;
563 dev->get_ifindex_error = 0;
566 netdev_dev_linux_changed(dev, change->ifi_flags, 0);
/* Callback given to rtnetlink_link_notifier_create() by cache_notifier_ref().
 *
 * Two paths are visible.  One looks up the device named in 'change' and, if
 * it belongs to a Linux netdev class, applies the change to it.  The other
 * walks every device of 'netdev_linux_class', re-reads its flags and drops
 * all of its cached state.
 * NOTE(review): the condition selecting between the two paths is not visible
 * here; presumably the second runs when no specific change is supplied. */
571 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
572 void *aux OVS_UNUSED)
574 struct netdev_dev_linux *dev;
576 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
578 const struct netdev_class *netdev_class =
579 netdev_dev_get_class(base_dev);
581 if (is_netdev_linux_class(netdev_class)) {
582 dev = netdev_dev_linux_cast(base_dev);
583 netdev_dev_linux_update(dev, change);
587 struct shash device_shash;
588 struct shash_node *node;
590 shash_init(&device_shash);
591 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
592 SHASH_FOR_EACH (node, &device_shash) {
/* The return value of get_flags() is not checked on this visible line. */
597 get_flags(&dev->netdev_dev, &flags);
598 netdev_dev_linux_changed(dev, flags, 0);
600 shash_destroy(&device_shash);
/* Takes a reference on the shared rtnetlink link notifier.  The first
 * reference creates the notifier with netdev_linux_cache_cb() as callback.
 * The handling of a failed creation is on lines not visible here. */
605 cache_notifier_ref(void)
607 if (!cache_notifier_refcount) {
608 assert(!netdev_linux_cache_notifier);
610 netdev_linux_cache_notifier =
611 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
613 if (!netdev_linux_cache_notifier) {
617 cache_notifier_refcount++;
/* Drops one reference on the shared rtnetlink link notifier and destroys the
 * notifier when the last reference goes away.  Calling this without a
 * matching cache_notifier_ref() trips the first assertion. */
623 cache_notifier_unref(void)
625 assert(cache_notifier_refcount > 0);
626 if (!--cache_notifier_refcount) {
627 assert(netdev_linux_cache_notifier);
628 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
629 netdev_linux_cache_notifier = NULL;
633 /* Creates system and internal devices.
 *
 * Takes a reference on the shared link notifier, allocates a zeroed
 * 'struct netdev_dev_linux' named 'name' of class 'class', and stores its
 * generic part in '*netdev_devp'. */
635 netdev_linux_create(const struct netdev_class *class, const char *name,
636 struct netdev_dev **netdev_devp)
638 struct netdev_dev_linux *netdev_dev;
641 error = cache_notifier_ref();
646 netdev_dev = xzalloc(sizeof *netdev_dev);
/* netdev_dev_linux_changed() tests 'change_seq' against zero, so start it
 * at a nonzero value. */
647 netdev_dev->change_seq = 1;
648 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
/* Seed the cached interface flags; the return value is not checked on this
 * visible line. */
649 get_flags(&netdev_dev->netdev_dev, &netdev_dev->ifi_flags);
651 *netdev_devp = &netdev_dev->netdev_dev;
655 /* For most types of netdevs we open the device for each call of
656 * netdev_open(). However, this is not the case with tap devices,
657 * since it is only possible to open the device once. In this
658 * situation we share a single file descriptor, and consequently
659 * buffers, across all readers. Therefore once data is read it will
660 * be unavailable to other reads for tap devices. */
/* Creates tap device 'name': opens "/dev/net/tun", issues TUNSETIFF with
 * IFF_TAP | IFF_NO_PI, makes the descriptor non-blocking, and stores the new
 * device (always of class 'netdev_tap_class') in '*netdev_devp'.
 *
 * NOTE(review): all three failures after open() jump to
 * 'error_unref_notifier', and the only visible statement there is
 * cache_notifier_unref().  If the lines not shown do not close 'state->fd',
 * the descriptor leaks on TUNSETIFF / set_nonblocking() failure -- confirm. */
662 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
663 const char *name, struct netdev_dev **netdev_devp)
665 struct netdev_dev_linux *netdev_dev;
666 struct tap_state *state;
667 static const char tap_dev[] = "/dev/net/tun";
671 netdev_dev = xzalloc(sizeof *netdev_dev);
672 state = &netdev_dev->state.tap;
674 error = cache_notifier_ref();
679 /* Open tap device. */
680 state->fd = open(tap_dev, O_RDWR);
683 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
684 goto error_unref_notifier;
687 /* Create tap device. */
688 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
689 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
690 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
691 VLOG_WARN("%s: creating tap device failed: %s", name,
694 goto error_unref_notifier;
697 /* Make non-blocking. */
698 error = set_nonblocking(state->fd);
700 goto error_unref_notifier;
703 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
704 *netdev_devp = &netdev_dev->netdev_dev;
/* Error exit: give back the notifier reference taken above. */
707 error_unref_notifier:
708 cache_notifier_unref();
/* Tears down the tap state of 'netdev_dev'.  Only acts when the tap file
 * descriptor is valid (>= 0); the action itself is on a line not visible
 * here (presumably closing the descriptor -- confirm). */
715 destroy_tap(struct netdev_dev_linux *netdev_dev)
717 struct tap_state *state = &netdev_dev->state.tap;
719 if (state->fd >= 0) {
724 /* Destroys the netdev device 'netdev_dev_'.
 *
 * Runs the traffic-control implementation's 'tc_destroy' hook when a 'tc' is
 * attached and the hook is non-null, tears down tap state for devices of
 * 'netdev_tap_class', and drops this device's link-notifier reference. */
726 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
728 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
729 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
731 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
732 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
735 if (class == &netdev_tap_class) {
736 destroy_tap(netdev_dev);
740 cache_notifier_unref();
/* Opens a new 'struct netdev' handle on device 'netdev_dev_' and stores it
 * in '*netdevp'.  The first handle opened on a "tap" device receives the
 * device's shared tap file descriptor.  The last visible statement
 * uninitializes the handle; presumably that is the error exit (its label is
 * not visible here). */
744 netdev_linux_open(struct netdev_dev *netdev_dev_, struct netdev **netdevp)
746 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
747 struct netdev_linux *netdev;
748 enum netdev_flags flags;
751 /* Allocate network device. */
752 netdev = xzalloc(sizeof *netdev);
754 netdev_init(&netdev->netdev, netdev_dev_);
756 /* Verify that the device really exists, by attempting to read its flags.
757 * (The flags might be cached, in which case this won't actually do an
758 * ioctl.)
759 *
760 * Don't do this for "internal" netdevs, though, because those have to be
761 * created as netdev objects before they exist in the kernel, because
762 * creating them in the kernel happens by passing a netdev object to
763 * dpif_port_add(). */
764 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
765 error = netdev_get_flags(&netdev->netdev, &flags);
766 if (error == ENODEV) {
771 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
772 !netdev_dev->state.tap.opened) {
774 /* We assume that the first user of the tap device is the primary user
775 * and give them the tap FD. Subsequent users probably just expect
776 * this to be a system device so open it normally to avoid send/receive
777 * directions appearing to be reversed. */
778 netdev->fd = netdev_dev->state.tap.fd;
779 netdev_dev->state.tap.opened = true;
782 *netdevp = &netdev->netdev;
786 netdev_uninit(&netdev->netdev, true);
790 /* Closes and destroys 'netdev'.
 *
 * A handle owns its descriptor only when the device is not a "tap": tap
 * handles borrow the device-wide tap descriptor (see netdev_linux_open()),
 * so that descriptor must not be released here.
 *
 * The test is 'fd >= 0', matching the rest of this file, where a negative
 * 'fd' means "no descriptor" (netdev_linux_listen(), netdev_linux_recv(),
 * netdev_linux_send()).  Descriptor 0 is valid and must not be skipped. */
792 netdev_linux_close(struct netdev *netdev_)
794 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
796 if (netdev->fd >= 0 && strcmp(netdev_get_type(netdev_), "tap")) {
/* Prepares 'netdev_' for receiving packets: creates a non-blocking PF_PACKET
 * raw socket and binds it to the device's ifindex for all protocols
 * (ETH_P_ALL).  The first test is true when the handle already has a
 * descriptor (what follows it is not visible here). */
803 netdev_linux_listen(struct netdev *netdev_)
805 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
806 struct sockaddr_ll sll;
811 if (netdev->fd >= 0) {
815 /* Create file descriptor. */
816 fd = socket(PF_PACKET, SOCK_RAW, 0);
819 VLOG_ERR("failed to create raw socket (%s)", strerror(error));
823 /* Set non-blocking mode. */
824 error = set_nonblocking(fd);
829 /* Get ethernet device index. */
830 error = get_ifindex(&netdev->netdev, &ifindex);
835 /* Bind to specific ethernet device. */
836 memset(&sll, 0, sizeof sll);
837 sll.sll_family = AF_PACKET;
838 sll.sll_ifindex = ifindex;
839 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
840 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
842 VLOG_ERR("%s: failed to bind raw socket (%s)",
843 netdev_get_name(netdev_), strerror(error));
/* Receives one packet from 'netdev_' into the 'size'-byte buffer 'data'.
 *
 * Tap devices are read with read(); other devices with recv() and MSG_TRUNC,
 * which makes recv() report the full packet length even when it exceeds
 * 'size'.  The visible return yields the byte count when the packet fit and
 * -EMSGSIZE when it did not.
 *
 * Errors other than EINTR and EAGAIN are logged (rate-limited).  The log
 * arguments are passed as device name first, error text second, to match the
 * "on %s: %s" format string; they were previously swapped. */
858 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
860 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
862 if (netdev->fd < 0) {
863 /* Device is not listening. */
870 retval = (netdev_->netdev_dev->netdev_class == &netdev_tap_class
871 ? read(netdev->fd, data, size)
872 : recv(netdev->fd, data, size, MSG_TRUNC));
874 return retval <= size ? retval : -EMSGSIZE;
875 } else if (errno != EINTR) {
876 if (errno != EAGAIN) {
877 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
878 netdev_get_name(netdev_), strerror(errno));
885 /* Registers with the poll loop to wake up from the next call to poll_block()
886 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
/* Does nothing when the handle has no descriptor (fd < 0). */
888 netdev_linux_recv_wait(struct netdev *netdev_)
890 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
891 if (netdev->fd >= 0) {
892 poll_fd_wait(netdev->fd, POLLIN);
896 /* Discards all packets waiting to be received from 'netdev'. */
/* Tap handles query the interface's transmit queue length (SIOCGIFTXQLEN)
 * and pass it to drain_fd(); other handles drain the socket receive buffer
 * with drain_rcvbuf(). */
898 netdev_linux_drain(struct netdev *netdev_)
900 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
901 if (netdev->fd < 0) {
903 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
905 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
906 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
910 drain_fd(netdev->fd, ifr.ifr_qlen);
913 return drain_rcvbuf(netdev->fd);
917 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
918 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
919 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
920 * the packet is too big or too small to transmit on the device.
922 * The caller retains ownership of 'buffer' in all cases.
924 * The kernel maintains a packet transmission queue, so the caller is not
925 * expected to do additional queuing of packets. */
/* In this signature the packet is the 'size' bytes at 'data'.  A handle
 * without its own descriptor sends through the shared AF_PACKET socket,
 * addressed by ifindex; a handle with a descriptor writes to it directly. */
927 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
929 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
933 if (netdev->fd < 0) {
934 /* Use our AF_PACKET socket to send to this device. */
935 struct sockaddr_ll sll;
942 sock = af_packet_sock();
947 error = get_ifindex(netdev_, &ifindex);
952 /* We don't bother setting most fields in sockaddr_ll because the
953 * kernel ignores them for SOCK_RAW. */
954 memset(&sll, 0, sizeof sll);
955 sll.sll_family = AF_PACKET;
956 sll.sll_ifindex = ifindex;
/* The cast drops 'const' only because iov_base is a plain 'void *'. */
958 iov.iov_base = (void *) data;
962 msg.msg_namelen = sizeof sll;
965 msg.msg_control = NULL;
966 msg.msg_controllen = 0;
969 retval = sendmsg(sock, &msg, 0);
971 /* Use the netdev's own fd to send to this device. This is
972 * essential for tap devices, because packets sent to a tap device
973 * with an AF_PACKET socket will loop back to be *received* again
974 * on the tap device. */
975 retval = write(netdev->fd, data, size);
979 /* The Linux AF_PACKET implementation never blocks waiting for room
980 * for packets, instead returning ENOBUFS. Translate this into
981 * EAGAIN for the caller. */
982 if (errno == ENOBUFS) {
984 } else if (errno == EINTR) {
986 } else if (errno != EAGAIN) {
987 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
988 netdev_get_name(netdev_), strerror(errno));
/* A short write/send is reported rather than retried. */
991 } else if (retval != size) {
992 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
993 "%zu) on %s", retval, size, netdev_get_name(netdev_));
1001 /* Registers with the poll loop to wake up from the next call to poll_block()
1002 * when the packet transmission queue has sufficient room to transmit a packet
1003 * with netdev_send().
1005 * The kernel maintains a packet transmission queue, so the client is not
1006 * expected to do additional queuing of packets. Thus, this function is
1007 * unlikely to ever be used. It is included for completeness. */
/* Handles without a descriptor need no wait; non-tap handles wait for
 * POLLOUT on their descriptor; tap handles wake the poll loop immediately. */
1009 netdev_linux_send_wait(struct netdev *netdev_)
1011 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1012 if (netdev->fd < 0) {
1013 /* Nothing to do. */
1014 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
1015 poll_fd_wait(netdev->fd, POLLOUT);
1017 /* TAP device always accepts packets.*/
1018 poll_immediate_wake();
1022 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1023 * otherwise a positive errno value. */
/* Uses the cache first: a cached error is returned as-is, and a cached
 * address equal to 'mac' is detected before any system call is made.  Only a
 * result of success or ENODEV is written back to the cache. */
1025 netdev_linux_set_etheraddr(struct netdev *netdev_,
1026 const uint8_t mac[ETH_ADDR_LEN])
1028 struct netdev_dev_linux *netdev_dev =
1029 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1032 if (netdev_dev->cache_valid & VALID_ETHERADDR) {
1033 if (netdev_dev->ether_addr_error) {
1034 return netdev_dev->ether_addr_error;
1036 if (eth_addr_equals(netdev_dev->etheraddr, mac)) {
/* About to change the address: the cached value is no longer trustworthy. */
1039 netdev_dev->cache_valid &= ~VALID_ETHERADDR;
1042 error = set_etheraddr(netdev_get_name(netdev_), mac);
1043 if (!error || error == ENODEV) {
1044 netdev_dev->ether_addr_error = error;
1045 netdev_dev->cache_valid |= VALID_ETHERADDR;
1047 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
1054 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
/* The address is fetched into the per-device cache when VALID_ETHERADDR is
 * not set.  Returns the cached error code; 'mac' is written only when that
 * code is 0. */
1056 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1057 uint8_t mac[ETH_ADDR_LEN])
1059 struct netdev_dev_linux *netdev_dev =
1060 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1062 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
1063 int error = get_etheraddr(netdev_get_name(netdev_),
1064 netdev_dev->etheraddr);
1066 netdev_dev->ether_addr_error = error;
1067 netdev_dev->cache_valid |= VALID_ETHERADDR;
1070 if (!netdev_dev->ether_addr_error) {
1071 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
1074 return netdev_dev->ether_addr_error;
1077 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1078 * in bytes, not including the hardware header; thus, this is typically 1500
1079 * bytes for Ethernet devices. */
/* The value is stored through 'mtup'; the function's return value is the
 * cached error code (0 on success).  SIOCGIFMTU is issued only when
 * VALID_MTU is not set in the cache. */
1081 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1083 struct netdev_dev_linux *netdev_dev =
1084 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1085 if (!(netdev_dev->cache_valid & VALID_MTU)) {
1089 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1090 SIOCGIFMTU, "SIOCGIFMTU");
1092 netdev_dev->netdev_mtu_error = error;
1093 netdev_dev->mtu = ifr.ifr_mtu;
1094 netdev_dev->cache_valid |= VALID_MTU;
1097 if (!netdev_dev->netdev_mtu_error) {
1098 *mtup = netdev_dev->mtu;
1100 return netdev_dev->netdev_mtu_error;
1103 /* Sets the maximum size of transmitted (MTU) for given device using linux
1104 * networking ioctl interface.
1105 */
/* Uses the cache first: a cached error is returned as-is, and a cached MTU
 * equal to 'mtu' is detected before any ioctl is made.  Only a result of
 * success or ENODEV is written back to the cache. */
1107 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1109 struct netdev_dev_linux *netdev_dev =
1110 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1114 if (netdev_dev->cache_valid & VALID_MTU) {
1115 if (netdev_dev->netdev_mtu_error) {
1116 return netdev_dev->netdev_mtu_error;
1118 if (netdev_dev->mtu == mtu) {
/* About to change the MTU: the cached value is no longer trustworthy. */
1121 netdev_dev->cache_valid &= ~VALID_MTU;
1124 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1125 SIOCSIFMTU, "SIOCSIFMTU");
1126 if (!error || error == ENODEV) {
1127 netdev_dev->netdev_mtu_error = error;
1128 netdev_dev->mtu = ifr.ifr_mtu;
1129 netdev_dev->cache_valid |= VALID_MTU;
1134 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1135 * On failure, returns a negative errno value. */
1137 netdev_linux_get_ifindex(const struct netdev *netdev)
1141 error = get_ifindex(netdev, &ifindex);
/* get_ifindex() reports a positive errno; negate it for this interface. */
1142 return error ? -error : ifindex;
/* Stores the link state of 'netdev_' in '*carrier'.  When MII monitoring is
 * enabled (miimon_interval > 0) the last polled MII status is used;
 * otherwise the IFF_RUNNING bit of the cached interface flags is used. */
1146 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1148 struct netdev_dev_linux *netdev_dev =
1149 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1151 if (netdev_dev->miimon_interval > 0) {
1152 *carrier = netdev_dev->miimon;
1154 *carrier = (netdev_dev->ifi_flags & IFF_RUNNING) != 0;
/* Returns the device's 'carrier_resets' counter, which
 * netdev_dev_linux_changed() increments whenever IFF_RUNNING changes. */
1160 static long long int
1161 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1163 return netdev_dev_linux_cast(netdev_get_dev(netdev))->carrier_resets;
/* Issues MII ioctl 'cmd' (named 'cmd_name' for logging) on device 'name'.
 * '*data' is copied into the ifreq before the call and copied back out
 * afterwards, so the caller sees whatever the ioctl wrote. */
1167 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1168 struct mii_ioctl_data *data)
1173 memset(&ifr, 0, sizeof ifr);
/* The MII data is placed in the ifreq itself, at the location of the
 * 'ifr_data' member, not behind a pointer. */
1174 memcpy(&ifr.ifr_data, data, sizeof *data);
1175 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1176 memcpy(data, &ifr.ifr_data, sizeof *data);
/* Queries the link status of device 'name' into '*miimon'.  The visible flow
 * issues SIOCGMIIPHY, then SIOCGMIIREG for MII_BMSR and tests BMSR_LSTATUS.
 * Per the debug message below, ETHTOOL_GLINK serves as the fallback when the
 * MII query fails. */
1182 netdev_linux_get_miimon(const char *name, bool *miimon)
1184 struct mii_ioctl_data data;
1189 memset(&data, 0, sizeof data);
1190 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1192 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1193 data.reg_num = MII_BMSR;
1194 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1198 *miimon = !!(data.val_out & BMSR_LSTATUS);
1200 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1203 struct ethtool_cmd ecmd;
1205 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1208 COVERAGE_INC(netdev_get_ethtool);
1209 memset(&ecmd, 0, sizeof ecmd);
1210 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1213 struct ethtool_value eval;
1215 memcpy(&eval, &ecmd, sizeof eval);
1216 *miimon = !!eval.data;
1218 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
/* Updates the MII polling interval of 'netdev_'.  A positive 'interval' is
 * raised to at least 100 and anything else becomes 0 (units are not stated in
 * this excerpt).  When the value changes, the miimon timer is marked expired,
 * presumably so that the next run polls right away. */
1226 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1227 long long int interval)
1229 struct netdev_dev_linux *netdev_dev;
1231 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
1233 interval = interval > 0 ? MAX(interval, 100) : 0;
1234 if (netdev_dev->miimon_interval != interval) {
1235 netdev_dev->miimon_interval = interval;
1236 timer_set_expired(&netdev_dev->miimon_timer);
/* Walks every netdev_linux_class device.  Each device is tested for a
 * nonpositive miimon_interval or an unexpired timer (the body of that test is
 * not visible here).  The visible remainder re-reads the MII status, reports
 * a change through netdev_dev_linux_changed() when it differs from the cached
 * value, and rearms the timer with the device's interval. */
1243 netdev_linux_miimon_run(void)
1245 struct shash device_shash;
1246 struct shash_node *node;
1248 shash_init(&device_shash);
1249 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1250 SHASH_FOR_EACH (node, &device_shash) {
1251 struct netdev_dev_linux *dev = node->data;
1254 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1258 netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
1259 if (miimon != dev->miimon) {
1260 dev->miimon = miimon;
1261 netdev_dev_linux_changed(dev, dev->ifi_flags, 0);
1264 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1267 shash_destroy(&device_shash);
/* Calls timer_wait() on the miimon timer of every netdev_linux_class device
 * that has a positive miimon_interval. */
1271 netdev_linux_miimon_wait(void)
1273 struct shash device_shash;
1274 struct shash_node *node;
1276 shash_init(&device_shash);
1277 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1278 SHASH_FOR_EACH (node, &device_shash) {
1279 struct netdev_dev_linux *dev = node->data;
1281 if (dev->miimon_interval > 0) {
1282 timer_wait(&dev->miimon_timer);
1285 shash_destroy(&device_shash);
1288 /* Check whether we can use RTM_GETLINK to get network device statistics.
1289 * In pre-2.6.19 kernels, this was only available if wireless extensions were
/* Probes with the loopback device "lo": looks up its ifindex, tries
 * get_stats_via_netlink() on it, and logs which stats source will be used. */
1292 check_for_working_netlink_stats(void)
1294 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1295 * preferable, so if that works, we'll use it. */
1296 int ifindex = do_get_ifindex("lo");
1298 VLOG_WARN("failed to get ifindex for lo, "
1299 "obtaining netdev stats from proc");
1302 struct netdev_stats stats;
1303 int error = get_stats_via_netlink(ifindex, &stats);
1305 VLOG_DBG("obtaining netdev stats via rtnetlink");
1308 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1309 "via proc (you are probably running a pre-2.6.19 "
1310 "kernel)", strerror(error));
/* NOTE(review): the body is not visible in this excerpt; presumably exchanges
 * '*a' and '*b' -- confirm against the full file. */
1317 swap_uint64(uint64_t *a, uint64_t *b)
/* Fetches stats through netdev_vport_get_stats() unless an earlier failure is
 * cached (nonzero vport_stats_error with VALID_VPORT_STAT_ERROR set).  The
 * outcome is stored in vport_stats_error and flagged as valid. */
1325 get_stats_via_vport(const struct netdev *netdev_,
1326 struct netdev_stats *stats)
1328 struct netdev_dev_linux *netdev_dev =
1329 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1331 if (!netdev_dev->vport_stats_error ||
1332 !(netdev_dev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1335 error = netdev_vport_get_stats(netdev_, stats);
1337 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1338 "(%s)", netdev_get_name(netdev_), strerror(error));
1340 netdev_dev->vport_stats_error = error;
1341 netdev_dev->cache_valid |= VALID_VPORT_STAT_ERROR;
/* Gets stats for 'netdev_' from the kernel.  On first use,
 * check_for_working_netlink_stats() decides between get_stats_via_netlink()
 * and get_stats_via_proc(); the decision is kept in a function-level static.
 * Failures are logged with the numeric error. */
1346 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1347 struct netdev_stats *stats)
1349 static int use_netlink_stats = -1;
1352 if (use_netlink_stats < 0) {
1353 use_netlink_stats = check_for_working_netlink_stats();
1356 if (use_netlink_stats) {
1359 error = get_ifindex(netdev_, &ifindex);
1361 error = get_stats_via_netlink(ifindex, stats);
1364 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1368 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1369 netdev_get_name(netdev_), error);
1375 /* Retrieves current device stats for 'netdev-linux'. */
/* Reads vport stats into 'stats' and kernel stats into the local 'dev_stats'.
 * The error and drop counters listed below are added from 'dev_stats' onto
 * 'stats'.  Several branch bodies are not visible in this excerpt. */
1377 netdev_linux_get_stats(const struct netdev *netdev_,
1378 struct netdev_stats *stats)
1380 struct netdev_dev_linux *netdev_dev =
1381 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1382 struct netdev_stats dev_stats;
1385 get_stats_via_vport(netdev_, stats);
1387 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1390 if (netdev_dev->vport_stats_error) {
1397 if (netdev_dev->vport_stats_error) {
1398 /* Stats are not available from OVS, so use ioctl stats. */
1401 stats->rx_errors += dev_stats.rx_errors;
1402 stats->tx_errors += dev_stats.tx_errors;
1403 stats->rx_dropped += dev_stats.rx_dropped;
1404 stats->tx_dropped += dev_stats.tx_dropped;
1405 stats->multicast += dev_stats.multicast;
1406 stats->collisions += dev_stats.collisions;
1407 stats->rx_length_errors += dev_stats.rx_length_errors;
1408 stats->rx_over_errors += dev_stats.rx_over_errors;
1409 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1410 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1411 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1412 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1413 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1414 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1415 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1416 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1417 stats->tx_window_errors += dev_stats.tx_window_errors;
1422 /* Retrieves current device stats for 'netdev-tap' netdev or
1423 * netdev-internal. */
/* Reads vport stats into 'stats' and kernel stats into the local 'dev_stats'.
 * The visible code swaps the rx/tx packet, byte, error, and drop counters,
 * zeroes the detailed error counters, and adds drop/error counts from
 * 'dev_stats' with rx and tx crossed.  Which branch each group belongs to
 * cannot be determined from this excerpt. */
1425 netdev_tap_get_stats(const struct netdev *netdev_,
1426 struct netdev_stats *stats)
1428 struct netdev_dev_linux *netdev_dev =
1429 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1430 struct netdev_stats dev_stats;
1433 get_stats_via_vport(netdev_, stats);
1435 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1437 if (netdev_dev->vport_stats_error) {
1444 /* If this port is an internal port then the transmit and receive stats
1445 * will appear to be swapped relative to the other ports since we are the
1446 * one sending the data, not a remote computer. For consistency, we swap
1447 * them back here. This does not apply if we are getting stats from the
1448 * vport layer because it always tracks stats from the perspective of the
1450 if (netdev_dev->vport_stats_error) {
1452 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1453 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1454 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1455 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1456 stats->rx_length_errors = 0;
1457 stats->rx_over_errors = 0;
1458 stats->rx_crc_errors = 0;
1459 stats->rx_frame_errors = 0;
1460 stats->rx_fifo_errors = 0;
1461 stats->rx_missed_errors = 0;
1462 stats->tx_aborted_errors = 0;
1463 stats->tx_carrier_errors = 0;
1464 stats->tx_fifo_errors = 0;
1465 stats->tx_heartbeat_errors = 0;
1466 stats->tx_window_errors = 0;
1468 stats->rx_dropped += dev_stats.tx_dropped;
1469 stats->tx_dropped += dev_stats.rx_dropped;
1471 stats->rx_errors += dev_stats.tx_errors;
1472 stats->tx_errors += dev_stats.rx_errors;
1474 stats->multicast += dev_stats.multicast;
1475 stats->collisions += dev_stats.collisions;
/* Stats here come only from the vport layer: calls get_stats_via_vport() and
 * returns the error it recorded in vport_stats_error. */
1481 netdev_internal_get_stats(const struct netdev *netdev_,
1482 struct netdev_stats *stats)
1484 struct netdev_dev_linux *netdev_dev =
1485 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1487 get_stats_via_vport(netdev_, stats);
1488 return netdev_dev->vport_stats_error;
/* Fills the cached 'supported', 'advertised', 'current', and 'peer' feature
 * bitmaps of 'netdev_dev' from an ETHTOOL_GSET query, translating the
 * SUPPORTED_* and ADVERTISED_* bits into NETDEV_F_* bits.  The function first
 * tests VALID_FEATURES (the body of that test is not visible here).  At the
 * end it sets VALID_FEATURES and records the outcome in get_features_error. */
1492 netdev_linux_read_features(struct netdev_dev_linux *netdev_dev)
1494 struct ethtool_cmd ecmd;
1498 if (netdev_dev->cache_valid & VALID_FEATURES) {
1502 COVERAGE_INC(netdev_get_ethtool);
1503 memset(&ecmd, 0, sizeof ecmd);
1504 error = netdev_linux_do_ethtool(netdev_dev->netdev_dev.name, &ecmd,
1505 ETHTOOL_GSET, "ETHTOOL_GSET");
1510 /* Supported features. */
1511 netdev_dev->supported = 0;
1512 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1513 netdev_dev->supported |= NETDEV_F_10MB_HD;
1515 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1516 netdev_dev->supported |= NETDEV_F_10MB_FD;
1518 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1519 netdev_dev->supported |= NETDEV_F_100MB_HD;
1521 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1522 netdev_dev->supported |= NETDEV_F_100MB_FD;
1524 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1525 netdev_dev->supported |= NETDEV_F_1GB_HD;
1527 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1528 netdev_dev->supported |= NETDEV_F_1GB_FD;
1530 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1531 netdev_dev->supported |= NETDEV_F_10GB_FD;
1533 if (ecmd.supported & SUPPORTED_TP) {
1534 netdev_dev->supported |= NETDEV_F_COPPER;
1536 if (ecmd.supported & SUPPORTED_FIBRE) {
1537 netdev_dev->supported |= NETDEV_F_FIBER;
1539 if (ecmd.supported & SUPPORTED_Autoneg) {
1540 netdev_dev->supported |= NETDEV_F_AUTONEG;
1542 if (ecmd.supported & SUPPORTED_Pause) {
1543 netdev_dev->supported |= NETDEV_F_PAUSE;
1545 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1546 netdev_dev->supported |= NETDEV_F_PAUSE_ASYM;
1549 /* Advertised features. */
1550 netdev_dev->advertised = 0;
1551 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1552 netdev_dev->advertised |= NETDEV_F_10MB_HD;
1554 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1555 netdev_dev->advertised |= NETDEV_F_10MB_FD;
1557 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1558 netdev_dev->advertised |= NETDEV_F_100MB_HD;
1560 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1561 netdev_dev->advertised |= NETDEV_F_100MB_FD;
1563 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1564 netdev_dev->advertised |= NETDEV_F_1GB_HD;
1566 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1567 netdev_dev->advertised |= NETDEV_F_1GB_FD;
1569 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1570 netdev_dev->advertised |= NETDEV_F_10GB_FD;
1572 if (ecmd.advertising & ADVERTISED_TP) {
1573 netdev_dev->advertised |= NETDEV_F_COPPER;
1575 if (ecmd.advertising & ADVERTISED_FIBRE) {
1576 netdev_dev->advertised |= NETDEV_F_FIBER;
1578 if (ecmd.advertising & ADVERTISED_Autoneg) {
1579 netdev_dev->advertised |= NETDEV_F_AUTONEG;
1581 if (ecmd.advertising & ADVERTISED_Pause) {
1582 netdev_dev->advertised |= NETDEV_F_PAUSE;
1584 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1585 netdev_dev->advertised |= NETDEV_F_PAUSE_ASYM;
1588 /* Current settings. */
1590 if (speed == SPEED_10) {
1591 netdev_dev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1592 } else if (speed == SPEED_100) {
1593 netdev_dev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1594 } else if (speed == SPEED_1000) {
1595 netdev_dev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1596 } else if (speed == SPEED_10000) {
1597 netdev_dev->current = NETDEV_F_10GB_FD;
1598 } else if (speed == 40000) {
1599 netdev_dev->current = NETDEV_F_40GB_FD;
1600 } else if (speed == 100000) {
1601 netdev_dev->current = NETDEV_F_100GB_FD;
1602 } else if (speed == 1000000) {
1603 netdev_dev->current = NETDEV_F_1TB_FD;
1605 netdev_dev->current = 0;
1608 if (ecmd.port == PORT_TP) {
1609 netdev_dev->current |= NETDEV_F_COPPER;
1610 } else if (ecmd.port == PORT_FIBRE) {
1611 netdev_dev->current |= NETDEV_F_FIBER;
1615 netdev_dev->current |= NETDEV_F_AUTONEG;
1618 /* Peer advertisements. */
1619 netdev_dev->peer = 0; /* XXX */
1622 netdev_dev->cache_valid |= VALID_FEATURES;
1623 netdev_dev->get_features_error = error;
1626 /* Stores the features supported by 'netdev' into each of '*current',
1627 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1628 * bitmap of NETDEV_* bits. Returns 0 if successful, otherwise a positive
/* NOTE(review): once the cached read has succeeded, all four output pointers
 * are dereferenced unconditionally below, so this implementation relies on
 * its caller passing non-null pointers -- confirm against the call sites. */
1631 netdev_linux_get_features(const struct netdev *netdev_,
1632 enum netdev_features *current,
1633 enum netdev_features *advertised,
1634 enum netdev_features *supported,
1635 enum netdev_features *peer)
1637 struct netdev_dev_linux *netdev_dev =
1638 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1640 netdev_linux_read_features(netdev_dev);
1642 if (!netdev_dev->get_features_error) {
1643 *current = netdev_dev->current;
1644 *advertised = netdev_dev->advertised;
1645 *supported = netdev_dev->supported;
1646 *peer = netdev_dev->peer;
1648 return netdev_dev->get_features_error;
1651 /* Set the features advertised by 'netdev' to 'advertise'. */
/* Reads the current settings with ETHTOOL_GSET, rebuilds 'advertising' from
 * scratch out of the NETDEV_F_* bits in 'advertise', and writes the result
 * back with ETHTOOL_SSET, whose result is returned. */
1653 netdev_linux_set_advertisements(struct netdev *netdev,
1654 enum netdev_features advertise)
1656 struct ethtool_cmd ecmd;
1659 COVERAGE_INC(netdev_get_ethtool);
1660 memset(&ecmd, 0, sizeof ecmd);
1661 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1662 ETHTOOL_GSET, "ETHTOOL_GSET");
1667 ecmd.advertising = 0;
1668 if (advertise & NETDEV_F_10MB_HD) {
1669 ecmd.advertising |= ADVERTISED_10baseT_Half;
1671 if (advertise & NETDEV_F_10MB_FD) {
1672 ecmd.advertising |= ADVERTISED_10baseT_Full;
1674 if (advertise & NETDEV_F_100MB_HD) {
1675 ecmd.advertising |= ADVERTISED_100baseT_Half;
1677 if (advertise & NETDEV_F_100MB_FD) {
1678 ecmd.advertising |= ADVERTISED_100baseT_Full;
1680 if (advertise & NETDEV_F_1GB_HD) {
1681 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1683 if (advertise & NETDEV_F_1GB_FD) {
1684 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1686 if (advertise & NETDEV_F_10GB_FD) {
1687 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1689 if (advertise & NETDEV_F_COPPER) {
1690 ecmd.advertising |= ADVERTISED_TP;
1692 if (advertise & NETDEV_F_FIBER) {
1693 ecmd.advertising |= ADVERTISED_FIBRE;
1695 if (advertise & NETDEV_F_AUTONEG) {
1696 ecmd.advertising |= ADVERTISED_Autoneg;
1698 if (advertise & NETDEV_F_PAUSE) {
1699 ecmd.advertising |= ADVERTISED_Pause;
1701 if (advertise & NETDEV_F_PAUSE_ASYM) {
1702 ecmd.advertising |= ADVERTISED_Asym_Pause;
1704 COVERAGE_INC(netdev_set_ethtool);
1705 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1706 ETHTOOL_SSET, "ETHTOOL_SSET");
1709 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1710 * successful, otherwise a positive errno value. */
/* Uses the VALID_POLICING cache: a cached error is returned as-is, and the
 * cached rate/burst pair is compared with the request before the flag is
 * cleared.  The visible steps then remove any ingress qdisc, add a new
 * ingress qdisc, and add the policer.  The outcome is cached again only on
 * success or ENODEV. */
1712 netdev_linux_set_policing(struct netdev *netdev,
1713 uint32_t kbits_rate, uint32_t kbits_burst)
1715 struct netdev_dev_linux *netdev_dev =
1716 netdev_dev_linux_cast(netdev_get_dev(netdev));
1717 const char *netdev_name = netdev_get_name(netdev);
1721 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1722 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1723 : kbits_burst); /* Stick with user-specified value. */
1725 if (netdev_dev->cache_valid & VALID_POLICING) {
1726 if (netdev_dev->netdev_policing_error) {
1727 return netdev_dev->netdev_policing_error;
1730 if (netdev_dev->kbits_rate == kbits_rate &&
1731 netdev_dev->kbits_burst == kbits_burst) {
1732 /* Assume that settings haven't changed since we last set them. */
1735 netdev_dev->cache_valid &= ~VALID_POLICING;
1738 COVERAGE_INC(netdev_set_policing);
1739 /* Remove any existing ingress qdisc. */
1740 error = tc_add_del_ingress_qdisc(netdev, false);
1742 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1743 netdev_name, strerror(error));
1748 error = tc_add_del_ingress_qdisc(netdev, true);
1750 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1751 netdev_name, strerror(error));
1755 error = tc_add_policer(netdev, kbits_rate, kbits_burst);
1757 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1758 netdev_name, strerror(error));
1763 netdev_dev->kbits_rate = kbits_rate;
1764 netdev_dev->kbits_burst = kbits_burst;
1767 if (!error || error == ENODEV) {
1768 netdev_dev->netdev_policing_error = error;
1769 netdev_dev->cache_valid |= VALID_POLICING;
/* Adds to 'types' the ovs_name of every entry in the NULL-terminated 'tcs'
 * array that has a tc_install hook and a nonempty ovs_name. */
1775 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1778 const struct tc_ops **opsp;
1780 for (opsp = tcs; *opsp != NULL; opsp++) {
1781 const struct tc_ops *ops = *opsp;
1782 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1783 sset_add(types, ops->ovs_name);
/* Scans the NULL-terminated 'tcs' array for an entry whose ovs_name equals
 * 'name'. */
1789 static const struct tc_ops *
1790 tc_lookup_ovs_name(const char *name)
1792 const struct tc_ops **opsp;
1794 for (opsp = tcs; *opsp != NULL; opsp++) {
1795 const struct tc_ops *ops = *opsp;
1796 if (!strcmp(name, ops->ovs_name)) {
/* Scans the NULL-terminated 'tcs' array for an entry whose linux_name equals
 * 'name'.  Entries with a null linux_name never match. */
1803 static const struct tc_ops *
1804 tc_lookup_linux_name(const char *name)
1806 const struct tc_ops **opsp;
1808 for (opsp = tcs; *opsp != NULL; opsp++) {
1809 const struct tc_ops *ops = *opsp;
1810 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Searches the 'hash' bucket of the device's tc->queues hmap for a queue
 * whose queue_id equals 'queue_id'. */
1817 static struct tc_queue *
1818 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1821 struct netdev_dev_linux *netdev_dev =
1822 netdev_dev_linux_cast(netdev_get_dev(netdev));
1823 struct tc_queue *queue;
1825 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1826 if (queue->queue_id == queue_id) {
/* Looks up 'queue_id' on 'netdev', using hash_int(queue_id, 0) as the hash
 * handed to tc_find_queue__(). */
1833 static struct tc_queue *
1834 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1836 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Looks up 'type' with tc_lookup_ovs_name() and copies that entry's n_queues
 * into 'caps'.  Handling of an unknown type is not visible in this excerpt. */
1840 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1842 struct netdev_qos_capabilities *caps)
1844 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1848 caps->n_queues = ops->n_queues;
/* After tc_query_qdisc(), stores the current qdisc's ovs_name in '*typep'
 * and, when the ops provide qdisc_get, lets it fill in 'details'. */
1853 netdev_linux_get_qos(const struct netdev *netdev,
1854 const char **typep, struct smap *details)
1856 struct netdev_dev_linux *netdev_dev =
1857 netdev_dev_linux_cast(netdev_get_dev(netdev));
1860 error = tc_query_qdisc(netdev);
1865 *typep = netdev_dev->tc->ops->ovs_name;
1866 return (netdev_dev->tc->ops->qdisc_get
1867 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
/* Switches 'netdev' to QoS type 'type'.  A type that is unknown or has no
 * tc_install hook is tested for first (the body of that test is not visible).
 * If the type is unchanged, only qdisc_set is invoked when provided.
 * Otherwise the existing qdisc is deleted and the new one installed through
 * tc_install.  The asserts check that netdev_dev->tc is null after deletion
 * and non-null exactly when installation succeeded. */
1872 netdev_linux_set_qos(struct netdev *netdev,
1873 const char *type, const struct smap *details)
1875 struct netdev_dev_linux *netdev_dev =
1876 netdev_dev_linux_cast(netdev_get_dev(netdev));
1877 const struct tc_ops *new_ops;
1880 new_ops = tc_lookup_ovs_name(type);
1881 if (!new_ops || !new_ops->tc_install) {
1885 error = tc_query_qdisc(netdev);
1890 if (new_ops == netdev_dev->tc->ops) {
1891 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1893 /* Delete existing qdisc. */
1894 error = tc_del_qdisc(netdev);
1898 assert(netdev_dev->tc == NULL);
1900 /* Install new qdisc. */
1901 error = new_ops->tc_install(netdev, details);
1902 assert((error == 0) == (netdev_dev->tc != NULL));
/* After tc_query_qdisc(), finds 'queue_id' with tc_find_queue() and hands the
 * queue to the ops' class_get to fill in 'details'. */
1909 netdev_linux_get_queue(const struct netdev *netdev,
1910 unsigned int queue_id, struct smap *details)
1912 struct netdev_dev_linux *netdev_dev =
1913 netdev_dev_linux_cast(netdev_get_dev(netdev));
1916 error = tc_query_qdisc(netdev);
1920 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1922 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
/* After tc_query_qdisc(), tests for a 'queue_id' at or beyond ops->n_queues
 * or a missing class_set hook (the body of that branch is not visible here).
 * Otherwise delegates to class_set. */
1928 netdev_linux_set_queue(struct netdev *netdev,
1929 unsigned int queue_id, const struct smap *details)
1931 struct netdev_dev_linux *netdev_dev =
1932 netdev_dev_linux_cast(netdev_get_dev(netdev));
1935 error = tc_query_qdisc(netdev);
1938 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1939 || !netdev_dev->tc->ops->class_set) {
1943 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
/* After tc_query_qdisc(), tests for a missing class_delete hook.  Otherwise
 * finds 'queue_id' with tc_find_queue() and hands the queue to class_delete. */
1947 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1949 struct netdev_dev_linux *netdev_dev =
1950 netdev_dev_linux_cast(netdev_get_dev(netdev));
1953 error = tc_query_qdisc(netdev);
1956 } else if (!netdev_dev->tc->ops->class_delete) {
1959 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1961 ? netdev_dev->tc->ops->class_delete(netdev, queue)
/* After tc_query_qdisc(), tests for a missing class_get_stats hook.
 * Otherwise finds 'queue_id' with tc_find_queue() and hands the queue to
 * class_get_stats to fill in 'stats'. */
1967 netdev_linux_get_queue_stats(const struct netdev *netdev,
1968 unsigned int queue_id,
1969 struct netdev_queue_stats *stats)
1971 struct netdev_dev_linux *netdev_dev =
1972 netdev_dev_linux_cast(netdev_get_dev(netdev));
1975 error = tc_query_qdisc(netdev);
1978 } else if (!netdev_dev->tc->ops->class_get_stats) {
1981 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1983 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
/* Builds an RTM_GETTCLASS request for 'netdev' with tcm_parent 0 and starts a
 * netlink dump on rtnl_sock into 'dump'.  The request buffer is released once
 * the dump has been started. */
1989 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1991 struct ofpbuf request;
1992 struct tcmsg *tcmsg;
1994 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1998 tcmsg->tcm_parent = 0;
1999 nl_dump_start(dump, rtnl_sock, &request);
2000 ofpbuf_uninit(&request);
/* Iterates over every queue in tc->queues, fetching its details with the
 * ops' class_get and handing the queue id, details, and 'aux' to 'cb'.
 * 'details' is cleared before each queue and destroyed at the end. */
2005 netdev_linux_dump_queues(const struct netdev *netdev,
2006 netdev_dump_queues_cb *cb, void *aux)
2008 struct netdev_dev_linux *netdev_dev =
2009 netdev_dev_linux_cast(netdev_get_dev(netdev));
2010 struct tc_queue *queue, *next_queue;
2011 struct smap details;
2015 error = tc_query_qdisc(netdev);
2018 } else if (!netdev_dev->tc->ops->class_get) {
2023 smap_init(&details);
2024 HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node,
2025 &netdev_dev->tc->queues) {
2026 smap_clear(&details);
2028 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
2030 (*cb)(queue->queue_id, &details, aux);
2035 smap_destroy(&details);
/* Dumps the device's traffic classes over netlink and passes each reply to
 * the ops' class_dump_stats together with 'cb' and 'aux'.  An error from
 * nl_dump_done() takes precedence over 'last_error' in the return value. */
2041 netdev_linux_dump_queue_stats(const struct netdev *netdev,
2042 netdev_dump_queue_stats_cb *cb, void *aux)
2044 struct netdev_dev_linux *netdev_dev =
2045 netdev_dev_linux_cast(netdev_get_dev(netdev));
2046 struct nl_dump dump;
2051 error = tc_query_qdisc(netdev);
2054 } else if (!netdev_dev->tc->ops->class_dump_stats) {
2059 if (!start_queue_dump(netdev, &dump)) {
2062 while (nl_dump_next(&dump, &msg)) {
2063 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
2069 error = nl_dump_done(&dump);
2070 return error ? error : last_error;
/* Copies the device's IPv4 address and netmask into '*address' and
 * '*netmask'.  When VALID_IN4 is not set they are first fetched with
 * SIOCGIFADDR and SIOCGIFNETMASK and cached.  Returns EADDRNOTAVAIL when the
 * address is INADDR_ANY, otherwise 0. */
2074 netdev_linux_get_in4(const struct netdev *netdev_,
2075 struct in_addr *address, struct in_addr *netmask)
2077 struct netdev_dev_linux *netdev_dev =
2078 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2080 if (!(netdev_dev->cache_valid & VALID_IN4)) {
2083 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
2084 SIOCGIFADDR, "SIOCGIFADDR");
2089 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
2090 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2095 netdev_dev->cache_valid |= VALID_IN4;
2097 *address = netdev_dev->address;
2098 *netmask = netdev_dev->netmask;
2099 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
/* Sets the device's IPv4 address with SIOCSIFADDR and records 'address' and
 * 'netmask' in the cache.  The SIOCSIFNETMASK ioctl is issued only when
 * 'address' is not INADDR_ANY. */
2103 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2104 struct in_addr netmask)
2106 struct netdev_dev_linux *netdev_dev =
2107 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2110 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2112 netdev_dev->cache_valid |= VALID_IN4;
2113 netdev_dev->address = address;
2114 netdev_dev->netmask = netmask;
2115 if (address.s_addr != INADDR_ANY) {
2116 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2117 "SIOCSIFNETMASK", netmask);
/* Parses one line in the layout used by /proc/net/if_inet6: sixteen two-digit
 * hex bytes stored into 'in6', four hex fields that are skipped, and an
 * interface name of up to 16 characters stored into 'ifname'. */
2124 parse_if_inet6_line(const char *line,
2125 struct in6_addr *in6, char ifname[16 + 1])
2127 uint8_t *s6 = in6->s6_addr;
2128 #define X8 "%2"SCNx8
2130 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2131 "%*x %*x %*x %*x %16s\n",
2132 &s6[0], &s6[1], &s6[2], &s6[3],
2133 &s6[4], &s6[5], &s6[6], &s6[7],
2134 &s6[8], &s6[9], &s6[10], &s6[11],
2135 &s6[12], &s6[13], &s6[14], &s6[15],
2139 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2140 * 'in6' is non-null) and returns true. Otherwise, returns false. */
/* The result is cached under VALID_IN6.  On a cache miss the address defaults
 * to in6addr_any and /proc/net/if_inet6 is scanned for a line whose interface
 * name equals this device's name.
 * NOTE(review): '*in6' is assigned below without a visible null check --
 * confirm how a null 'in6' is handled in the full file. */
2142 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2144 struct netdev_dev_linux *netdev_dev =
2145 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2146 if (!(netdev_dev->cache_valid & VALID_IN6)) {
2150 netdev_dev->in6 = in6addr_any;
2152 file = fopen("/proc/net/if_inet6", "r");
2154 const char *name = netdev_get_name(netdev_);
2155 while (fgets(line, sizeof line, file)) {
2156 struct in6_addr in6_tmp;
2157 char ifname[16 + 1];
2158 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2159 && !strcmp(name, ifname))
2161 netdev_dev->in6 = in6_tmp;
2167 netdev_dev->cache_valid |= VALID_IN6;
2169 *in6 = netdev_dev->in6;
/* Zeroes '*sa' and then fills it with an AF_INET sockaddr_in that carries
 * 'addr'. */
2174 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2176 struct sockaddr_in sin;
2177 memset(&sin, 0, sizeof sin);
2178 sin.sin_family = AF_INET;
2179 sin.sin_addr = addr;
2182 memset(sa, 0, sizeof *sa);
2183 memcpy(sa, &sin, sizeof sin);
/* Issues address ioctl 'ioctl_nr' (named 'ioctl_name') on 'netdev', with the
 * device name in ifr_name and 'addr' placed in ifr_addr. */
2187 do_set_addr(struct netdev *netdev,
2188 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2191 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2192 make_in4_sockaddr(&ifr.ifr_addr, addr);
2194 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2198 /* Adds 'router' as a default IP gateway. */
/* Builds a route whose destination and genmask are INADDR_ANY and whose
 * gateway is 'router', flags it RTF_UP | RTF_GATEWAY, and adds it with
 * SIOCADDRT on af_inet_sock.  'netdev' itself is unused. */
2200 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2202 struct in_addr any = { INADDR_ANY };
2206 memset(&rt, 0, sizeof rt);
2207 make_in4_sockaddr(&rt.rt_dst, any);
2208 make_in4_sockaddr(&rt.rt_gateway, router);
2209 make_in4_sockaddr(&rt.rt_genmask, any);
2210 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2211 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2213 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
/* Scans /proc/net/route for a route that is up and whose masked destination
 * matches 'host'.  '*netdev_name' starts out NULL and receives an xstrdup()'d
 * copy of the interface name on a match.  '*next_hop' is set to 0 when the
 * host is directly reachable, otherwise to the route's gateway. */
2219 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2222 static const char fn[] = "/proc/net/route";
2227 *netdev_name = NULL;
2228 stream = fopen(fn, "r");
2229 if (stream == NULL) {
2230 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2235 while (fgets(line, sizeof line, stream)) {
2238 ovs_be32 dest, gateway, mask;
2239 int refcnt, metric, mtu;
2240 unsigned int flags, use, window, irtt;
2243 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2245 iface, &dest, &gateway, &flags, &refcnt,
2246 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2248 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2252 if (!(flags & RTF_UP)) {
2253 /* Skip routes that aren't up. */
2257 /* The output of 'dest', 'mask', and 'gateway' were given in
2258 * network byte order, so we don't need any endian
2259 * conversions here. */
2260 if ((dest & mask) == (host->s_addr & mask)) {
2262 /* The host is directly reachable. */
2263 next_hop->s_addr = 0;
2265 /* To reach the host, we must go through a gateway. */
2266 next_hop->s_addr = gateway;
2268 *netdev_name = xstrdup(iface);
/* After netdev_linux_get_drvinfo(), adds "driver_name", "driver_version", and
 * "firmware_version" entries to 'smap' from netdev_dev->drvinfo. */
2280 netdev_linux_get_drv_info(const struct netdev *netdev, struct smap *smap)
2283 struct netdev_dev_linux *netdev_dev =
2284 netdev_dev_linux_cast(netdev_get_dev(netdev));
2286 error = netdev_linux_get_drvinfo(netdev_dev);
2288 smap_add(smap, "driver_name", netdev_dev->drvinfo.driver);
2289 smap_add(smap, "driver_version", netdev_dev->drvinfo.version);
2290 smap_add(smap, "firmware_version", netdev_dev->drvinfo.fw_version);
/* Reports a fixed "driver_name" of "openvswitch"; 'netdev' is unused. */
2296 netdev_internal_get_drv_info(const struct netdev *netdev OVS_UNUSED,
2299 smap_add(smap, "driver_name", "openvswitch");
2303 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2304 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2305 * returns 0. Otherwise, it returns a positive errno value; in particular,
2306 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
/* Implemented with the SIOCGARP ioctl on af_inet_sock, restricted to this
 * device through arp_dev.  Failures other than ENXIO are logged. */
2308 netdev_linux_arp_lookup(const struct netdev *netdev,
2309 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2312 struct sockaddr_in sin;
2315 memset(&r, 0, sizeof r);
2316 memset(&sin, 0, sizeof sin);
2317 sin.sin_family = AF_INET;
2318 sin.sin_addr.s_addr = ip;
2320 memcpy(&r.arp_pa, &sin, sizeof sin);
2321 r.arp_ha.sa_family = ARPHRD_ETHER;
2323 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2324 COVERAGE_INC(netdev_arp_lookup);
2325 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2327 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2328 } else if (retval != ENXIO) {
2329 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2330 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
/* Converts NETDEV_* flags in 'nd' to interface flags; the visible tests cover
 * NETDEV_UP and NETDEV_PROMISC. */
2336 nd_to_iff_flags(enum netdev_flags nd)
2339 if (nd & NETDEV_UP) {
2342 if (nd & NETDEV_PROMISC) {
/* Converts interface flags in 'iff' to NETDEV_* flags; the visible mapping is
 * IFF_PROMISC to NETDEV_PROMISC. */
2349 iff_to_nd_flags(int iff)
2351 enum netdev_flags nd = 0;
2355 if (iff & IFF_PROMISC) {
2356 nd |= NETDEV_PROMISC;
/* Stores the current flags, converted from the cached ifi_flags, in
 * '*old_flagsp'.  Then clears the 'off' flags and sets the 'on' flags.  If
 * that changes anything, the result is applied with set_flags() and the
 * cached ifi_flags are refreshed through get_flags(). */
2362 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2363 enum netdev_flags on, enum netdev_flags *old_flagsp)
2365 struct netdev_dev_linux *netdev_dev;
2366 int old_flags, new_flags;
2369 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2370 old_flags = netdev_dev->ifi_flags;
2371 *old_flagsp = iff_to_nd_flags(old_flags);
2372 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2373 if (new_flags != old_flags) {
2374 error = set_flags(netdev, new_flags);
2375 get_flags(&netdev_dev->netdev_dev, &netdev_dev->ifi_flags);
/* Returns the 'change_seq' member of the Linux device behind 'netdev'. */
2381 netdev_linux_change_seq(const struct netdev *netdev)
2383 return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
/* Template for the netdev_class definitions that follow.  The macro
 * parameters supply the per-class entries; every other visible entry is a
 * shared netdev_linux_* function or NULL. */
2386 #define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS, \
2387 GET_FEATURES, GET_STATUS) \
2391 netdev_linux_init, \
2393 netdev_linux_wait, \
2396 netdev_linux_destroy, \
2397 NULL, /* get_config */ \
2398 NULL, /* set_config */ \
2400 netdev_linux_open, \
2401 netdev_linux_close, \
2403 netdev_linux_listen, \
2404 netdev_linux_recv, \
2405 netdev_linux_recv_wait, \
2406 netdev_linux_drain, \
2408 netdev_linux_send, \
2409 netdev_linux_send_wait, \
2411 netdev_linux_set_etheraddr, \
2412 netdev_linux_get_etheraddr, \
2413 netdev_linux_get_mtu, \
2414 netdev_linux_set_mtu, \
2415 netdev_linux_get_ifindex, \
2416 netdev_linux_get_carrier, \
2417 netdev_linux_get_carrier_resets, \
2418 netdev_linux_set_miimon_interval, \
2423 netdev_linux_set_advertisements, \
2425 netdev_linux_set_policing, \
2426 netdev_linux_get_qos_types, \
2427 netdev_linux_get_qos_capabilities, \
2428 netdev_linux_get_qos, \
2429 netdev_linux_set_qos, \
2430 netdev_linux_get_queue, \
2431 netdev_linux_set_queue, \
2432 netdev_linux_delete_queue, \
2433 netdev_linux_get_queue_stats, \
2434 netdev_linux_dump_queues, \
2435 netdev_linux_dump_queue_stats, \
2437 netdev_linux_get_in4, \
2438 netdev_linux_set_in4, \
2439 netdev_linux_get_in6, \
2440 netdev_linux_add_router, \
2441 netdev_linux_get_next_hop, \
2443 netdev_linux_arp_lookup, \
2445 netdev_linux_update_flags, \
2447 netdev_linux_change_seq \
/* Created with netdev_linux_create; stats come from netdev_linux_get_stats,
 * set_stats is not supported, and features and driver info are provided. */
2450 const struct netdev_class netdev_linux_class =
2453 netdev_linux_create,
2454 netdev_linux_get_stats,
2455 NULL, /* set_stats */
2456 netdev_linux_get_features,
2457 netdev_linux_get_drv_info);
/* Created with netdev_linux_create_tap; stats come from netdev_tap_get_stats
 * and set_stats is not supported. */
2459 const struct netdev_class netdev_tap_class =
2462 netdev_linux_create_tap,
2463 netdev_tap_get_stats,
2464 NULL, /* set_stats */
2465 netdev_linux_get_features,
2466 netdev_linux_get_drv_info);
/* Stats come from netdev_internal_get_stats and are set through
 * netdev_vport_set_stats.  get_features is not provided, and driver info
 * comes from netdev_internal_get_drv_info. */
2468 const struct netdev_class netdev_internal_class =
2471 netdev_linux_create,
2472 netdev_internal_get_stats,
2473 netdev_vport_set_stats,
2474 NULL, /* get_features */
2475 netdev_internal_get_drv_info);
2477 /* HTB traffic control class. */
2479 #define HTB_N_QUEUES 0xf000
2483 unsigned int max_rate; /* In bytes/s. */
2487 struct tc_queue tc_queue;
2488 unsigned int min_rate; /* In bytes/s. */
2489 unsigned int max_rate; /* In bytes/s. */
2490 unsigned int burst; /* In bytes. */
2491 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the struct htb that contains the device's 'tc', recovered with
 * CONTAINER_OF on the 'tc' member. */
2495 htb_get__(const struct netdev *netdev)
2497 struct netdev_dev_linux *netdev_dev =
2498 netdev_dev_linux_cast(netdev_get_dev(netdev));
2499 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
/* Allocates a struct htb, initializes its 'tc' with tc_ops_htb, records
 * 'max_rate', and installs it as the device's tc.
 * NOTE(review): 'max_rate' arrives as uint64_t; confirm that htb->max_rate is
 * wide enough to hold it without truncation. */
2503 htb_install__(struct netdev *netdev, uint64_t max_rate)
2505 struct netdev_dev_linux *netdev_dev =
2506 netdev_dev_linux_cast(netdev_get_dev(netdev));
2509 htb = xmalloc(sizeof *htb);
2510 tc_init(&htb->tc, &tc_ops_htb);
2511 htb->max_rate = max_rate;
2513 netdev_dev->tc = &htb->tc;
2516 /* Create an HTB qdisc.
2518 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
/* Any existing qdisc is deleted first, and the result of that deletion is
 * ignored.  The new qdisc is requested with handle 1:0 under TC_H_ROOT and
 * rate2quantum set to 10 in the TCA_HTB_INIT options. */
2520 htb_setup_qdisc__(struct netdev *netdev)
2523 struct tc_htb_glob opt;
2524 struct ofpbuf request;
2525 struct tcmsg *tcmsg;
2527 tc_del_qdisc(netdev);
2529 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2530 NLM_F_EXCL | NLM_F_CREATE, &request);
2534 tcmsg->tcm_handle = tc_make_handle(1, 0);
2535 tcmsg->tcm_parent = TC_H_ROOT;
2537 nl_msg_put_string(&request, TCA_KIND, "htb");
2539 memset(&opt, 0, sizeof opt);
2540 opt.rate2quantum = 10;
2544 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2545 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2546 nl_msg_end_nested(&request, opt_offset);
2548 return tc_transact(&request, NULL);
2551 /* Equivalent to "tc class replace dev <dev> classid <handle> parent <parent> htb
2552 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2554 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2555 unsigned int parent, struct htb_class *class)
2558 struct tc_htb_opt opt;
2559 struct ofpbuf request;
2560 struct tcmsg *tcmsg;
2564 error = netdev_get_mtu(netdev, &mtu);
2566 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2567 netdev_get_name(netdev));
2571 memset(&opt, 0, sizeof opt);
2572 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2573 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2574 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2575 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2576 opt.prio = class->priority;
2578 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2582 tcmsg->tcm_handle = handle;
2583 tcmsg->tcm_parent = parent;
2585 nl_msg_put_string(&request, TCA_KIND, "htb");
2586 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2587 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2588 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2589 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2590 nl_msg_end_nested(&request, opt_offset);
2592 error = tc_transact(&request, NULL);
2594 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2595 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2596 netdev_get_name(netdev),
2597 tc_get_major(handle), tc_get_minor(handle),
2598 tc_get_major(parent), tc_get_minor(parent),
2599 class->min_rate, class->max_rate,
2600 class->burst, class->priority, strerror(error));
2605 /* Parses Netlink attributes in 'nl_options' for HTB parameters and stores a
2606 * description of them into 'class'. The description complies with the
2607 * specification given in the vswitch database documentation for linux-htb
2610 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2612 static const struct nl_policy tca_htb_policy[] = {
2613 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2614 .min_len = sizeof(struct tc_htb_opt) },
2617 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2618 const struct tc_htb_opt *htb;
2620 if (!nl_parse_nested(nl_options, tca_htb_policy,
2621 attrs, ARRAY_SIZE(tca_htb_policy))) {
2622 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2626 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2627 class->min_rate = htb->rate.rate;
2628 class->max_rate = htb->ceil.rate;
2629 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2630 class->priority = htb->prio;
2635 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2636 struct htb_class *options,
2637 struct netdev_queue_stats *stats)
2639 struct nlattr *nl_options;
2640 unsigned int handle;
2643 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2644 if (!error && queue_id) {
2645 unsigned int major = tc_get_major(handle);
2646 unsigned int minor = tc_get_minor(handle);
2647 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2648 *queue_id = minor - 1;
2653 if (!error && options) {
2654 error = htb_parse_tca_options__(nl_options, options);
2660 htb_parse_qdisc_details__(struct netdev *netdev,
2661 const struct smap *details, struct htb_class *hc)
2663 const char *max_rate_s;
2665 max_rate_s = smap_get(details, "max-rate");
2666 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2667 if (!hc->max_rate) {
2668 enum netdev_features current;
2670 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2671 hc->max_rate = netdev_features_to_bps(current) / 8;
2673 hc->min_rate = hc->max_rate;
2679 htb_parse_class_details__(struct netdev *netdev,
2680 const struct smap *details, struct htb_class *hc)
2682 const struct htb *htb = htb_get__(netdev);
2683 const char *min_rate_s = smap_get(details, "min-rate");
2684 const char *max_rate_s = smap_get(details, "max-rate");
2685 const char *burst_s = smap_get(details, "burst");
2686 const char *priority_s = smap_get(details, "priority");
2689 error = netdev_get_mtu(netdev, &mtu);
2691 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2692 netdev_get_name(netdev));
2696 /* HTB requires at least an mtu sized min-rate to send any traffic even
2697 * on uncongested links. */
2698 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2699 hc->min_rate = MAX(hc->min_rate, mtu);
2700 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2703 hc->max_rate = (max_rate_s
2704 ? strtoull(max_rate_s, NULL, 10) / 8
2706 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2707 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2711 * According to hints in the documentation that I've read, it is important
2712 * that 'burst' be at least as big as the largest frame that might be
2713 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2714 * but having it a bit too small is a problem. Since netdev_get_mtu()
2715 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2716 * the MTU. We actually add 64, instead of 14, as a guard in case
2717 * additional headers get tacked on somewhere that we're not aware of. */
2718 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2719 hc->burst = MAX(hc->burst, mtu + 64);
2722 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
2728 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2729 unsigned int parent, struct htb_class *options,
2730 struct netdev_queue_stats *stats)
2732 struct ofpbuf *reply;
2735 error = tc_query_class(netdev, handle, parent, &reply);
2737 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2738 ofpbuf_delete(reply);
2744 htb_tc_install(struct netdev *netdev, const struct smap *details)
2748 error = htb_setup_qdisc__(netdev);
2750 struct htb_class hc;
2752 htb_parse_qdisc_details__(netdev, details, &hc);
2753 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2754 tc_make_handle(1, 0), &hc);
2756 htb_install__(netdev, hc.max_rate);
2762 static struct htb_class *
2763 htb_class_cast__(const struct tc_queue *queue)
2765 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2769 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2770 const struct htb_class *hc)
2772 struct htb *htb = htb_get__(netdev);
2773 size_t hash = hash_int(queue_id, 0);
2774 struct tc_queue *queue;
2775 struct htb_class *hcp;
2777 queue = tc_find_queue__(netdev, queue_id, hash);
2779 hcp = htb_class_cast__(queue);
2781 hcp = xmalloc(sizeof *hcp);
2782 queue = &hcp->tc_queue;
2783 queue->queue_id = queue_id;
2784 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2787 hcp->min_rate = hc->min_rate;
2788 hcp->max_rate = hc->max_rate;
2789 hcp->burst = hc->burst;
2790 hcp->priority = hc->priority;
2794 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2797 struct nl_dump dump;
2798 struct htb_class hc;
2800 /* Get qdisc options. */
2802 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2803 htb_install__(netdev, hc.max_rate);
2806 if (!start_queue_dump(netdev, &dump)) {
2809 while (nl_dump_next(&dump, &msg)) {
2810 unsigned int queue_id;
2812 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2813 htb_update_queue__(netdev, queue_id, &hc);
2816 nl_dump_done(&dump);
2822 htb_tc_destroy(struct tc *tc)
2824 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2825 struct htb_class *hc, *next;
2827 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2828 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2836 htb_qdisc_get(const struct netdev *netdev, struct smap *details)
2838 const struct htb *htb = htb_get__(netdev);
2839 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
2844 htb_qdisc_set(struct netdev *netdev, const struct smap *details)
2846 struct htb_class hc;
2849 htb_parse_qdisc_details__(netdev, details, &hc);
2850 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2851 tc_make_handle(1, 0), &hc);
2853 htb_get__(netdev)->max_rate = hc.max_rate;
2859 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2860 const struct tc_queue *queue, struct smap *details)
2862 const struct htb_class *hc = htb_class_cast__(queue);
2864 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
2865 if (hc->min_rate != hc->max_rate) {
2866 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
2868 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
2870 smap_add_format(details, "priority", "%u", hc->priority);
2876 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2877 const struct smap *details)
2879 struct htb_class hc;
2882 error = htb_parse_class_details__(netdev, details, &hc);
2887 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2888 tc_make_handle(1, 0xfffe), &hc);
2893 htb_update_queue__(netdev, queue_id, &hc);
2898 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2900 struct htb_class *hc = htb_class_cast__(queue);
2901 struct htb *htb = htb_get__(netdev);
2904 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2906 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2913 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2914 struct netdev_queue_stats *stats)
2916 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2917 tc_make_handle(1, 0xfffe), NULL, stats);
2921 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2922 const struct ofpbuf *nlmsg,
2923 netdev_dump_queue_stats_cb *cb, void *aux)
2925 struct netdev_queue_stats stats;
2926 unsigned int handle, major, minor;
2929 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2934 major = tc_get_major(handle);
2935 minor = tc_get_minor(handle);
2936 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2937 (*cb)(minor - 1, &stats, aux);
2942 static const struct tc_ops tc_ops_htb = {
2943 "htb", /* linux_name */
2944 "linux-htb", /* ovs_name */
2945 HTB_N_QUEUES, /* n_queues */
2954 htb_class_get_stats,
2955 htb_class_dump_stats
2958 /* "linux-hfsc" traffic control class. */
2960 #define HFSC_N_QUEUES 0xf000
2968 struct tc_queue tc_queue;
2973 static struct hfsc *
2974 hfsc_get__(const struct netdev *netdev)
2976 struct netdev_dev_linux *netdev_dev;
2977 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2978 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
2981 static struct hfsc_class *
2982 hfsc_class_cast__(const struct tc_queue *queue)
2984 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
2988 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2990 struct netdev_dev_linux * netdev_dev;
2993 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2994 hfsc = xmalloc(sizeof *hfsc);
2995 tc_init(&hfsc->tc, &tc_ops_hfsc);
2996 hfsc->max_rate = max_rate;
2997 netdev_dev->tc = &hfsc->tc;
3001 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3002 const struct hfsc_class *hc)
3006 struct hfsc_class *hcp;
3007 struct tc_queue *queue;
3009 hfsc = hfsc_get__(netdev);
3010 hash = hash_int(queue_id, 0);
3012 queue = tc_find_queue__(netdev, queue_id, hash);
3014 hcp = hfsc_class_cast__(queue);
3016 hcp = xmalloc(sizeof *hcp);
3017 queue = &hcp->tc_queue;
3018 queue->queue_id = queue_id;
3019 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3022 hcp->min_rate = hc->min_rate;
3023 hcp->max_rate = hc->max_rate;
3027 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3029 const struct tc_service_curve *rsc, *fsc, *usc;
3030 static const struct nl_policy tca_hfsc_policy[] = {
3032 .type = NL_A_UNSPEC,
3034 .min_len = sizeof(struct tc_service_curve),
3037 .type = NL_A_UNSPEC,
3039 .min_len = sizeof(struct tc_service_curve),
3042 .type = NL_A_UNSPEC,
3044 .min_len = sizeof(struct tc_service_curve),
3047 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3049 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3050 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3051 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3055 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3056 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3057 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3059 if (rsc->m1 != 0 || rsc->d != 0 ||
3060 fsc->m1 != 0 || fsc->d != 0 ||
3061 usc->m1 != 0 || usc->d != 0) {
3062 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3063 "Non-linear service curves are not supported.");
3067 if (rsc->m2 != fsc->m2) {
3068 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3069 "Real-time service curves are not supported ");
3073 if (rsc->m2 > usc->m2) {
3074 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3075 "Min-rate service curve is greater than "
3076 "the max-rate service curve.");
3080 class->min_rate = fsc->m2;
3081 class->max_rate = usc->m2;
3086 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3087 struct hfsc_class *options,
3088 struct netdev_queue_stats *stats)
3091 unsigned int handle;
3092 struct nlattr *nl_options;
3094 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3100 unsigned int major, minor;
3102 major = tc_get_major(handle);
3103 minor = tc_get_minor(handle);
3104 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3105 *queue_id = minor - 1;
3112 error = hfsc_parse_tca_options__(nl_options, options);
3119 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3120 unsigned int parent, struct hfsc_class *options,
3121 struct netdev_queue_stats *stats)
3124 struct ofpbuf *reply;
3126 error = tc_query_class(netdev, handle, parent, &reply);
3131 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3132 ofpbuf_delete(reply);
3137 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct smap *details,
3138 struct hfsc_class *class)
3141 const char *max_rate_s;
3143 max_rate_s = smap_get(details, "max-rate");
3144 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3147 enum netdev_features current;
3149 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
3150 max_rate = netdev_features_to_bps(current) / 8;
3153 class->min_rate = max_rate;
3154 class->max_rate = max_rate;
3158 hfsc_parse_class_details__(struct netdev *netdev,
3159 const struct smap *details,
3160 struct hfsc_class * class)
3162 const struct hfsc *hfsc;
3163 uint32_t min_rate, max_rate;
3164 const char *min_rate_s, *max_rate_s;
3166 hfsc = hfsc_get__(netdev);
3167 min_rate_s = smap_get(details, "min-rate");
3168 max_rate_s = smap_get(details, "max-rate");
3170 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3171 min_rate = MAX(min_rate, 1);
3172 min_rate = MIN(min_rate, hfsc->max_rate);
3174 max_rate = (max_rate_s
3175 ? strtoull(max_rate_s, NULL, 10) / 8
3177 max_rate = MAX(max_rate, min_rate);
3178 max_rate = MIN(max_rate, hfsc->max_rate);
3180 class->min_rate = min_rate;
3181 class->max_rate = max_rate;
3186 /* Create an HFSC qdisc.
3188 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3190 hfsc_setup_qdisc__(struct netdev * netdev)
3192 struct tcmsg *tcmsg;
3193 struct ofpbuf request;
3194 struct tc_hfsc_qopt opt;
3196 tc_del_qdisc(netdev);
3198 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3199 NLM_F_EXCL | NLM_F_CREATE, &request);
3205 tcmsg->tcm_handle = tc_make_handle(1, 0);
3206 tcmsg->tcm_parent = TC_H_ROOT;
3208 memset(&opt, 0, sizeof opt);
3211 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3212 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3214 return tc_transact(&request, NULL);
3217 /* Create an HFSC class.
3219 * Equivalent to "tc class replace dev <dev> parent <parent> classid <handle> hfsc
3220 * sc rate <min_rate> ul rate <max_rate>" */
3222 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3223 unsigned int parent, struct hfsc_class *class)
3227 struct tcmsg *tcmsg;
3228 struct ofpbuf request;
3229 struct tc_service_curve min, max;
3231 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3237 tcmsg->tcm_handle = handle;
3238 tcmsg->tcm_parent = parent;
3242 min.m2 = class->min_rate;
3246 max.m2 = class->max_rate;
3248 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3249 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3250 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3251 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3252 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3253 nl_msg_end_nested(&request, opt_offset);
3255 error = tc_transact(&request, NULL);
3257 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3258 "min-rate %ubps, max-rate %ubps (%s)",
3259 netdev_get_name(netdev),
3260 tc_get_major(handle), tc_get_minor(handle),
3261 tc_get_major(parent), tc_get_minor(parent),
3262 class->min_rate, class->max_rate, strerror(error));
3269 hfsc_tc_install(struct netdev *netdev, const struct smap *details)
3272 struct hfsc_class class;
3274 error = hfsc_setup_qdisc__(netdev);
3280 hfsc_parse_qdisc_details__(netdev, details, &class);
3281 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3282 tc_make_handle(1, 0), &class);
3288 hfsc_install__(netdev, class.max_rate);
3293 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3296 struct nl_dump dump;
3297 struct hfsc_class hc;
3300 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3301 hfsc_install__(netdev, hc.max_rate);
3303 if (!start_queue_dump(netdev, &dump)) {
3307 while (nl_dump_next(&dump, &msg)) {
3308 unsigned int queue_id;
3310 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3311 hfsc_update_queue__(netdev, queue_id, &hc);
3315 nl_dump_done(&dump);
3320 hfsc_tc_destroy(struct tc *tc)
3323 struct hfsc_class *hc, *next;
3325 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3327 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3328 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3337 hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
3339 const struct hfsc *hfsc;
3340 hfsc = hfsc_get__(netdev);
3341 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
3346 hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
3349 struct hfsc_class class;
3351 hfsc_parse_qdisc_details__(netdev, details, &class);
3352 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3353 tc_make_handle(1, 0), &class);
3356 hfsc_get__(netdev)->max_rate = class.max_rate;
3363 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3364 const struct tc_queue *queue, struct smap *details)
3366 const struct hfsc_class *hc;
3368 hc = hfsc_class_cast__(queue);
3369 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
3370 if (hc->min_rate != hc->max_rate) {
3371 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
3377 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3378 const struct smap *details)
3381 struct hfsc_class class;
3383 error = hfsc_parse_class_details__(netdev, details, &class);
3388 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3389 tc_make_handle(1, 0xfffe), &class);
3394 hfsc_update_queue__(netdev, queue_id, &class);
3399 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3403 struct hfsc_class *hc;
3405 hc = hfsc_class_cast__(queue);
3406 hfsc = hfsc_get__(netdev);
3408 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3410 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3417 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3418 struct netdev_queue_stats *stats)
3420 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3421 tc_make_handle(1, 0xfffe), NULL, stats);
3425 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3426 const struct ofpbuf *nlmsg,
3427 netdev_dump_queue_stats_cb *cb, void *aux)
3429 struct netdev_queue_stats stats;
3430 unsigned int handle, major, minor;
3433 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3438 major = tc_get_major(handle);
3439 minor = tc_get_minor(handle);
3440 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3441 (*cb)(minor - 1, &stats, aux);
3446 static const struct tc_ops tc_ops_hfsc = {
3447 "hfsc", /* linux_name */
3448 "linux-hfsc", /* ovs_name */
3449 HFSC_N_QUEUES, /* n_queues */
3450 hfsc_tc_install, /* tc_install */
3451 hfsc_tc_load, /* tc_load */
3452 hfsc_tc_destroy, /* tc_destroy */
3453 hfsc_qdisc_get, /* qdisc_get */
3454 hfsc_qdisc_set, /* qdisc_set */
3455 hfsc_class_get, /* class_get */
3456 hfsc_class_set, /* class_set */
3457 hfsc_class_delete, /* class_delete */
3458 hfsc_class_get_stats, /* class_get_stats */
3459 hfsc_class_dump_stats /* class_dump_stats */
3462 /* "linux-default" traffic control class.
3464 * This class represents the default, unnamed Linux qdisc. It corresponds to
3465 * the "" (empty string) QoS type in the OVS database. */
3468 default_install__(struct netdev *netdev)
3470 struct netdev_dev_linux *netdev_dev =
3471 netdev_dev_linux_cast(netdev_get_dev(netdev));
3472 static struct tc *tc;
3475 tc = xmalloc(sizeof *tc);
3476 tc_init(tc, &tc_ops_default);
3478 netdev_dev->tc = tc;
3482 default_tc_install(struct netdev *netdev,
3483 const struct smap *details OVS_UNUSED)
3485 default_install__(netdev);
3490 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3492 default_install__(netdev);
3496 static const struct tc_ops tc_ops_default = {
3497 NULL, /* linux_name */
3502 NULL, /* tc_destroy */
3503 NULL, /* qdisc_get */
3504 NULL, /* qdisc_set */
3505 NULL, /* class_get */
3506 NULL, /* class_set */
3507 NULL, /* class_delete */
3508 NULL, /* class_get_stats */
3509 NULL /* class_dump_stats */
3512 /* "linux-other" traffic control class.
3517 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3519 struct netdev_dev_linux *netdev_dev =
3520 netdev_dev_linux_cast(netdev_get_dev(netdev));
3521 static struct tc *tc;
3524 tc = xmalloc(sizeof *tc);
3525 tc_init(tc, &tc_ops_other);
3527 netdev_dev->tc = tc;
3531 static const struct tc_ops tc_ops_other = {
3532 NULL, /* linux_name */
3533 "linux-other", /* ovs_name */
3535 NULL, /* tc_install */
3537 NULL, /* tc_destroy */
3538 NULL, /* qdisc_get */
3539 NULL, /* qdisc_set */
3540 NULL, /* class_get */
3541 NULL, /* class_set */
3542 NULL, /* class_delete */
3543 NULL, /* class_get_stats */
3544 NULL /* class_dump_stats */
3547 /* Traffic control. */
3549 /* Number of kernel "tc" ticks per second. */
3550 static double ticks_per_s;
3552 /* Number of kernel "jiffies" per second. This is used for the purpose of
3553 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3554 * one jiffy's worth of data.
3556 * There are two possibilities here:
3558 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3559 * approximate range of 100 to 1024. That means that we really need to
3560 * make sure that the qdisc can buffer that much data.
3562 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3563 * has finely granular timers and there's no need to fudge additional room
3564 * for buffers. (There's no extra effort needed to implement that: the
3565 * large 'buffer_hz' is used as a divisor, so practically any number will
3566 * come out as 0 in the division. Small integer results in the case of
3567 * really high dividends won't have any real effect anyhow.)
3569 static unsigned int buffer_hz;
/* Returns tc handle 'major':'minor'.
 *
 * 'major' is shifted into the upper 16 bits of the result and 'minor'
 * occupies the lower 16 bits; TC_H_MAKE masks off any bits of either that
 * fall outside its half. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    return TC_H_MAKE(major << 16, minor);
}
/* Returns the major number from 'handle', that is, its upper 16 bits shifted
 * down into the range 0...0xffff. */
static unsigned int
tc_get_major(unsigned int handle)
{
    return TC_H_MAJ(handle) >> 16;
}
/* Returns the minor number from 'handle', that is, its lower 16 bits. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    return TC_H_MIN(handle);
}
3592 static struct tcmsg *
3593 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3594 struct ofpbuf *request)
3596 struct tcmsg *tcmsg;
3600 error = get_ifindex(netdev, &ifindex);
3605 ofpbuf_init(request, 512);
3606 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3607 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3608 tcmsg->tcm_family = AF_UNSPEC;
3609 tcmsg->tcm_ifindex = ifindex;
3610 /* Caller should fill in tcmsg->tcm_handle. */
3611 /* Caller should fill in tcmsg->tcm_parent. */
3617 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3619 int error = nl_sock_transact(rtnl_sock, request, replyp);
3620 ofpbuf_uninit(request);
3624 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3625 * policing configuration.
3627 * This function is equivalent to running the following when 'add' is true:
3628 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3630 * This function is equivalent to running the following when 'add' is false:
3631 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3633 * The configuration and stats may be seen with the following command:
3634 * /sbin/tc -s qdisc show dev <devname>
3636 * Returns 0 if successful, otherwise a positive errno value.
3639 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3641 struct ofpbuf request;
3642 struct tcmsg *tcmsg;
3644 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3645 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3647 tcmsg = tc_make_request(netdev, type, flags, &request);
3651 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3652 tcmsg->tcm_parent = TC_H_INGRESS;
3653 nl_msg_put_string(&request, TCA_KIND, "ingress");
3654 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3656 error = tc_transact(&request, NULL);
3658 /* If we're deleting the qdisc, don't worry about some of the
3659 * error conditions. */
3660 if (!add && (error == ENOENT || error == EINVAL)) {
3669 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3672 * This function is equivalent to running:
3673 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3674 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3677 * The configuration and stats may be seen with the following command:
3678 * /sbin/tc -s filter show dev <devname> parent ffff:
3680 * Returns 0 if successful, otherwise a positive errno value.
3683 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3685 struct tc_police tc_police;
3686 struct ofpbuf request;
3687 struct tcmsg *tcmsg;
3688 size_t basic_offset;
3689 size_t police_offset;
3693 memset(&tc_police, 0, sizeof tc_police);
3694 tc_police.action = TC_POLICE_SHOT;
3695 tc_police.mtu = mtu;
3696 tc_fill_rate(&tc_police.rate, kbits_rate/8 * 1000, mtu);
3697 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3698 kbits_burst * 1024);
3700 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3701 NLM_F_EXCL | NLM_F_CREATE, &request);
3705 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3706 tcmsg->tcm_info = tc_make_handle(49,
3707 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3709 nl_msg_put_string(&request, TCA_KIND, "basic");
3710 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3711 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3712 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3713 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3714 nl_msg_end_nested(&request, police_offset);
3715 nl_msg_end_nested(&request, basic_offset);
3717 error = tc_transact(&request, NULL);
3728 /* The values in psched are not individually very meaningful, but they are
3729 * important. The tables below show some values seen in the wild.
3733 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3734 * (Before that, there are hints that it was 1000000000.)
3736 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3740 * -----------------------------------
3741 * [1] 000c8000 000f4240 000f4240 00000064
3742 * [2] 000003e8 00000400 000f4240 3b9aca00
3743 * [3] 000003e8 00000400 000f4240 3b9aca00
3744 * [4] 000003e8 00000400 000f4240 00000064
3745 * [5] 000003e8 00000040 000f4240 3b9aca00
3746 * [6] 000003e8 00000040 000f4240 000000f9
3748 * a b c d ticks_per_s buffer_hz
3749 * ------- --------- ---------- ------------- ----------- -------------
3750 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3751 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3752 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3753 * [4] 1,000 1,024 1,000,000 100 976,562 100
3754 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3755 * [6] 1,000 64 1,000,000 249 15,625,000 249
3757 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3758 * [2] 2.6.26-1-686-bigmem from Debian lenny
3759 * [3] 2.6.26-2-sparc64 from Debian lenny
3760 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3761 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3762 * [6] 2.6.34 from kernel.org on KVM
3764 static const char fn[] = "/proc/net/psched";
3765 unsigned int a, b, c, d;
3771 stream = fopen(fn, "r");
3773 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3777 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3778 VLOG_WARN("%s: read failed", fn);
3782 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3786 VLOG_WARN("%s: invalid scheduler parameters", fn);
3790 ticks_per_s = (double) a * c / b;
3794 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3797 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3800 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3801 * rate of 'rate' bytes per second. */
3803 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3808 return (rate * ticks) / ticks_per_s;
3811 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3812 * rate of 'rate' bytes per second. */
3814 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3819 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3822 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3823 * a transmission rate of 'rate' bytes per second. */
3825 tc_buffer_per_jiffy(unsigned int rate)
3830 return rate / buffer_hz;
3833 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3834 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3835 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3836 * stores NULL into it if it is absent.
3838 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3841 * Returns 0 if successful, otherwise a positive errno value. */
3843 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3844 struct nlattr **options)
3846 static const struct nl_policy tca_policy[] = {
3847 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3848 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3850 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3852 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3853 tca_policy, ta, ARRAY_SIZE(ta))) {
3854 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3859 *kind = nl_attr_get_string(ta[TCA_KIND]);
3863 *options = ta[TCA_OPTIONS];
3878 /* Given Netlink 'msg' that describes a class, extracts its handle (its class
3879 * ID, major and minor number together) into '*handlep', its TCA_OPTIONS
3880 * attribute into '*options', and its queue statistics into '*stats'. Any of
3881 * the output arguments may be null.
3883 * Returns 0 if successful, otherwise a positive errno value. */
3885 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3886 struct nlattr **options, struct netdev_queue_stats *stats)
3888 static const struct nl_policy tca_policy[] = {
3889 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3890 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3892 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3894 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3895 tca_policy, ta, ARRAY_SIZE(ta))) {
3896 VLOG_WARN_RL(&rl, "failed to parse class message");
3901 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3902 *handlep = tc->tcm_handle;
3906 *options = ta[TCA_OPTIONS];
3910 const struct gnet_stats_queue *gsq;
3911 struct gnet_stats_basic gsb;
3913 static const struct nl_policy stats_policy[] = {
3914 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3915 .min_len = sizeof gsb },
3916 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3917 .min_len = sizeof *gsq },
3919 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3921 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3922 sa, ARRAY_SIZE(sa))) {
3923 VLOG_WARN_RL(&rl, "failed to parse class stats");
3927 /* Alignment issues screw up the length of struct gnet_stats_basic on
3928 * some arch/bitsize combinations. Newer versions of Linux have a
3929 * struct gnet_stats_basic_packed, but we can't depend on that. The
3930 * easiest thing to do is just to make a copy. */
3931 memset(&gsb, 0, sizeof gsb);
3932 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3933 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3934 stats->tx_bytes = gsb.bytes;
3935 stats->tx_packets = gsb.packets;
3937 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3938 stats->tx_errors = gsq->drops;
3948 memset(stats, 0, sizeof *stats);
/* Builds an RTM_GETTCLASS request (with NLM_F_ECHO) for the class identified
 * by 'handle' and 'parent' on 'netdev' and hands it to tc_transact() together
 * with 'replyp'.  The rate-limited warning below reports a failed query,
 * printing both identifiers as major:minor pairs.
 *
 * NOTE(review): the original header comment that follows is cut off after its
 * first line in this excerpt, and the return statement is not visible. */
3953 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3956 tc_query_class(const struct netdev *netdev,
3957 unsigned int handle, unsigned int parent,
3958 struct ofpbuf **replyp)
3960 struct ofpbuf request;
3961 struct tcmsg *tcmsg;
3964 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3968 tcmsg->tcm_handle = handle;
3969 tcmsg->tcm_parent = parent;
3971 error = tc_transact(&request, replyp);
3973 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3974 netdev_get_name(netdev),
3975 tc_get_major(handle), tc_get_minor(handle),
3976 tc_get_major(parent), tc_get_minor(parent),
3982 /* Equivalent to "tc class del dev <name> handle <handle>". */
3984 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3986 struct ofpbuf request;
3987 struct tcmsg *tcmsg;
/* The request carries no extra Netlink flags (0). */
3990 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
/* The class is identified by its handle alone; the parent is left as 0. */
3994 tcmsg->tcm_handle = handle;
3995 tcmsg->tcm_parent = 0;
/* No reply is needed, so NULL is passed for the reply buffer. */
3997 error = tc_transact(&request, NULL);
/* The rate-limited warning below reports a failed deletion, printing the
 * handle as a major:minor pair. */
3999 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4000 netdev_get_name(netdev),
4001 tc_get_major(handle), tc_get_minor(handle),
4007 /* Equivalent to "tc qdisc del dev <name> root". */
4009 tc_del_qdisc(struct netdev *netdev)
4011 struct netdev_dev_linux *netdev_dev =
4012 netdev_dev_linux_cast(netdev_get_dev(netdev));
4013 struct ofpbuf request;
4014 struct tcmsg *tcmsg;
4017 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
/* Target the qdisc with handle 1:0 that is attached at the root. */
4021 tcmsg->tcm_handle = tc_make_handle(1, 0);
4022 tcmsg->tcm_parent = TC_H_ROOT;
/* No reply is needed, so NULL is passed for the reply buffer. */
4024 error = tc_transact(&request, NULL);
4025 if (error == EINVAL) {
4026 /* EINVAL probably means that the default qdisc was in use, in which
4027 * case we've accomplished our purpose. */
/* On success, tear down any cached traffic-control state for the device:
 * call the implementation's tc_destroy hook if it provides one, then clear
 * the cached pointer. */
4030 if (!error && netdev_dev->tc) {
4031 if (netdev_dev->tc->ops->tc_destroy) {
4032 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
4034 netdev_dev->tc = NULL;
4039 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4040 * kernel to determine what they are. Returns 0 if successful, otherwise a
4041 * positive errno value. */
4043 tc_query_qdisc(const struct netdev *netdev)
4045 struct netdev_dev_linux *netdev_dev =
4046 netdev_dev_linux_cast(netdev_get_dev(netdev));
4047 struct ofpbuf request, *qdisc;
4048 const struct tc_ops *ops;
4049 struct tcmsg *tcmsg;
/* A nonnull netdev_dev->tc means that the qdisc has already been identified.
 * NOTE(review): the comment that starts on the line numbered 4057 below has
 * lost its closing line in this excerpt. */
4053 if (netdev_dev->tc) {
4057 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4058 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4059 * 2.6.35 without that fix backported to it.
4061 * To avoid the OOPS, we must not make a request that would attempt to dump
4062 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4063 * few others. There are a few ways that I can see to do this, but most of
4064 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4065 * technique chosen here is to assume that any non-default qdisc that we
4066 * create will have a class with handle 1:0. The built-in qdiscs only have
4067 * a class with handle 0:0.
4069 * We could check for Linux 2.6.35+ and use a more straightforward method
4071 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
4075 tcmsg->tcm_handle = tc_make_handle(1, 0);
4076 tcmsg->tcm_parent = 0;
4078 /* Figure out what tc class to instantiate. */
4079 error = tc_transact(&request, &qdisc);
4083 error = tc_parse_qdisc(qdisc, &kind, NULL);
4085 ops = &tc_ops_other;
/* Map the kernel's qdisc name to one of our implementations.  A name with no
 * match is logged (through its own, separate rate limiter) and handled by
 * tc_ops_other. */
4087 ops = tc_lookup_linux_name(kind);
4089 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4090 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4092 ops = &tc_ops_other;
4095 } else if (error == ENOENT) {
4096 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4097 * other entity that doesn't have a handle 1:0. We will assume
4098 * that it's the system default qdisc. */
4099 ops = &tc_ops_default;
4102 /* Who knows? Maybe the device got deleted. */
4103 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4104 netdev_get_name(netdev), strerror(error));
4105 ops = &tc_ops_other;
4108 /* Instantiate it. */
4109 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
/* tc_load must leave netdev_dev->tc nonnull exactly when it returns 0. */
4110 assert((load_error == 0) == (netdev_dev->tc != NULL));
4111 ofpbuf_delete(qdisc);
/* An error from the query takes precedence over one from loading. */
4113 return error ? error : load_error;
4116 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4117 approximate the time to transmit packets of various lengths. For an MTU of
4118 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4119 represents two possible packet lengths; for a MTU of 513 through 1024, four
4120 possible lengths; and so on.
4122 Returns, for the specified 'mtu', the number of bits that packet lengths
4123 need to be shifted right to fit within such a 256-entry table. */
4125 tc_calc_cell_log(unsigned int mtu)
/* NOTE(review): the condition guarding this default is not visible in this
 * excerpt; presumably it applies when the caller supplies no MTU -- confirm. */
4130 mtu = ETH_PAYLOAD_MAX;
/* Add the Ethernet and VLAN header lengths on top of the MTU. */
4132 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
/* 'cell_log' counts iterations while 'mtu' is still 256 or more.  The loop
 * body, which must shrink 'mtu', is not visible in this excerpt. */
4134 for (cell_log = 0; mtu >= 256; cell_log++) {
/* Zeroes '*rate', sets its 'cell_log' from tc_calc_cell_log(mtu), and sets
 * its 'mpu' to ETH_TOTAL_MIN.  The 'overhead' and 'cell_align' members are
 * never named in code because some distro headers lack them; where they
 * exist, the memset already leaves them zero.
 *
 * NOTE(review): the original header comment that follows is cut off in this
 * excerpt, and the statement that stores 'Bps' is not visible here. */
4141 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4144 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4146 memset(rate, 0, sizeof *rate);
4147 rate->cell_log = tc_calc_cell_log(mtu);
4148 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4149 /* rate->cell_align = 0; */ /* distro headers. */
4150 rate->mpu = ETH_TOTAL_MIN;
4154 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4155 * attribute of the specified "type".
4157 * See tc_calc_cell_log() above for a description of "rtab"s. */
4159 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
/* Reserve TC_RTAB_SIZE bytes of uninitialized attribute payload in 'msg';
 * the loop below fills in every entry. */
4164 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4165 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
/* Entry 'i' is costed for a packet of (i + 1) << cell_log bytes, but never
 * for less than rate->mpu bytes. */
4166 unsigned packet_size = (i + 1) << rate->cell_log;
4167 if (packet_size < rate->mpu) {
4168 packet_size = rate->mpu;
4170 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* Converts a burst size into ticks at 'Bps' with tc_bytes_to_ticks(), using
 * the larger of 'burst_bytes' and a floor of tc_buffer_per_jiffy(Bps) + mtu.
 *
 * NOTE(review): 'burst_bytes' is a uint64_t but tc_bytes_to_ticks() takes an
 * unsigned int size, so a value that does not fit is narrowed in the call --
 * confirm that callers never pass one.  The original header comment that
 * follows is cut off in this excerpt. */
4174 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
4175 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
4176 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
4179 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
4181 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
4182 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
4185 /* Linux-only functions declared in netdev-linux.h */
4187 /* Returns a fd for an AF_INET socket or a negative errno value. */
4189 netdev_linux_get_af_inet_sock(void)
/* A nonzero result from netdev_linux_init() is negated and returned;
 * otherwise the 'af_inet_sock' descriptor, which is defined outside this
 * excerpt, is returned. */
4191 int error = netdev_linux_init();
4192 return error ? -error : af_inet_sock;
4195 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4196 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4198 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4199 const char *flag_name, bool enable)
4201 const char *netdev_name = netdev_get_name(netdev);
4202 struct ethtool_value evalue;
/* Step 1: read the current flags.  'evalue' is cast to struct ethtool_cmd *
 * only because that is the parameter type of netdev_linux_do_ethtool(). */
4206 COVERAGE_INC(netdev_get_ethtool);
4207 memset(&evalue, 0, sizeof evalue);
4208 error = netdev_linux_do_ethtool(netdev_name,
4209 (struct ethtool_cmd *)&evalue,
4210 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* Step 2: write the flags back with only the 'flag' bit changed, remembering
 * the value that was requested in 'new_flags'. */
4215 COVERAGE_INC(netdev_set_ethtool);
4216 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4217 error = netdev_linux_do_ethtool(netdev_name,
4218 (struct ethtool_cmd *)&evalue,
4219 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
/* Step 3: read the flags again so that the result can be verified. */
4224 COVERAGE_INC(netdev_get_ethtool);
4225 memset(&evalue, 0, sizeof evalue);
4226 error = netdev_linux_do_ethtool(netdev_name,
4227 (struct ethtool_cmd *)&evalue,
4228 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* A mismatch means that the set request did not actually take effect. */
4233 if (new_flags != evalue.data) {
4234 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4235 "device %s failed", enable ? "enable" : "disable",
4236 flag_name, netdev_name);
4243 /* Utility functions. */
4245 /* Copies 'src' into 'dst', performing format conversion in the process. */
4247 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4248 const struct rtnl_link_stats *src)
/* A field-by-field copy: each member of 'dst' assigned below takes its value
 * from the member of 'src' that has the same name. */
4250 dst->rx_packets = src->rx_packets;
4251 dst->tx_packets = src->tx_packets;
4252 dst->rx_bytes = src->rx_bytes;
4253 dst->tx_bytes = src->tx_bytes;
4254 dst->rx_errors = src->rx_errors;
4255 dst->tx_errors = src->tx_errors;
4256 dst->rx_dropped = src->rx_dropped;
4257 dst->tx_dropped = src->tx_dropped;
4258 dst->multicast = src->multicast;
4259 dst->collisions = src->collisions;
4260 dst->rx_length_errors = src->rx_length_errors;
4261 dst->rx_over_errors = src->rx_over_errors;
4262 dst->rx_crc_errors = src->rx_crc_errors;
4263 dst->rx_frame_errors = src->rx_frame_errors;
4264 dst->rx_fifo_errors = src->rx_fifo_errors;
4265 dst->rx_missed_errors = src->rx_missed_errors;
4266 dst->tx_aborted_errors = src->tx_aborted_errors;
4267 dst->tx_carrier_errors = src->tx_carrier_errors;
4268 dst->tx_fifo_errors = src->tx_fifo_errors;
4269 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4270 dst->tx_window_errors = src->tx_window_errors;
/* Fetches statistics for the interface with index 'ifindex' by sending an
 * RTM_GETLINK request on 'rtnl_sock' and converting the IFLA_STATS attribute
 * of the reply into '*stats'.  The reply is freed on every path visible here.
 *
 * NOTE(review): the return statements are not visible in this excerpt. */
4274 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4276 /* Policy for RTNLGRP_LINK messages.
4278 * There are *many* more fields in these messages, but currently we only
4279 * care about these fields. */
4280 static const struct nl_policy rtnlgrp_link_policy[] = {
4281 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4282 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4283 .min_len = sizeof(struct rtnl_link_stats) },
4286 struct ofpbuf request;
4287 struct ofpbuf *reply;
4288 struct ifinfomsg *ifi;
4289 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
/* Build the request: a Netlink header followed by a zeroed struct ifinfomsg
 * that names the interface by index. */
4292 ofpbuf_init(&request, 0);
4293 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4294 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4295 ifi->ifi_family = PF_UNSPEC;
4296 ifi->ifi_index = ifindex;
4297 error = nl_sock_transact(rtnl_sock, &request, &reply);
/* The request buffer is released as soon as the transaction has returned. */
4298 ofpbuf_uninit(&request);
/* The attributes start after the Netlink header and the struct ifinfomsg. */
4303 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4304 rtnlgrp_link_policy,
4305 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4306 ofpbuf_delete(reply);
/* IFLA_STATS is optional in the policy above, so its absence has to be
 * checked for explicitly. */
4310 if (!attrs[IFLA_STATS]) {
4311 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4312 ofpbuf_delete(reply);
4316 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4318 ofpbuf_delete(reply);
/* Looks for 'netdev_name' in /proc/net/dev, reading the file one line at a
 * time, and parses that device's counters into '*stats'.  A line that does
 * not yield all 15 conversions is reported as a parse error.
 *
 * NOTE(review): several lines of this function, including most of the scan
 * arguments and every return statement, are not visible in this excerpt. */
4324 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4326 static const char fn[] = "/proc/net/dev";
4331 stream = fopen(fn, "r");
4333 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4338 while (fgets(line, sizeof line, stream)) {
/* X64 is shorthand for the scanf conversion of one uint64_t. */
4341 #define X64 "%"SCNu64
4344 X64 X64 X64 X64 X64 X64 X64 "%*u"
4345 X64 X64 X64 X64 X64 X64 X64 "%*u",
4351 &stats->rx_fifo_errors,
4352 &stats->rx_frame_errors,
4358 &stats->tx_fifo_errors,
4360 &stats->tx_carrier_errors) != 15) {
4361 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4362 } else if (!strcmp(devname, netdev_name)) {
/* This line describes the requested device.  The counters below are set to
 * UINT64_MAX.  NOTE(review): presumably that value marks a counter as
 * unavailable from this source -- confirm. */
4363 stats->rx_length_errors = UINT64_MAX;
4364 stats->rx_over_errors = UINT64_MAX;
4365 stats->rx_crc_errors = UINT64_MAX;
4366 stats->rx_missed_errors = UINT64_MAX;
4367 stats->tx_aborted_errors = UINT64_MAX;
4368 stats->tx_heartbeat_errors = UINT64_MAX;
4369 stats->tx_window_errors = UINT64_MAX;
/* Reported when the file has no entry for the requested device. */
4375 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Reads the interface flags of 'dev' with a SIOCGIFFLAGS request issued
 * through netdev_linux_do_ioctl() and stores ifr.ifr_flags into '*flags'.
 *
 * NOTE(review): the remainder of the ioctl call and the return statement are
 * not visible in this excerpt. */
4381 get_flags(const struct netdev_dev *dev, unsigned int *flags)
4387 error = netdev_linux_do_ioctl(dev->name, &ifr, SIOCGIFFLAGS,
4390 *flags = ifr.ifr_flags;
/* Sets the interface flags of 'netdev' to 'flags' with a SIOCSIFFLAGS request
 * and returns whatever netdev_linux_do_ioctl() returns.
 *
 * NOTE(review): the remainder of the ioctl call is not visible in this
 * excerpt. */
4396 set_flags(struct netdev *netdev, unsigned int flags)
4400 ifr.ifr_flags = flags;
4401 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
/* Looks up the interface index of the device named 'netdev_name' with a
 * SIOCGIFINDEX ioctl on 'af_inet_sock' and returns ifr.ifr_ifindex.  A failed
 * ioctl is reported with a rate-limited warning.
 *
 * NOTE(review): the value returned on failure is not visible in this excerpt;
 * get_ifindex() negates this function's result to obtain an error code, so
 * presumably it is a negative errno value -- confirm. */
4406 do_get_ifindex(const char *netdev_name)
4410 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4411 COVERAGE_INC(netdev_get_ifindex);
4412 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4413 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4414 netdev_name, strerror(errno));
4417 return ifr.ifr_ifindex;
/* Stores the interface index of 'netdev_' into '*ifindexp' and returns the
 * error code recorded alongside it.  Both values are cached in the device:
 * do_get_ifindex() is consulted only while VALID_IFINDEX is not yet set in
 * 'cache_valid'. */
4421 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4423 struct netdev_dev_linux *netdev_dev =
4424 netdev_dev_linux_cast(netdev_get_dev(netdev_));
4426 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4427 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
/* Failure case: record the negated result as the error and cache an ifindex
 * of 0. */
4430 netdev_dev->get_ifindex_error = -ifindex;
4431 netdev_dev->ifindex = 0;
/* Success case: record no error and cache the ifindex. */
4433 netdev_dev->get_ifindex_error = 0;
4434 netdev_dev->ifindex = ifindex;
/* Either outcome is cached, so a failed lookup is not retried either. */
4436 netdev_dev->cache_valid |= VALID_IFINDEX;
4439 *ifindexp = netdev_dev->ifindex;
4440 return netdev_dev->get_ifindex_error;
/* Reads the hardware address of the device named 'netdev_name' with a
 * SIOCGIFHWADDR ioctl on 'af_inet_sock' and copies ETH_ADDR_LEN bytes of it
 * into 'ea'.
 *
 * NOTE(review): the return statements are not visible in this excerpt. */
4444 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4449 memset(&ifr, 0, sizeof ifr);
4450 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4451 COVERAGE_INC(netdev_get_hwaddr);
4452 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4453 /* ENODEV probably means that a vif disappeared asynchronously and
4454 * hasn't been removed from the database yet, so reduce the log level
4455 * to INFO for that case. */
4456 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4457 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4458 netdev_name, strerror(errno));
/* Only AF_UNSPEC and ARPHRD_ETHER pass silently; any other address family
 * triggers the warning below. */
4461 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4462 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4463 VLOG_WARN("%s device has unknown hardware address family %d",
4464 netdev_name, hwaddr_family);
4466 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware address of the device named 'netdev_name' to 'mac' with
 * a SIOCSIFHWADDR ioctl on 'af_inet_sock'.  The address is submitted with
 * family ARPHRD_ETHER, and a failed ioctl is logged as an error.
 *
 * NOTE(review): the return statements are not visible in this excerpt. */
4471 set_etheraddr(const char *netdev_name,
4472 const uint8_t mac[ETH_ADDR_LEN])
4476 memset(&ifr, 0, sizeof ifr);
4477 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4478 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4479 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4480 COVERAGE_INC(netdev_set_hwaddr);
4481 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4482 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4483 netdev_name, strerror(errno));
/* Issues a SIOCETHTOOL ioctl on 'af_inet_sock' for the device named 'name',
 * passing 'ecmd' as the request's data pointer.  'cmd_name' is used only in
 * the log message.
 *
 * NOTE(review): how 'cmd' is stored into the request and the return
 * statements are not visible in this excerpt. */
4490 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4491 int cmd, const char *cmd_name)
4495 memset(&ifr, 0, sizeof ifr);
4496 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4497 ifr.ifr_data = (caddr_t) ecmd;
4500 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
/* Every failure other than EOPNOTSUPP is logged with a rate-limited
 * warning. */
4503 if (errno != EOPNOTSUPP) {
4504 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4505 "failed: %s", cmd_name, name, strerror(errno));
4507 /* The device doesn't support this operation. That's pretty
4508 * common, so there's no point in logging anything. */
/* Runs ioctl 'cmd' on 'af_inet_sock' for the device named 'name'.  The name
 * is copied into ifr->ifr_name here, so callers need to fill in only the
 * request-specific members of '*ifr'.  A failure is logged at debug level,
 * rate-limited, with 'cmd_name' identifying the request.
 *
 * NOTE(review): the end of the log call and the return statements are not
 * visible in this excerpt. */
4515 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4516 const char *cmd_name)
4518 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4519 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4520 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
/* Issues ioctl 'cmd' for 'netdev' with the request's address family set to
 * AF_INET, then copies the IPv4 address found in ifr.ifr_addr into '*ip'.
 * 'cmd_name' is passed through to netdev_linux_do_ioctl() for logging.
 *
 * NOTE(review): the check on 'error' and the return statement are not visible
 * in this excerpt. */
4528 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4529 int cmd, const char *cmd_name)
4534 ifr.ifr_addr.sa_family = AF_INET;
4535 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
/* Reinterpret the generic socket address as an IPv4 socket address. */
4537 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4538 *ip = sin->sin_addr;
4543 /* Returns an AF_PACKET raw socket or a negative errno value. */
4545 af_packet_sock(void)
4547 static int sock = INT_MIN;
4549 if (sock == INT_MIN) {
4550 sock = socket(AF_PACKET, SOCK_RAW, 0);
4552 set_nonblocking(sock);
4555 VLOG_ERR("failed to create packet socket: %s", strerror(errno));