2 * Copyright (c) 2009, 2010, 2011, 2012 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
24 #include <arpa/inet.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
30 #include <linux/types.h>
31 #include <linux/ethtool.h>
32 #include <linux/mii.h>
33 #include <linux/pkt_cls.h>
34 #include <linux/pkt_sched.h>
35 #include <linux/rtnetlink.h>
36 #include <linux/sockios.h>
37 #include <linux/version.h>
38 #include <sys/types.h>
39 #include <sys/ioctl.h>
40 #include <sys/socket.h>
41 #include <netpacket/packet.h>
43 #include <net/if_arp.h>
44 #include <net/if_packet.h>
45 #include <net/route.h>
46 #include <netinet/in.h>
53 #include "dpif-linux.h"
54 #include "dynamic-string.h"
55 #include "fatal-signal.h"
58 #include "netdev-provider.h"
59 #include "netdev-vport.h"
61 #include "netlink-notifier.h"
62 #include "netlink-socket.h"
64 #include "openflow/openflow.h"
66 #include "poll-loop.h"
67 #include "rtnetlink-link.h"
68 #include "socket-util.h"
74 VLOG_DEFINE_THIS_MODULE(netdev_linux);
76 COVERAGE_DEFINE(netdev_set_policing);
77 COVERAGE_DEFINE(netdev_arp_lookup);
78 COVERAGE_DEFINE(netdev_get_ifindex);
79 COVERAGE_DEFINE(netdev_get_hwaddr);
80 COVERAGE_DEFINE(netdev_set_hwaddr);
81 COVERAGE_DEFINE(netdev_ethtool);
84 /* These were introduced in Linux 2.6.14, so they might be missing if we have
 * old headers. */
86 #ifndef ADVERTISED_Pause
87 #define ADVERTISED_Pause (1 << 13)
89 #ifndef ADVERTISED_Asym_Pause
90 #define ADVERTISED_Asym_Pause (1 << 14)
93 /* These were introduced in Linux 2.6.24, so they might be missing if we
94 * have old headers. */
95 #ifndef ETHTOOL_GFLAGS
96 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
98 #ifndef ETHTOOL_SFLAGS
99 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
102 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
 * headers. */
105 #define TC_RTAB_SIZE 1024
108 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
109 static int cache_notifier_refcount;
112 VALID_IFINDEX = 1 << 0,
113 VALID_ETHERADDR = 1 << 1,
117 VALID_POLICING = 1 << 5,
118 VALID_VPORT_STAT_ERROR = 1 << 6,
119 VALID_DRVINFO = 1 << 7,
120 VALID_FEATURES = 1 << 8,
128 /* Traffic control. */
130 /* An instance of a traffic control class. Always associated with a particular
 * network device.
 *
133 * Each TC implementation subclasses this with whatever additional data it
 * needs. */
136 const struct tc_ops *ops;
137 struct hmap queues; /* Contains "struct tc_queue"s.
138 * Read by generic TC layer.
139 * Written only by TC implementation. */
142 /* One traffic control queue.
144 * Each TC implementation subclasses this with whatever additional data it
 * needs. */
147 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
148 unsigned int queue_id; /* OpenFlow queue ID. */
151 /* A particular kind of traffic control. Each implementation generally maps to
152 * one particular Linux qdisc class.
154 * The functions below return 0 if successful or a positive errno value on
155 * failure, except where otherwise noted. All of them must be provided, except
156 * where otherwise noted. */
158 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
159 * This is null for tc_ops_default and tc_ops_other, for which there are no
160 * appropriate values. */
161 const char *linux_name;
163 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
164 const char *ovs_name;
166 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
167 * queues. The queues are numbered 0 through n_queues - 1. */
168 unsigned int n_queues;
170 /* Called to install this TC class on 'netdev'. The implementation should
171 * make the Netlink calls required to set up 'netdev' with the right qdisc
172 * and configure it according to 'details'. The implementation may assume
173 * that the current qdisc is the default; that is, there is no need for it
174 * to delete the current qdisc before installing itself.
176 * The contents of 'details' should be documented as valid for 'ovs_name'
177 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
178 * (which is built as ovs-vswitchd.conf.db(8)).
180 * This function must return 0 if and only if it sets 'netdev->tc' to an
181 * initialized 'struct tc'.
183 * (This function is null for tc_ops_other, which cannot be installed. For
184 * other TC classes it should always be nonnull.) */
185 int (*tc_install)(struct netdev *netdev, const struct shash *details);
187 /* Called when the netdev code determines (through a Netlink query) that
188 * this TC class's qdisc is installed on 'netdev', but we didn't install
189 * it ourselves and so don't know any of the details.
191 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
192 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
193 * implementation should parse the other attributes of 'nlmsg' as
194 * necessary to determine its configuration. If necessary it should also
195 * use Netlink queries to determine the configuration of queues on
 * 'netdev'.
 *
198 * This function must return 0 if and only if it sets 'netdev->tc' to an
199 * initialized 'struct tc'. */
200 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
202 /* Destroys the data structures allocated by the implementation as part of
203 * 'tc'. (This includes destroying 'tc->queues' by calling
 * tc_destroy(tc).)
 *
206 * The implementation should not need to perform any Netlink calls. If
207 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
208 * (But it may not be desirable.)
210 * This function may be null if 'tc' is trivial. */
211 void (*tc_destroy)(struct tc *tc);
213 /* Retrieves details of 'netdev->tc' configuration into 'details'.
215 * The implementation should not need to perform any Netlink calls, because
216 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
217 * cached the configuration.
219 * The contents of 'details' should be documented as valid for 'ovs_name'
220 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
221 * (which is built as ovs-vswitchd.conf.db(8)).
223 * This function may be null if 'tc' is not configurable. */
225 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
227 /* Reconfigures 'netdev->tc' according to 'details', performing any
228 * required Netlink calls to complete the reconfiguration.
230 * The contents of 'details' should be documented as valid for 'ovs_name'
231 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
232 * (which is built as ovs-vswitchd.conf.db(8)).
234 * This function may be null if 'tc' is not configurable. */
236 int (*qdisc_set)(struct netdev *, const struct shash *details);
238 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
239 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
241 * The contents of 'details' should be documented as valid for 'ovs_name'
242 * in the "other_config" column in the "Queue" table in
243 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
245 * The implementation should not need to perform any Netlink calls, because
246 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
247 * cached the queue configuration.
249 * This function may be null if 'tc' does not have queues ('n_queues' is
 * zero). */
251 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
252 struct shash *details);
254 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
255 * 'details', performing any required Netlink calls to complete the
256 * reconfiguration. The caller ensures that 'queue_id' is less than
 * 'n_queues'.
 *
259 * The contents of 'details' should be documented as valid for 'ovs_name'
260 * in the "other_config" column in the "Queue" table in
261 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
263 * This function may be null if 'tc' does not have queues or its queues are
264 * not configurable. */
265 int (*class_set)(struct netdev *, unsigned int queue_id,
266 const struct shash *details);
268 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
269 * tc_queue's within 'netdev->tc->queues'.
271 * This function may be null if 'tc' does not have queues or its queues
272 * cannot be deleted. */
273 int (*class_delete)(struct netdev *, struct tc_queue *queue);
275 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
276 * 'struct tc_queue's within 'netdev->tc->queues'.
278 * On success, initializes '*stats'.
280 * This function may be null if 'tc' does not have queues or if it cannot
281 * report queue statistics. */
282 int (*class_get_stats)(const struct netdev *netdev,
283 const struct tc_queue *queue,
284 struct netdev_queue_stats *stats);
286 /* Extracts queue stats from 'nlmsg', which is a response to a
287 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
289 * This function may be null if 'tc' does not have queues or if it cannot
290 * report queue statistics. */
291 int (*class_dump_stats)(const struct netdev *netdev,
292 const struct ofpbuf *nlmsg,
293 netdev_dump_queue_stats_cb *cb, void *aux);
/* Initializes 'tc' for use by a TC implementation.
 *
 * The statement visible here sets up 'tc->queues' with hmap_init().  'ops' is
 * presumably recorded in 'tc->ops' (netdev_linux_destroy() reads
 * 'tc->ops->tc_destroy'), but that assignment is not visible in this excerpt
 * -- TODO confirm. */
297 tc_init(struct tc *tc, const struct tc_ops *ops)
300 hmap_init(&tc->queues);
/* Generic part of tearing down 'tc': hands 'tc->queues' to hmap_destroy().
 * Nothing else is done to 'tc' in the code visible here. */
304 tc_destroy(struct tc *tc)
306 hmap_destroy(&tc->queues);
/* The traffic-control implementations known to this file.  The four tc_ops
 * objects are only declared at this point; 'tcs' collects pointers to them in
 * the order listed below.  (How the array is terminated is not visible in
 * this excerpt.) */
309 static const struct tc_ops tc_ops_htb;
310 static const struct tc_ops tc_ops_hfsc;
311 static const struct tc_ops tc_ops_default;
312 static const struct tc_ops tc_ops_other;
314 static const struct tc_ops *tcs[] = {
315 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
316 &tc_ops_hfsc, /* Hierarchical fair service curve. */
317 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
318 &tc_ops_other, /* Some other qdisc. */
322 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
323 static unsigned int tc_get_major(unsigned int handle);
324 static unsigned int tc_get_minor(unsigned int handle);
326 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
327 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
328 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
330 static struct tcmsg *tc_make_request(const struct netdev *, int type,
331 unsigned int flags, struct ofpbuf *);
332 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
333 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
334 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
337 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
338 struct nlattr **options);
339 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
340 struct nlattr **options,
341 struct netdev_queue_stats *);
342 static int tc_query_class(const struct netdev *,
343 unsigned int handle, unsigned int parent,
344 struct ofpbuf **replyp);
345 static int tc_delete_class(const struct netdev *, unsigned int handle);
347 static int tc_del_qdisc(struct netdev *netdev);
348 static int tc_query_qdisc(const struct netdev *netdev);
350 static int tc_calc_cell_log(unsigned int mtu);
351 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
352 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
353 const struct tc_ratespec *rate);
354 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
/* Linux-specific state of one network device.  The generic part is the
 * embedded 'netdev_dev' member, from which netdev_dev_linux_cast() recovers
 * the enclosing structure. */
356 struct netdev_dev_linux {
357 struct netdev_dev netdev_dev;
359 struct shash_node *shash_node;
/* Bitmap of VALID_* bits saying which cached members below are current. */
360 unsigned int cache_valid;
/* Set to 1 by netdev_linux_create() and tested in
 * netdev_dev_linux_changed(); presumably a change counter -- TODO confirm. */
361 unsigned int change_seq;
363 bool miimon; /* Link status of last poll. */
364 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
365 struct timer miimon_timer;
367 /* The following are figured out "on demand" only. They are only valid
368 * when the corresponding VALID_* bit in 'cache_valid' is set. */
370 uint8_t etheraddr[ETH_ADDR_LEN];
371 struct in_addr address, netmask;
/* Interface flags as last reported; netdev_linux_get_carrier() tests
 * IFF_RUNNING in them. */
374 unsigned int ifi_flags;
/* Incremented by netdev_dev_linux_changed() whenever IFF_RUNNING toggles. */
375 long long int carrier_resets;
376 uint32_t kbits_rate; /* Policing data. */
377 uint32_t kbits_burst;
378 int vport_stats_error; /* Cached error code from vport_get_stats().
379 0 or an errno value. */
380 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
381 int ether_addr_error; /* Cached error code from set/get etheraddr. */
382 int netdev_policing_error; /* Cached error code from set policing. */
383 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
385 uint32_t current; /* Cached from ETHTOOL_GSET. */
386 uint32_t advertised; /* Cached from ETHTOOL_GSET. */
387 uint32_t supported; /* Cached from ETHTOOL_GSET. */
388 uint32_t peer; /* Cached from ETHTOOL_GSET. */
390 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
/* Tap-device state; the rest of the file reaches it as 'state.tap'. */
394 struct tap_state tap;
/* Linux-specific state of one open handle on a device.  'netdev' is the
 * embedded generic part; netdev_linux_cast() recovers the enclosing structure
 * from it.  Functions in this file also use an 'fd' member whose declaration
 * is not visible in this excerpt. */
398 struct netdev_linux {
399 struct netdev netdev;
403 /* Sockets used for ioctl operations. */
404 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
406 /* A Netlink routing socket that is not subscribed to any multicast groups. */
407 static struct nl_sock *rtnl_sock;
409 /* This is set pretty low because we probably won't learn anything from the
410 * additional log messages. */
411 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
413 static int netdev_linux_init(void);
415 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
416 int cmd, const char *cmd_name);
417 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
418 const char *cmd_name);
419 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
420 int cmd, const char *cmd_name);
421 static int get_flags(const struct netdev_dev *, unsigned int *flags);
422 static int set_flags(struct netdev *, unsigned int flags);
423 static int do_get_ifindex(const char *netdev_name);
424 static int get_ifindex(const struct netdev *, int *ifindexp);
425 static int do_set_addr(struct netdev *netdev,
426 int ioctl_nr, const char *ioctl_name,
427 struct in_addr addr);
428 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
429 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
430 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
431 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
432 static int af_packet_sock(void);
433 static void netdev_linux_miimon_run(void);
434 static void netdev_linux_miimon_wait(void);
/* Tells whether 'netdev_class' is one of this file's classes by comparing its
 * 'init' callback against netdev_linux_init().  The *_cast() helpers assert
 * on this result before using CONTAINER_OF. */
437 is_netdev_linux_class(const struct netdev_class *netdev_class)
439 return netdev_class->init == netdev_linux_init;
/* Converts generic 'netdev_dev' into the struct netdev_dev_linux that embeds
 * it.
 *
 * The assertion guards the CONTAINER_OF arithmetic, which is only valid when
 * 'netdev_dev' really is the 'netdev_dev' member of a struct
 * netdev_dev_linux. */
442 static struct netdev_dev_linux *
443 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
445 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
446 assert(is_netdev_linux_class(netdev_class));
448 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
/* Converts generic 'netdev' into the struct netdev_linux that embeds it.
 *
 * The class is looked up through the handle's device (netdev_get_dev()) and
 * asserted to be one of this file's classes before CONTAINER_OF is applied. */
451 static struct netdev_linux *
452 netdev_linux_cast(const struct netdev *netdev)
454 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
455 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
456 assert(is_netdev_linux_class(netdev_class));
458 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
/* Sets up the file-scope sockets: 'af_inet_sock' (an AF_INET datagram socket)
 * and 'rtnl_sock' (a NETLINK_ROUTE socket).  Each failure is logged with the
 * errno text.
 *
 * 'status' is static, presumably so that the outcome of the first call is
 * remembered and returned by later calls; the guard and the return statement
 * are not visible in this excerpt -- TODO confirm. */
462 netdev_linux_init(void)
464 static int status = -1;
466 /* Create AF_INET socket. */
467 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
/* errno is captured immediately, before any other call can overwrite it. */
468 status = af_inet_sock >= 0 ? 0 : errno;
470 VLOG_ERR("failed to create inet socket: %s", strerror(status));
473 /* Create rtnetlink socket. */
475 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
477 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
/* Periodic work for this file's netdev classes: runs the rtnetlink link
 * machinery via rtnetlink_link_run(), then the MII link polling in
 * netdev_linux_miimon_run(). */
486 netdev_linux_run(void)
488 rtnetlink_link_run();
489 netdev_linux_miimon_run();
/* Counterpart of netdev_linux_run(): calls the wait functions of the same two
 * subsystems, rtnetlink_link_wait() and netdev_linux_miimon_wait(). */
493 netdev_linux_wait(void)
495 rtnetlink_link_wait();
496 netdev_linux_miimon_wait();
/* Makes 'netdev_dev->drvinfo' current, using the cached copy when the
 * VALID_DRVINFO bit is already set in 'cache_valid'.
 *
 * Otherwise the buffer is zeroed and filled through an ethtool request, and
 * VALID_DRVINFO is set afterwards (the condition guarding that, and the
 * ethtool command arguments, are not visible in this excerpt). */
500 netdev_linux_get_drvinfo(struct netdev_dev_linux *netdev_dev)
505 if (netdev_dev->cache_valid & VALID_DRVINFO) {
509 memset(&netdev_dev->drvinfo, 0, sizeof netdev_dev->drvinfo);
/* netdev_linux_do_ethtool() is declared to take a struct ethtool_cmd *, so
 * the drvinfo buffer is passed through that parameter by cast. */
510 error = netdev_linux_do_ethtool(netdev_dev->netdev_dev.name,
511 (struct ethtool_cmd *)&netdev_dev->drvinfo,
515 netdev_dev->cache_valid |= VALID_DRVINFO;
/* Records that 'dev' changed.
 *
 * 'ifi_flags' are the device's new interface flags.  A third parameter,
 * 'mask' (its declaration is not visible in this excerpt), selects the
 * 'cache_valid' bits that survive: callers in this file pass 0 to invalidate
 * every cached value or VALID_DRVINFO to keep only the driver info. */
521 netdev_dev_linux_changed(struct netdev_dev_linux *dev,
522 unsigned int ifi_flags,
/* The statements around this test are not visible here; it looks like a
 * guard that keeps 'change_seq' from being left at 0 -- TODO confirm. */
526 if (!dev->change_seq) {
/* XOR isolates the flags that differ; a flip of IFF_RUNNING in either
 * direction counts as one carrier reset. */
530 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
531 dev->carrier_resets++;
533 dev->ifi_flags = ifi_flags;
535 dev->cache_valid &= mask;
/* Applies the rtnetlink link notification 'change' to 'dev'.
 *
 * For RTM_NEWLINK the cache is invalidated except for the driver info, and
 * the MTU and (if nonzero) the Ethernet address carried by the message are
 * stored as fresh cached values with their error codes cleared.  The final
 * call, which invalidates the whole cache, presumably belongs to the branch
 * for other message types; the 'else' line is not visible in this excerpt. */
539 netdev_dev_linux_update(struct netdev_dev_linux *dev,
540 const struct rtnetlink_link_change *change)
542 if (change->nlmsg_type == RTM_NEWLINK) {
544 netdev_dev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
/* Re-validate the MTU from the message.  NOTE(review): any guard on
 * 'change->mtu' is not visible here. */
547 dev->mtu = change->mtu;
548 dev->cache_valid |= VALID_MTU;
549 dev->netdev_mtu_error = 0;
/* An all-zero address in the message is not cached. */
552 if (!eth_addr_is_zero(change->addr)) {
553 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
554 dev->cache_valid |= VALID_ETHERADDR;
555 dev->ether_addr_error = 0;
559 netdev_dev_linux_changed(dev, change->ifi_flags, 0);
/* Link-change callback that cache_notifier_ref() registers with
 * rtnetlink_link_notifier_create().  'aux' is unused.
 *
 * Two paths are visible.  The first looks up the device named by
 * 'change->ifname' and, if it belongs to one of this file's classes, applies
 * the change to it.  The second walks every device of netdev_linux_class,
 * re-reads its flags and invalidates its whole cache.  The conditions that
 * select between the paths are not visible in this excerpt; presumably the
 * second runs when no specific 'change' is available -- TODO confirm. */
564 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
565 void *aux OVS_UNUSED)
567 struct netdev_dev_linux *dev;
569 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
571 const struct netdev_class *netdev_class =
572 netdev_dev_get_class(base_dev);
/* Devices of other classes are ignored; the cast below would assert. */
574 if (is_netdev_linux_class(netdev_class)) {
575 dev = netdev_dev_linux_cast(base_dev);
576 netdev_dev_linux_update(dev, change);
580 struct shash device_shash;
581 struct shash_node *node;
583 shash_init(&device_shash);
584 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
585 SHASH_FOR_EACH (node, &device_shash) {
/* NOTE(review): the result of get_flags() is not checked on this line. */
590 get_flags(&dev->netdev_dev, &flags);
591 netdev_dev_linux_changed(dev, flags, 0);
593 shash_destroy(&device_shash);
/* Takes one reference on the shared rtnetlink link notifier, creating it
 * (with netdev_linux_cache_cb() as callback) when the reference count is
 * zero.  Callers treat a nonzero return value as an error; the value returned
 * when creation fails is not visible in this excerpt. */
598 cache_notifier_ref(void)
600 if (!cache_notifier_refcount) {
/* A zero count and an existing notifier would mean the bookkeeping is
 * out of sync. */
601 assert(!netdev_linux_cache_notifier);
603 netdev_linux_cache_notifier =
604 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
606 if (!netdev_linux_cache_notifier) {
610 cache_notifier_refcount++;
/* Drops one reference on the shared rtnetlink link notifier and destroys the
 * notifier when the count reaches zero.  Asserts that a reference is actually
 * held. */
616 cache_notifier_unref(void)
618 assert(cache_notifier_refcount > 0);
619 if (!--cache_notifier_refcount) {
620 assert(netdev_linux_cache_notifier);
621 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
/* Reset so that the next cache_notifier_ref() creates a fresh notifier. */
622 netdev_linux_cache_notifier = NULL;
626 /* Creates system and internal devices. */
/* Allocates a zeroed struct netdev_dev_linux for 'name' with class 'class',
 * after taking a reference on the shared link notifier, and stores the
 * generic part in '*netdev_devp'.  The error handling after
 * cache_notifier_ref() and the return statement are not visible in this
 * excerpt. */
628 netdev_linux_create(const struct netdev_class *class, const char *name,
629 struct netdev_dev **netdev_devp)
631 struct netdev_dev_linux *netdev_dev;
634 error = cache_notifier_ref();
639 netdev_dev = xzalloc(sizeof *netdev_dev);
640 netdev_dev->change_seq = 1;
641 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
/* Seed 'ifi_flags' with the device's current flags.  The result of
 * get_flags() is not checked on this line. */
642 get_flags(&netdev_dev->netdev_dev, &netdev_dev->ifi_flags);
644 *netdev_devp = &netdev_dev->netdev_dev;
648 /* For most types of netdevs we open the device for each call of
649 * netdev_open(). However, this is not the case with tap devices,
650 * since it is only possible to open the device once. In this
651 * situation we share a single file descriptor, and consequently
652 * buffers, across all readers. Therefore once data is read it will
653 * be unavailable to other reads for tap devices. */
/* Creates the tap device 'name': opens "/dev/net/tun", issues TUNSETIFF with
 * IFF_TAP | IFF_NO_PI, makes the descriptor non-blocking and registers the
 * device with class netdev_tap_class ('class' itself is unused).  On success
 * '*netdev_devp' receives the new device.
 *
 * NOTE(review): the only cleanup visible on the failure path is dropping the
 * notifier reference.  Confirm that 'state->fd' is closed and 'netdev_dev'
 * is freed when TUNSETIFF or set_nonblocking() fails; those lines are not
 * visible in this excerpt. */
655 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
656 const char *name, struct netdev_dev **netdev_devp)
658 struct netdev_dev_linux *netdev_dev;
659 struct tap_state *state;
660 static const char tap_dev[] = "/dev/net/tun";
664 netdev_dev = xzalloc(sizeof *netdev_dev);
665 state = &netdev_dev->state.tap;
667 error = cache_notifier_ref();
672 /* Open tap device. */
673 state->fd = open(tap_dev, O_RDWR);
676 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
677 goto error_unref_notifier;
680 /* Create tap device. */
681 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
/* ovs_strzcpy() is given the destination size so the name cannot overrun
 * 'ifr.ifr_name'. */
682 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
683 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
684 VLOG_WARN("%s: creating tap device failed: %s", name,
687 goto error_unref_notifier;
690 /* Make non-blocking. */
691 error = set_nonblocking(state->fd);
693 goto error_unref_notifier;
696 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
697 *netdev_devp = &netdev_dev->netdev_dev;
700 error_unref_notifier:
701 cache_notifier_unref();
/* Releases the tap-specific state of 'netdev_dev'.  It acts only when
 * 'state->fd' is non-negative; the action itself is not visible in this
 * excerpt (presumably the descriptor is closed -- TODO confirm). */
708 destroy_tap(struct netdev_dev_linux *netdev_dev)
710 struct tap_state *state = &netdev_dev->state.tap;
712 if (state->fd >= 0) {
717 /* Destroys the netdev device 'netdev_dev_'. */
/* In the order visible here: the traffic-control state is destroyed through
 * its 'tc_destroy' callback when both the state and the callback exist, tap
 * devices additionally go through destroy_tap(), and finally the reference
 * on the shared link notifier taken at creation is dropped. */
719 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
721 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
722 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
/* 'tc_destroy' is documented as optional, hence the second test. */
724 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
725 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
728 if (class == &netdev_tap_class) {
729 destroy_tap(netdev_dev);
733 cache_notifier_unref();
/* Opens a new handle on 'netdev_dev_' and stores it in '*netdevp'.
 *
 * For every class except netdev_internal_class the device's flags are read
 * to check that it exists; ENODEV from that read is singled out (the handling
 * is not visible in this excerpt).  The first handle opened on a "tap" device
 * receives the tap file descriptor.  The trailing netdev_uninit() call
 * belongs to a failure path whose label is not visible here. */
737 netdev_linux_open(struct netdev_dev *netdev_dev_, struct netdev **netdevp)
739 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
740 struct netdev_linux *netdev;
741 enum netdev_flags flags;
744 /* Allocate network device. */
745 netdev = xzalloc(sizeof *netdev);
747 netdev_init(&netdev->netdev, netdev_dev_);
749 /* Verify that the device really exists, by attempting to read its flags.
750 * (The flags might be cached, in which case this won't actually do an
753 * Don't do this for "internal" netdevs, though, because those have to be
754 * created as netdev objects before they exist in the kernel, because
755 * creating them in the kernel happens by passing a netdev object to
756 * dpif_port_add(). */
757 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
758 error = netdev_get_flags(&netdev->netdev, &flags);
759 if (error == ENODEV) {
/* 'opened' tracks whether the tap fd has already been handed out. */
764 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
765 !netdev_dev->state.tap.opened) {
767 /* We assume that the first user of the tap device is the primary user
768 * and give them the tap FD. Subsequent users probably just expect
769 * this to be a system device so open it normally to avoid send/receive
770 * directions appearing to be reversed. */
771 netdev->fd = netdev_dev->state.tap.fd;
772 netdev_dev->state.tap.opened = true;
775 *netdevp = &netdev->netdev;
779 netdev_uninit(&netdev->netdev, true);
783 /* Closes and destroys 'netdev'. */
785 netdev_linux_close(struct netdev *netdev_)
787 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* strcmp() is nonzero for every type other than "tap", so the body (not
 * visible in this excerpt) is skipped for handles on tap devices, whose
 * descriptor is the shared one stored in the device's tap state.
 * NOTE(review): this tests 'fd > 0' while the other functions in this file
 * test 'fd >= 0' or 'fd < 0'; confirm whether excluding descriptor 0 is
 * intentional. */
789 if (netdev->fd > 0 && strcmp(netdev_get_type(netdev_), "tap")) {
/* Prepares 'netdev_' for receiving packets by creating a raw PF_PACKET
 * socket, making it non-blocking and binding it to the device's ifindex for
 * all protocols (ETH_P_ALL).
 *
 * A handle whose 'fd' is already non-negative is treated specially by the
 * first test, presumably returning early because it is already listening;
 * the body, the error paths and the final assignment of the new descriptor
 * are not visible in this excerpt. */
796 netdev_linux_listen(struct netdev *netdev_)
798 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
799 struct sockaddr_ll sll;
804 if (netdev->fd >= 0) {
808 /* Create file descriptor. */
809 fd = socket(PF_PACKET, SOCK_RAW, 0);
812 VLOG_ERR("failed to create raw socket (%s)", strerror(error));
816 /* Set non-blocking mode. */
817 error = set_nonblocking(fd);
822 /* Get ethernet device index. */
823 error = get_ifindex(&netdev->netdev, &ifindex);
828 /* Bind to specific ethernet device. */
829 memset(&sll, 0, sizeof sll);
830 sll.sll_family = AF_PACKET;
831 sll.sll_ifindex = ifindex;
/* 'sll_protocol' is in network byte order, hence htons(). */
832 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
833 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
835 VLOG_ERR("%s: failed to bind raw socket (%s)",
836 netdev_get_name(netdev_), strerror(error));
/* Receives one packet from 'netdev_' into the 'size' bytes at 'data'.
 *
 * Devices of class netdev_tap_class are read with read(); every other device
 * uses recv() with MSG_TRUNC, which makes recv() report the packet's real
 * length, so a packet longer than 'size' yields -EMSGSIZE instead of being
 * passed on truncated.  A successful call returns the packet length.
 *
 * EINTR is not treated as an error and EAGAIN is not logged.  The value
 * returned for a handle that is not listening ('fd' < 0) and for other errors
 * is not visible in this excerpt. */
851 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
853 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
855 if (netdev->fd < 0) {
856 /* Device is not listening. */
863 retval = (netdev_->netdev_dev->netdev_class == &netdev_tap_class
864 ? read(netdev->fd, data, size)
865 : recv(netdev->fd, data, size, MSG_TRUNC));
867 return retval <= size ? retval : -EMSGSIZE;
868 } else if (errno != EINTR) {
869 if (errno != EAGAIN) {
/* Arguments follow the format string: device name first, then the error
 * text, matching the equivalent message in netdev_linux_send(). */
870 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
871 netdev_get_name(netdev_), strerror(errno));
878 /* Registers with the poll loop to wake up from the next call to poll_block()
879 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
/* Nothing is registered when the handle has no descriptor ('fd' < 0). */
881 netdev_linux_recv_wait(struct netdev *netdev_)
883 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
884 if (netdev->fd >= 0) {
885 poll_fd_wait(netdev->fd, POLLIN);
889 /* Discards all packets waiting to be received from 'netdev'. */
/* Three cases are visible: a handle without a descriptor ('fd' < 0), whose
 * handling is not visible in this excerpt; a "tap" device, which is drained
 * with drain_fd() using the queue length obtained from SIOCGIFTXQLEN; and
 * any other device, whose socket receive buffer is drained with
 * drain_rcvbuf(). */
891 netdev_linux_drain(struct netdev *netdev_)
893 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
894 if (netdev->fd < 0) {
896 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
898 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
899 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
903 drain_fd(netdev->fd, ifr.ifr_qlen);
906 return drain_rcvbuf(netdev->fd);
910 /* Sends the 'size' bytes at 'data' on 'netdev_'. Returns 0 if successful, otherwise a positive
911 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
912 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
913 * the packet is too big or too small to transmit on the device.
915 * The caller retains ownership of 'data' in all cases.
917 * The kernel maintains a packet transmission queue, so the caller is not
918 * expected to do additional queuing of packets. */
920 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
922 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* Two transmit paths: a handle without its own descriptor goes through the
 * AF_PACKET socket returned by af_packet_sock(); a handle with a descriptor
 * writes to that descriptor directly. */
926 if (netdev->fd < 0) {
927 /* Use our AF_PACKET socket to send to this device. */
928 struct sockaddr_ll sll;
935 sock = af_packet_sock();
940 error = get_ifindex(netdev_, &ifindex);
945 /* We don't bother setting most fields in sockaddr_ll because the
946 * kernel ignores them for SOCK_RAW. */
947 memset(&sll, 0, sizeof sll);
948 sll.sll_family = AF_PACKET;
949 sll.sll_ifindex = ifindex;
/* 'iov_base' is a non-const pointer, so the const qualifier of 'data' has
 * to be cast away here. */
951 iov.iov_base = (void *) data;
955 msg.msg_namelen = sizeof sll;
958 msg.msg_control = NULL;
959 msg.msg_controllen = 0;
962 retval = sendmsg(sock, &msg, 0);
964 /* Use the netdev's own fd to send to this device. This is
965 * essential for tap devices, because packets sent to a tap device
966 * with an AF_PACKET socket will loop back to be *received* again
967 * on the tap device. */
968 retval = write(netdev->fd, data, size);
972 /* The Linux AF_PACKET implementation never blocks waiting for room
973 * for packets, instead returning ENOBUFS. Translate this into
974 * EAGAIN for the caller. */
975 if (errno == ENOBUFS) {
977 } else if (errno == EINTR) {
/* EAGAIN is the one remaining errno that is not logged. */
979 } else if (errno != EAGAIN) {
980 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
981 netdev_get_name(netdev_), strerror(errno));
/* A short write is logged as a partial packet. */
984 } else if (retval != size) {
985 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
986 "%zu) on %s", retval, size, netdev_get_name(netdev_));
994 /* Registers with the poll loop to wake up from the next call to poll_block()
995 * when the packet transmission queue has sufficient room to transmit a packet
996 * with netdev_send().
998 * The kernel maintains a packet transmission queue, so the client is not
999 * expected to do additional queuing of packets. Thus, this function is
1000 * unlikely to ever be used. It is included for completeness. */
1002 netdev_linux_send_wait(struct netdev *netdev_)
1004 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1005 if (netdev->fd < 0) {
1006 /* Nothing to do. */
/* strcmp() is nonzero for non-tap devices: those wait for POLLOUT on their
 * descriptor, while tap devices wake the poll loop immediately. */
1007 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
1008 poll_fd_wait(netdev->fd, POLLOUT);
1010 /* TAP device always accepts packets.*/
1011 poll_immediate_wake();
1015 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1016 * otherwise a positive errno value. */
/* The cached address is consulted first: a cached error is returned as is,
 * and a cached address equal to 'mac' avoids the system call (the statement
 * in that branch is not visible in this excerpt).  After the attempt, the
 * outcome is cached only when it is success or ENODEV. */
1018 netdev_linux_set_etheraddr(struct netdev *netdev_,
1019 const uint8_t mac[ETH_ADDR_LEN])
1021 struct netdev_dev_linux *netdev_dev =
1022 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1025 if (netdev_dev->cache_valid & VALID_ETHERADDR) {
1026 if (netdev_dev->ether_addr_error) {
1027 return netdev_dev->ether_addr_error;
1029 if (eth_addr_equals(netdev_dev->etheraddr, mac)) {
/* The address is about to change, so the cached copy is no longer valid. */
1032 netdev_dev->cache_valid &= ~VALID_ETHERADDR;
1035 error = set_etheraddr(netdev_get_name(netdev_), mac);
1036 if (!error || error == ENODEV) {
1037 netdev_dev->ether_addr_error = error;
1038 netdev_dev->cache_valid |= VALID_ETHERADDR;
1040 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
1047 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
/* Returns the cached error code, which is 0 on success.  The address is
 * queried with get_etheraddr() only when VALID_ETHERADDR is not set; both
 * the address and the error code of that query are then cached.  'mac' is
 * written only when the cached error code is 0. */
1049 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1050 uint8_t mac[ETH_ADDR_LEN])
1052 struct netdev_dev_linux *netdev_dev =
1053 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1055 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
1056 int error = get_etheraddr(netdev_get_name(netdev_),
1057 netdev_dev->etheraddr);
1059 netdev_dev->ether_addr_error = error;
1060 netdev_dev->cache_valid |= VALID_ETHERADDR;
1063 if (!netdev_dev->ether_addr_error) {
1064 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
1067 return netdev_dev->ether_addr_error;
1070 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1071 * in bytes, not including the hardware header; thus, this is typically 1500
1072 * bytes for Ethernet devices. */
/* The MTU is stored in '*mtup' and the return value is the cached error code
 * (0 on success); '*mtup' is left untouched when that code is nonzero.  The
 * SIOCGIFMTU ioctl is issued only when VALID_MTU is not set, and its result
 * and error code are then cached. */
1074 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1076 struct netdev_dev_linux *netdev_dev =
1077 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1078 if (!(netdev_dev->cache_valid & VALID_MTU)) {
1082 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1083 SIOCGIFMTU, "SIOCGIFMTU");
1085 netdev_dev->netdev_mtu_error = error;
1086 netdev_dev->mtu = ifr.ifr_mtu;
1087 netdev_dev->cache_valid |= VALID_MTU;
1090 if (!netdev_dev->netdev_mtu_error) {
1091 *mtup = netdev_dev->mtu;
1093 return netdev_dev->netdev_mtu_error;
1096 /* Sets the maximum transmission unit (MTU) of 'netdev_' to 'mtu' using the
1097 * Linux networking ioctl interface (SIOCSIFMTU). */
1100 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1102 struct netdev_dev_linux *netdev_dev =
1103 netdev_dev_linux_cast(netdev_get_dev(netdev_));
/* Consult the cache first: a cached error is returned unchanged, and a
 * cached MTU equal to 'mtu' needs no ioctl (the statement in that branch is
 * not visible in this excerpt). */
1107 if (netdev_dev->cache_valid & VALID_MTU) {
1108 if (netdev_dev->netdev_mtu_error) {
1109 return netdev_dev->netdev_mtu_error;
1111 if (netdev_dev->mtu == mtu) {
/* The MTU is about to change, so the cached value is no longer valid. */
1114 netdev_dev->cache_valid &= ~VALID_MTU;
1117 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1118 SIOCSIFMTU, "SIOCSIFMTU");
/* Only success and ENODEV are cached; after any other error the cache stays
 * invalid. */
1119 if (!error || error == ENODEV) {
1120 netdev_dev->netdev_mtu_error = error;
1121 netdev_dev->mtu = ifr.ifr_mtu;
1122 netdev_dev->cache_valid |= VALID_MTU;
1127 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1128 * On failure, returns a negative errno value. */
1130 netdev_linux_get_ifindex(const struct netdev *netdev)
1134 error = get_ifindex(netdev, &ifindex);
/* get_ifindex() reports a positive errno value; negate it so that the sign
 * of the result distinguishes an error from an ifindex. */
1135 return error ? -error : ifindex;
/* Stores the carrier status of 'netdev_' in '*carrier'.
 *
 * When MII monitoring is enabled (positive 'miimon_interval') the status is
 * the result of the last MII poll; otherwise it is the IFF_RUNNING bit of
 * the cached interface flags.  The return statement is not visible in this
 * excerpt. */
1139 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1141 struct netdev_dev_linux *netdev_dev =
1142 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1144 if (netdev_dev->miimon_interval > 0) {
1145 *carrier = netdev_dev->miimon;
1147 *carrier = (netdev_dev->ifi_flags & IFF_RUNNING) != 0;
/* Returns the 'carrier_resets' counter of the device behind 'netdev', i.e.
 * the number of times netdev_dev_linux_changed() has seen IFF_RUNNING
 * toggle. */
1153 static long long int
1154 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1156 return netdev_dev_linux_cast(netdev_get_dev(netdev))->carrier_resets;
/* Issues the MII ioctl 'cmd' (named 'cmd_name' for logging) on device 'name'.
 *
 * '*data' is copied into the storage of 'ifr.ifr_data' before the ioctl and
 * copied back out of it afterwards, so the caller sees whatever the ioctl
 * left there.  The copy-back on this line is unconditional. */
1160 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1161 struct mii_ioctl_data *data)
1166 memset(&ifr, 0, sizeof ifr);
/* Note '&ifr.ifr_data': the bytes of '*data' are stored in place of the
 * pointer member itself rather than being pointed to by it. */
1167 memcpy(&ifr.ifr_data, data, sizeof *data);
1168 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1169 memcpy(data, &ifr.ifr_data, sizeof *data);
/* Determines the link status of device 'name' and stores it in '*miimon'.
 *
 * The MII interface is tried first: SIOCGMIIPHY, then SIOCGMIIREG on the
 * MII_BMSR register, where the BMSR_LSTATUS bit gives the link state.  The
 * second half falls back to the ethtool ETHTOOL_GLINK request.  The
 * conditions connecting these steps, and the value left in '*miimon' when
 * everything fails, are not visible in this excerpt. */
1175 netdev_linux_get_miimon(const char *name, bool *miimon)
1177 struct mii_ioctl_data data;
1182 memset(&data, 0, sizeof data);
1183 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1185 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1186 data.reg_num = MII_BMSR;
1187 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1191 *miimon = !!(data.val_out & BMSR_LSTATUS);
1193 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1196 struct ethtool_cmd ecmd;
1198 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1201 memset(&ecmd, 0, sizeof ecmd);
1202 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1205 struct ethtool_value eval;
/* netdev_linux_do_ethtool() works on a struct ethtool_cmd; the leading
 * bytes of the reply are reinterpreted as a struct ethtool_value via
 * memcpy() rather than a pointer cast. */
1207 memcpy(&eval, &ecmd, sizeof eval);
1208 *miimon = !!eval.data;
1210 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
/* Sets the MII monitoring interval of 'netdev_' to 'interval'.
 *
 * A positive 'interval' is raised to at least 100; any other value becomes 0
 * (netdev_linux_get_carrier() treats a non-positive interval as "monitoring
 * off").  When the effective value differs from the stored one, it is saved
 * and the miimon timer is marked expired. */
1218 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1219 long long int interval)
1221 struct netdev_dev_linux *netdev_dev;
1223 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
1225 interval = interval > 0 ? MAX(interval, 100) : 0;
1226 if (netdev_dev->miimon_interval != interval) {
1227 netdev_dev->miimon_interval = interval;
1228 timer_set_expired(&netdev_dev->miimon_timer);
/* Polls MII link status for the devices of netdev_linux_class.
 *
 * The test on miimon_interval and the miimon timer selects devices whose
 * monitoring is disabled or whose timer has not yet expired (presumably these
 * are skipped; the statement under the test is not visible here).  For the
 * others, a changed link state is recorded and reported through
 * netdev_dev_linux_changed(), and the timer is re-armed with the device's
 * interval. */
1235 netdev_linux_miimon_run(void)
1237 struct shash device_shash;
1238 struct shash_node *node;
1240 shash_init(&device_shash);
1241 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1242 SHASH_FOR_EACH (node, &device_shash) {
1243 struct netdev_dev_linux *dev = node->data;
1246 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
/* The return value of netdev_linux_get_miimon() is not examined here. */
1250 netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
1251 if (miimon != dev->miimon) {
1252 dev->miimon = miimon;
1253 netdev_dev_linux_changed(dev, dev->ifi_flags, 0);
1256 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1259 shash_destroy(&device_shash);
/* Registers a wait on the miimon timer of every netdev_linux_class device
 * that has MII monitoring enabled (miimon_interval > 0). */
1263 netdev_linux_miimon_wait(void)
1265 struct shash device_shash;
1266 struct shash_node *node;
1268 shash_init(&device_shash);
1269 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1270 SHASH_FOR_EACH (node, &device_shash) {
1271 struct netdev_dev_linux *dev = node->data;
1273 if (dev->miimon_interval > 0) {
1274 timer_wait(&dev->miimon_timer);
1277 shash_destroy(&device_shash);
1280 /* Check whether we can use RTM_GETLINK to get network device statistics.
1281 * In pre-2.6.19 kernels, this was only available if wireless extensions were
 * (the rest of this sentence is not present in this listing).
 *
 * The probe looks up the ifindex of "lo" and tries get_stats_via_netlink()
 * on it; the log messages below describe the outcome of each step. */
1284 check_for_working_netlink_stats(void)
1286 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1287 * preferable, so if that works, we'll use it. */
1288 int ifindex = do_get_ifindex("lo");
1290 VLOG_WARN("failed to get ifindex for lo, "
1291 "obtaining netdev stats from proc");
1294 struct netdev_stats stats;
1295 int error = get_stats_via_netlink(ifindex, &stats);
1297 VLOG_DBG("obtaining netdev stats via rtnetlink");
1300 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1301 "via proc (you are probably running a pre-2.6.19 "
1302 "kernel)", strerror(error));
/* Presumably exchanges the 64-bit values at 'a' and 'b'; the body is not
 * visible in this listing -- confirm against the full source. */
1309 swap_uint64(uint64_t *a, uint64_t *b)
/* Fetches statistics for 'netdev_' from the vport layer into '*stats'.
 *
 * The query is attempted when no vport error is recorded or when the
 * VALID_VPORT_STAT_ERROR cache bit is not set, so a cached failure suppresses
 * further attempts.  The outcome is stored in 'vport_stats_error' and the
 * cache bit is set; callers read 'vport_stats_error' to learn the result. */
1317 get_stats_via_vport(const struct netdev *netdev_,
1318 struct netdev_stats *stats)
1320 struct netdev_dev_linux *netdev_dev =
1321 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1323 if (!netdev_dev->vport_stats_error ||
1324 !(netdev_dev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1327 error = netdev_vport_get_stats(netdev_, stats);
1329 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1330 "(%s)", netdev_get_name(netdev_), strerror(error));
1332 netdev_dev->vport_stats_error = error;
1333 netdev_dev->cache_valid |= VALID_VPORT_STAT_ERROR;
/* Retrieves statistics for 'netdev_' from the kernel into '*stats'.
 *
 * The choice between get_stats_via_netlink() and get_stats_via_proc() is made
 * once, on first use, by check_for_working_netlink_stats() and remembered in
 * the function-local static 'use_netlink_stats' (-1 means "not decided"). */
1338 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1339 struct netdev_stats *stats)
1341 static int use_netlink_stats = -1;
1344 if (use_netlink_stats < 0) {
1345 use_netlink_stats = check_for_working_netlink_stats();
1348 if (use_netlink_stats) {
1351 error = get_ifindex(netdev_, &ifindex);
1353 error = get_stats_via_netlink(ifindex, stats);
1356 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1360 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1361 netdev_get_name(netdev_), error);
1367 /* Retrieves current device stats for 'netdev-linux'. */
/* Two sources are queried: the vport layer fills '*stats' and the kernel
 * fills the local 'dev_stats'.  Which source wins depends on
 * 'vport_stats_error' (the statements under those tests are not visible
 * here).  The kernel's error, drop, multicast and collision counters are
 * added on top of the values already in '*stats'. */
1369 netdev_linux_get_stats(const struct netdev *netdev_,
1370 struct netdev_stats *stats)
1372 struct netdev_dev_linux *netdev_dev =
1373 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1374 struct netdev_stats dev_stats;
1377 get_stats_via_vport(netdev_, stats);
1379 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1382 if (netdev_dev->vport_stats_error) {
1389 if (netdev_dev->vport_stats_error) {
1390 /* stats not available from OVS then use ioctl stats. */
1393 stats->rx_errors += dev_stats.rx_errors;
1394 stats->tx_errors += dev_stats.tx_errors;
1395 stats->rx_dropped += dev_stats.rx_dropped;
1396 stats->tx_dropped += dev_stats.tx_dropped;
1397 stats->multicast += dev_stats.multicast;
1398 stats->collisions += dev_stats.collisions;
1399 stats->rx_length_errors += dev_stats.rx_length_errors;
1400 stats->rx_over_errors += dev_stats.rx_over_errors;
1401 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1402 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1403 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1404 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1405 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1406 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1407 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1408 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1409 stats->tx_window_errors += dev_stats.tx_window_errors;
1414 /* Retrieves current device stats for 'netdev-tap' netdev or
1415 * netdev-internal. */
/* As with the netdev-linux variant, the vport layer fills '*stats' and the
 * kernel fills 'dev_stats'.  The visible statements fall into two groups: one
 * swaps the rx/tx counters held in '*stats' and zeroes the detailed error
 * counters, the other adds the kernel's counters with rx and tx crossed over.
 * Which group runs depends on 'vport_stats_error'; the branch lines that
 * separate them are not visible here. */
1417 netdev_tap_get_stats(const struct netdev *netdev_,
1418 struct netdev_stats *stats)
1420 struct netdev_dev_linux *netdev_dev =
1421 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1422 struct netdev_stats dev_stats;
1425 get_stats_via_vport(netdev_, stats);
1427 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1429 if (netdev_dev->vport_stats_error) {
1436 /* If this port is an internal port then the transmit and receive stats
1437 * will appear to be swapped relative to the other ports since we are the
1438 * one sending the data, not a remote computer. For consistency, we swap
1439 * them back here. This does not apply if we are getting stats from the
1440 * vport layer because it always tracks stats from the perspective of the
 * (the end of this sentence is not present in this listing). */
1442 if (netdev_dev->vport_stats_error) {
1444 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1445 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1446 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1447 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1448 stats->rx_length_errors = 0;
1449 stats->rx_over_errors = 0;
1450 stats->rx_crc_errors = 0;
1451 stats->rx_frame_errors = 0;
1452 stats->rx_fifo_errors = 0;
1453 stats->rx_missed_errors = 0;
1454 stats->tx_aborted_errors = 0;
1455 stats->tx_carrier_errors = 0;
1456 stats->tx_fifo_errors = 0;
1457 stats->tx_heartbeat_errors = 0;
1458 stats->tx_window_errors = 0;
/* Kernel counters are folded in crosswise: device tx feeds our rx and
 * vice versa. */
1460 stats->rx_dropped += dev_stats.tx_dropped;
1461 stats->tx_dropped += dev_stats.rx_dropped;
1463 stats->rx_errors += dev_stats.tx_errors;
1464 stats->tx_errors += dev_stats.rx_errors;
1466 stats->multicast += dev_stats.multicast;
1467 stats->collisions += dev_stats.collisions;
/* Retrieves stats for an internal device from the vport layer only.
 * Returns the error recorded in 'vport_stats_error' by
 * get_stats_via_vport(). */
1473 netdev_internal_get_stats(const struct netdev *netdev_,
1474 struct netdev_stats *stats)
1476 struct netdev_dev_linux *netdev_dev =
1477 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1479 get_stats_via_vport(netdev_, stats);
1480 return netdev_dev->vport_stats_error;
/* Refreshes the feature bitmaps cached in 'netdev_dev' from ETHTOOL_GSET.
 *
 * The VALID_FEATURES cache bit is tested first (presumably to return early
 * when the cache is current; that statement is not visible here).  The
 * ethtool "supported" and "advertising" masks are translated bit by bit into
 * NETDEV_F_* values, 'current' is derived from speed, duplex and port, and
 * 'peer' is always 0.  The ethtool result is stored in 'get_features_error'
 * and VALID_FEATURES is set. */
1484 netdev_linux_read_features(struct netdev_dev_linux *netdev_dev)
1486 struct ethtool_cmd ecmd;
1490 if (netdev_dev->cache_valid & VALID_FEATURES) {
1494 memset(&ecmd, 0, sizeof ecmd);
1495 error = netdev_linux_do_ethtool(netdev_dev->netdev_dev.name, &ecmd,
1496 ETHTOOL_GSET, "ETHTOOL_GSET");
1501 /* Supported features. */
1502 netdev_dev->supported = 0;
1503 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1504 netdev_dev->supported |= NETDEV_F_10MB_HD;
1506 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1507 netdev_dev->supported |= NETDEV_F_10MB_FD;
1509 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1510 netdev_dev->supported |= NETDEV_F_100MB_HD;
1512 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1513 netdev_dev->supported |= NETDEV_F_100MB_FD;
1515 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1516 netdev_dev->supported |= NETDEV_F_1GB_HD;
1518 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1519 netdev_dev->supported |= NETDEV_F_1GB_FD;
1521 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1522 netdev_dev->supported |= NETDEV_F_10GB_FD;
1524 if (ecmd.supported & SUPPORTED_TP) {
1525 netdev_dev->supported |= NETDEV_F_COPPER;
1527 if (ecmd.supported & SUPPORTED_FIBRE) {
1528 netdev_dev->supported |= NETDEV_F_FIBER;
1530 if (ecmd.supported & SUPPORTED_Autoneg) {
1531 netdev_dev->supported |= NETDEV_F_AUTONEG;
1533 if (ecmd.supported & SUPPORTED_Pause) {
1534 netdev_dev->supported |= NETDEV_F_PAUSE;
1536 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1537 netdev_dev->supported |= NETDEV_F_PAUSE_ASYM;
1540 /* Advertised features. */
1541 netdev_dev->advertised = 0;
1542 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1543 netdev_dev->advertised |= NETDEV_F_10MB_HD;
1545 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1546 netdev_dev->advertised |= NETDEV_F_10MB_FD;
1548 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1549 netdev_dev->advertised |= NETDEV_F_100MB_HD;
1551 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1552 netdev_dev->advertised |= NETDEV_F_100MB_FD;
1554 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1555 netdev_dev->advertised |= NETDEV_F_1GB_HD;
1557 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1558 netdev_dev->advertised |= NETDEV_F_1GB_FD;
1560 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1561 netdev_dev->advertised |= NETDEV_F_10GB_FD;
1563 if (ecmd.advertising & ADVERTISED_TP) {
1564 netdev_dev->advertised |= NETDEV_F_COPPER;
1566 if (ecmd.advertising & ADVERTISED_FIBRE) {
1567 netdev_dev->advertised |= NETDEV_F_FIBER;
1569 if (ecmd.advertising & ADVERTISED_Autoneg) {
1570 netdev_dev->advertised |= NETDEV_F_AUTONEG;
1572 if (ecmd.advertising & ADVERTISED_Pause) {
1573 netdev_dev->advertised |= NETDEV_F_PAUSE;
1575 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1576 netdev_dev->advertised |= NETDEV_F_PAUSE_ASYM;
1579 /* Current settings. */
/* NOTE(review): the assignment to 'speed' is not visible here.  If it reads
 * only the 16-bit 'speed' member of struct ethtool_cmd, the 100000 and
 * 1000000 comparisons below can never match; confirm whether 'speed_hi' is
 * taken into account. */
1581 if (speed == SPEED_10) {
1582 netdev_dev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1583 } else if (speed == SPEED_100) {
1584 netdev_dev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1585 } else if (speed == SPEED_1000) {
1586 netdev_dev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1587 } else if (speed == SPEED_10000) {
1588 netdev_dev->current = NETDEV_F_10GB_FD;
1589 } else if (speed == 40000) {
1590 netdev_dev->current = NETDEV_F_40GB_FD;
1591 } else if (speed == 100000) {
1592 netdev_dev->current = NETDEV_F_100GB_FD;
1593 } else if (speed == 1000000) {
1594 netdev_dev->current = NETDEV_F_1TB_FD;
1596 netdev_dev->current = 0;
1599 if (ecmd.port == PORT_TP) {
1600 netdev_dev->current |= NETDEV_F_COPPER;
1601 } else if (ecmd.port == PORT_FIBRE) {
1602 netdev_dev->current |= NETDEV_F_FIBER;
1606 netdev_dev->current |= NETDEV_F_AUTONEG;
1609 /* Peer advertisements. */
1610 netdev_dev->peer = 0; /* XXX */
1613 netdev_dev->cache_valid |= VALID_FEATURES;
1614 netdev_dev->get_features_error = error;
1617 /* Stores the features supported by 'netdev' into each of '*current',
1618 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1619 * bitmap of NETDEV_* bits. Returns 0 if successful, otherwise a positive
 * (the end of this sentence is not present in this listing).
 *
 * The values come from the cache maintained by netdev_linux_read_features();
 * the outputs are written only when no error is cached.
 *
 * NOTE(review): despite "that are non-null" above, all four pointers are
 * written through unconditionally below -- confirm that every caller passes
 * non-null pointers. */
1622 netdev_linux_get_features(const struct netdev *netdev_,
1623 uint32_t *current, uint32_t *advertised,
1624 uint32_t *supported, uint32_t *peer)
1626 struct netdev_dev_linux *netdev_dev =
1627 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1629 netdev_linux_read_features(netdev_dev);
1631 if (!netdev_dev->get_features_error) {
1632 *current = netdev_dev->current;
1633 *advertised = netdev_dev->advertised;
1634 *supported = netdev_dev->supported;
1635 *peer = netdev_dev->peer;
1637 return netdev_dev->get_features_error;
1640 /* Set the features advertised by 'netdev' to 'advertise'. */
/* The current settings are read with ETHTOOL_GSET, the 'advertising' mask is
 * rebuilt from scratch out of the NETDEV_F_* bits in 'advertise', and the
 * structure is written back with ETHTOOL_SSET, whose result is returned. */
1642 netdev_linux_set_advertisements(struct netdev *netdev,
1643 enum netdev_features advertise)
1645 struct ethtool_cmd ecmd;
1648 memset(&ecmd, 0, sizeof ecmd);
1649 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1650 ETHTOOL_GSET, "ETHTOOL_GSET");
1655 ecmd.advertising = 0;
1656 if (advertise & NETDEV_F_10MB_HD) {
1657 ecmd.advertising |= ADVERTISED_10baseT_Half;
1659 if (advertise & NETDEV_F_10MB_FD) {
1660 ecmd.advertising |= ADVERTISED_10baseT_Full;
1662 if (advertise & NETDEV_F_100MB_HD) {
1663 ecmd.advertising |= ADVERTISED_100baseT_Half;
1665 if (advertise & NETDEV_F_100MB_FD) {
1666 ecmd.advertising |= ADVERTISED_100baseT_Full;
1668 if (advertise & NETDEV_F_1GB_HD) {
1669 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1671 if (advertise & NETDEV_F_1GB_FD) {
1672 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1674 if (advertise & NETDEV_F_10GB_FD) {
1675 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1677 if (advertise & NETDEV_F_COPPER) {
1678 ecmd.advertising |= ADVERTISED_TP;
1680 if (advertise & NETDEV_F_FIBER) {
1681 ecmd.advertising |= ADVERTISED_FIBRE;
1683 if (advertise & NETDEV_F_AUTONEG) {
1684 ecmd.advertising |= ADVERTISED_Autoneg;
1686 if (advertise & NETDEV_F_PAUSE) {
1687 ecmd.advertising |= ADVERTISED_Pause;
1689 if (advertise & NETDEV_F_PAUSE_ASYM) {
1690 ecmd.advertising |= ADVERTISED_Asym_Pause;
1692 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1693 ETHTOOL_SSET, "ETHTOOL_SSET");
1696 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1697 * successful, otherwise a positive errno value. */
/* 'kbits_burst' is normalized first (0 when no rate is given, 1000 when a
 * rate is given without a burst).  While the VALID_POLICING cache bit is set,
 * a cached error is returned as-is and the cached rate/burst pair is compared
 * with the request.  The cache bit is cleared before the ingress qdisc is
 * removed and re-created and the policer is added; the result is cached again
 * only on success or ENODEV. */
1699 netdev_linux_set_policing(struct netdev *netdev,
1700 uint32_t kbits_rate, uint32_t kbits_burst)
1702 struct netdev_dev_linux *netdev_dev =
1703 netdev_dev_linux_cast(netdev_get_dev(netdev));
1704 const char *netdev_name = netdev_get_name(netdev);
1708 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1709 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1710 : kbits_burst); /* Stick with user-specified value. */
1712 if (netdev_dev->cache_valid & VALID_POLICING) {
1713 if (netdev_dev->netdev_policing_error) {
1714 return netdev_dev->netdev_policing_error;
1717 if (netdev_dev->kbits_rate == kbits_rate &&
1718 netdev_dev->kbits_burst == kbits_burst) {
1719 /* Assume that settings haven't changed since we last set them. */
1722 netdev_dev->cache_valid &= ~VALID_POLICING;
1725 COVERAGE_INC(netdev_set_policing);
1726 /* Remove any existing ingress qdisc. */
1727 error = tc_add_del_ingress_qdisc(netdev, false);
1729 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1730 netdev_name, strerror(error));
1735 error = tc_add_del_ingress_qdisc(netdev, true);
1737 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1738 netdev_name, strerror(error));
1742 error = tc_add_policer(netdev, kbits_rate, kbits_burst);
1744 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1745 netdev_name, strerror(error));
1750 netdev_dev->kbits_rate = kbits_rate;
1751 netdev_dev->kbits_burst = kbits_burst;
1754 if (!error || error == ENODEV) {
1755 netdev_dev->netdev_policing_error = error;
1756 netdev_dev->cache_valid |= VALID_POLICING;
/* Adds to 'types' the OVS name of every entry in the null-terminated 'tcs'
 * table that has a 'tc_install' callback and a non-empty 'ovs_name'.
 * 'netdev' is unused. */
1762 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1765 const struct tc_ops **opsp;
1767 for (opsp = tcs; *opsp != NULL; opsp++) {
1768 const struct tc_ops *ops = *opsp;
1769 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1770 sset_add(types, ops->ovs_name);
/* Searches the null-terminated 'tcs' table for the entry whose 'ovs_name'
 * equals 'name'.  (The return statements are not visible here; presumably
 * the match, or NULL when there is none.) */
1776 static const struct tc_ops *
1777 tc_lookup_ovs_name(const char *name)
1779 const struct tc_ops **opsp;
1781 for (opsp = tcs; *opsp != NULL; opsp++) {
1782 const struct tc_ops *ops = *opsp;
1783 if (!strcmp(name, ops->ovs_name)) {
/* Searches the null-terminated 'tcs' table for the entry whose 'linux_name'
 * equals 'name'.  Entries with a null 'linux_name' never match.  (The return
 * statements are not visible here; presumably the match, or NULL when there
 * is none.) */
1790 static const struct tc_ops *
1791 tc_lookup_linux_name(const char *name)
1793 const struct tc_ops **opsp;
1795 for (opsp = tcs; *opsp != NULL; opsp++) {
1796 const struct tc_ops *ops = *opsp;
1797 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Looks for the queue numbered 'queue_id' in the 'queues' hmap of the tc
 * attached to 'netdev', scanning only the bucket selected by 'hash'.  The
 * caller supplies 'hash' for 'queue_id'.  Dereferences 'netdev_dev->tc'
 * without a visible null check. */
1804 static struct tc_queue *
1805 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1808 struct netdev_dev_linux *netdev_dev =
1809 netdev_dev_linux_cast(netdev_get_dev(netdev));
1810 struct tc_queue *queue;
1812 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1813 if (queue->queue_id == queue_id) {
/* Convenience form of tc_find_queue__() that computes the hash of 'queue_id'
 * itself, using hash_int() with basis 0. */
1820 static struct tc_queue *
1821 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1823 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Reports in '*caps' the capabilities of the QoS type looked up by OVS name:
 * 'n_queues' is copied from the matching tc_ops.  'netdev' is unused.  (The
 * handling of an unknown type is not visible here.) */
1827 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1829 struct netdev_qos_capabilities *caps)
1831 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1835 caps->n_queues = ops->n_queues;
/* Queries the qdisc on 'netdev', stores the OVS name of its type in '*typep'
 * and, when the type provides a 'qdisc_get' callback, lets that callback fill
 * 'details'.  (The value returned when there is no callback is not visible
 * here.) */
1840 netdev_linux_get_qos(const struct netdev *netdev,
1841 const char **typep, struct shash *details)
1843 struct netdev_dev_linux *netdev_dev =
1844 netdev_dev_linux_cast(netdev_get_dev(netdev));
1847 error = tc_query_qdisc(netdev);
1852 *typep = netdev_dev->tc->ops->ovs_name;
1853 return (netdev_dev->tc->ops->qdisc_get
1854 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
/* Configures QoS type 'type' with 'details' on 'netdev'.
 *
 * 'type' must name a tc_ops that has a 'tc_install' callback.  When the
 * device already uses that type, only 'qdisc_set' is invoked (0 is returned
 * if the type has no such callback).  The remaining statements delete the
 * existing qdisc and install the new one.  The assertions document the
 * expected contract: deletion leaves 'tc' null, and 'tc_install' leaves it
 * non-null exactly when it succeeds. */
1859 netdev_linux_set_qos(struct netdev *netdev,
1860 const char *type, const struct shash *details)
1862 struct netdev_dev_linux *netdev_dev =
1863 netdev_dev_linux_cast(netdev_get_dev(netdev));
1864 const struct tc_ops *new_ops;
1867 new_ops = tc_lookup_ovs_name(type);
1868 if (!new_ops || !new_ops->tc_install) {
1872 error = tc_query_qdisc(netdev);
1877 if (new_ops == netdev_dev->tc->ops) {
1878 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1880 /* Delete existing qdisc. */
1881 error = tc_del_qdisc(netdev);
1885 assert(netdev_dev->tc == NULL);
1887 /* Install new qdisc. */
1888 error = new_ops->tc_install(netdev, details);
1889 assert((error == 0) == (netdev_dev->tc != NULL));
/* Retrieves the configuration of queue 'queue_id' on 'netdev' into 'details'
 * by querying the qdisc, locating the queue and calling the type's
 * 'class_get' callback.  (The conditions guarding these steps are not
 * visible here.) */
1896 netdev_linux_get_queue(const struct netdev *netdev,
1897 unsigned int queue_id, struct shash *details)
1899 struct netdev_dev_linux *netdev_dev =
1900 netdev_dev_linux_cast(netdev_get_dev(netdev));
1903 error = tc_query_qdisc(netdev);
1907 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1909 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
/* Configures queue 'queue_id' on 'netdev' from 'details' through the type's
 * 'class_set' callback.  A 'queue_id' at or beyond the type's 'n_queues', or
 * a type without 'class_set', is handled by a separate branch whose result
 * is not visible here. */
1915 netdev_linux_set_queue(struct netdev *netdev,
1916 unsigned int queue_id, const struct shash *details)
1918 struct netdev_dev_linux *netdev_dev =
1919 netdev_dev_linux_cast(netdev_get_dev(netdev));
1922 error = tc_query_qdisc(netdev);
1925 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1926 || !netdev_dev->tc->ops->class_set) {
1930 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
/* Deletes queue 'queue_id' from 'netdev' through the type's 'class_delete'
 * callback.  A type without that callback is handled by a separate branch
 * whose result is not visible here. */
1934 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1936 struct netdev_dev_linux *netdev_dev =
1937 netdev_dev_linux_cast(netdev_get_dev(netdev));
1940 error = tc_query_qdisc(netdev);
1943 } else if (!netdev_dev->tc->ops->class_delete) {
1946 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1948 ? netdev_dev->tc->ops->class_delete(netdev, queue)
/* Retrieves statistics for queue 'queue_id' on 'netdev' into '*stats'
 * through the type's 'class_get_stats' callback.  A type without that
 * callback is handled by a separate branch whose result is not visible
 * here. */
1954 netdev_linux_get_queue_stats(const struct netdev *netdev,
1955 unsigned int queue_id,
1956 struct netdev_queue_stats *stats)
1958 struct netdev_dev_linux *netdev_dev =
1959 netdev_dev_linux_cast(netdev_get_dev(netdev));
1962 error = tc_query_qdisc(netdev);
1965 } else if (!netdev_dev->tc->ops->class_get_stats) {
1968 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1970 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
/* Starts a Netlink dump of the traffic classes on 'netdev': builds an
 * RTM_GETTCLASS request with parent 0, hands it to nl_dump_start() on
 * 'rtnl_sock', and releases the request buffer.  The caller iterates over
 * 'dump' and finishes it.  (The return statements are not visible here; the
 * caller treats a false result as failure to start.) */
1976 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1978 struct ofpbuf request;
1979 struct tcmsg *tcmsg;
1981 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1985 tcmsg->tcm_parent = 0;
1986 nl_dump_start(dump, rtnl_sock, &request);
1987 ofpbuf_uninit(&request);
/* Invokes 'cb' with the id and details of the queues known for 'netdev',
 * passing 'aux' through.  Iterates over the tc's 'queues' hmap, reusing one
 * shash (cleared before each 'class_get' call and destroyed at the end).  A
 * type without 'class_get' is handled by a separate branch whose result is
 * not visible here. */
1992 netdev_linux_dump_queues(const struct netdev *netdev,
1993 netdev_dump_queues_cb *cb, void *aux)
1995 struct netdev_dev_linux *netdev_dev =
1996 netdev_dev_linux_cast(netdev_get_dev(netdev));
1997 struct tc_queue *queue;
1998 struct shash details;
2002 error = tc_query_qdisc(netdev);
2005 } else if (!netdev_dev->tc->ops->class_get) {
2010 shash_init(&details);
2011 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
2012 shash_clear(&details);
2014 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
2016 (*cb)(queue->queue_id, &details, aux);
2021 shash_destroy(&details);
/* Dumps per-queue statistics for 'netdev' by walking a Netlink class dump and
 * handing each message to the type's 'class_dump_stats' callback along with
 * 'cb' and 'aux'.  An error from nl_dump_done() takes precedence over
 * 'last_error' in the return value.  A type without 'class_dump_stats' is
 * handled by a separate branch whose result is not visible here. */
2027 netdev_linux_dump_queue_stats(const struct netdev *netdev,
2028 netdev_dump_queue_stats_cb *cb, void *aux)
2030 struct netdev_dev_linux *netdev_dev =
2031 netdev_dev_linux_cast(netdev_get_dev(netdev));
2032 struct nl_dump dump;
2037 error = tc_query_qdisc(netdev);
2040 } else if (!netdev_dev->tc->ops->class_dump_stats) {
2045 if (!start_queue_dump(netdev, &dump)) {
2048 while (nl_dump_next(&dump, &msg)) {
2049 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
2055 error = nl_dump_done(&dump);
2056 return error ? error : last_error;
/* Stores the IPv4 address and netmask of 'netdev_' in '*address' and
 * '*netmask'.
 *
 * When the VALID_IN4 cache bit is not set, both values are fetched with
 * SIOCGIFADDR and SIOCGIFNETMASK into the cache first.  Returns
 * EADDRNOTAVAIL when the resulting address is INADDR_ANY, otherwise 0 on this
 * path. */
2060 netdev_linux_get_in4(const struct netdev *netdev_,
2061 struct in_addr *address, struct in_addr *netmask)
2063 struct netdev_dev_linux *netdev_dev =
2064 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2066 if (!(netdev_dev->cache_valid & VALID_IN4)) {
2069 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
2070 SIOCGIFADDR, "SIOCGIFADDR");
2075 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
2076 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2081 netdev_dev->cache_valid |= VALID_IN4;
2083 *address = netdev_dev->address;
2084 *netmask = netdev_dev->netmask;
2085 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
/* Assigns IPv4 'address' and 'netmask' to 'netdev_' with SIOCSIFADDR and
 * SIOCSIFNETMASK.  The netmask ioctl is issued only for an address other
 * than INADDR_ANY.
 *
 * NOTE(review): the cache is marked valid and loaded with 'netmask' before
 * the netmask ioctl is issued, so a failure there would leave a netmask in
 * the cache that the device does not have -- confirm whether that matters. */
2089 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2090 struct in_addr netmask)
2092 struct netdev_dev_linux *netdev_dev =
2093 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2096 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2098 netdev_dev->cache_valid |= VALID_IN4;
2099 netdev_dev->address = address;
2100 netdev_dev->netmask = netmask;
2101 if (address.s_addr != INADDR_ANY) {
2102 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2103 "SIOCSIFNETMASK", netmask);
/* Parses one text line into the 16 bytes of '*in6' and the interface name
 * 'ifname'; netdev_linux_get_in6() feeds it lines of /proc/net/if_inet6.
 *
 * The format reads 16 two-digit hex bytes, skips four hex fields, and reads
 * a name of at most 16 characters, which fits the 16 + 1 byte 'ifname'
 * buffer.  (The scanning call and the test on its result are not visible
 * here.) */
2110 parse_if_inet6_line(const char *line,
2111 struct in6_addr *in6, char ifname[16 + 1])
2113 uint8_t *s6 = in6->s6_addr;
2114 #define X8 "%2"SCNx8
2116 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2117 "%*x %*x %*x %*x %16s\n",
2118 &s6[0], &s6[1], &s6[2], &s6[3],
2119 &s6[4], &s6[5], &s6[6], &s6[7],
2120 &s6[8], &s6[9], &s6[10], &s6[11],
2121 &s6[12], &s6[13], &s6[14], &s6[15],
2125 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2126 * 'in6' is non-null) and returns true. Otherwise, returns false. */
/* When the VALID_IN6 cache bit is not set, the cached address is reset to
 * in6addr_any and /proc/net/if_inet6 is scanned line by line for an entry
 * whose interface name equals this device's name; the cache bit is then set.
 *
 * NOTE(review): the comment above says "if 'in6' is non-null", but the store
 * to '*in6' below is unconditional; the return statement is not visible here,
 * so the "true/false" wording could not be checked either -- confirm both. */
2128 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2130 struct netdev_dev_linux *netdev_dev =
2131 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2132 if (!(netdev_dev->cache_valid & VALID_IN6)) {
2136 netdev_dev->in6 = in6addr_any;
2138 file = fopen("/proc/net/if_inet6", "r");
2140 const char *name = netdev_get_name(netdev_);
2141 while (fgets(line, sizeof line, file)) {
2142 struct in6_addr in6_tmp;
2143 char ifname[16 + 1];
2144 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2145 && !strcmp(name, ifname))
2147 netdev_dev->in6 = in6_tmp;
2153 netdev_dev->cache_valid |= VALID_IN6;
2155 *in6 = netdev_dev->in6;
/* Fills '*sa' with an AF_INET socket address for 'addr'.  A zeroed
 * sockaddr_in is built locally and then copied over the zeroed generic
 * sockaddr. */
2160 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2162 struct sockaddr_in sin;
2163 memset(&sin, 0, sizeof sin);
2164 sin.sin_family = AF_INET;
2165 sin.sin_addr = addr;
2168 memset(sa, 0, sizeof *sa);
2169 memcpy(sa, &sin, sizeof sin);
/* Issues address-setting ioctl 'ioctl_nr' on 'netdev' with 'addr' as the
 * IPv4 address in the request, and returns the result of
 * netdev_linux_do_ioctl().  (The use of 'ioctl_name' falls on a line that is
 * not visible here.) */
2173 do_set_addr(struct netdev *netdev,
2174 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2177 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2178 make_in4_sockaddr(&ifr.ifr_addr, addr);
2180 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2184 /* Adds 'router' as a default IP gateway. */
/* Builds a route entry with destination and mask INADDR_ANY and gateway
 * 'router', flagged RTF_UP | RTF_GATEWAY, and installs it with SIOCADDRT on
 * 'af_inet_sock'.  'netdev' is unused. */
2186 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2188 struct in_addr any = { INADDR_ANY };
2192 memset(&rt, 0, sizeof rt);
2193 make_in4_sockaddr(&rt.rt_dst, any);
2194 make_in4_sockaddr(&rt.rt_gateway, router);
2195 make_in4_sockaddr(&rt.rt_genmask, any);
2196 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2197 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2199 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
/* Looks up the route to '*host' in /proc/net/route.
 *
 * '*netdev_name' starts out NULL.  Each line is parsed into eleven fields;
 * routes without RTF_UP are skipped.  For a route whose masked destination
 * equals the masked host address, '*next_hop' is set (0 for a directly
 * reachable host, the gateway otherwise) and '*netdev_name' receives a
 * newly allocated copy of the interface name. */
2205 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2208 static const char fn[] = "/proc/net/route";
2213 *netdev_name = NULL;
2214 stream = fopen(fn, "r");
2215 if (stream == NULL) {
2216 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2221 while (fgets(line, sizeof line, stream)) {
2224 ovs_be32 dest, gateway, mask;
2225 int refcnt, metric, mtu;
2226 unsigned int flags, use, window, irtt;
2229 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2231 iface, &dest, &gateway, &flags, &refcnt,
2232 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2234 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2238 if (!(flags & RTF_UP)) {
2239 /* Skip routes that aren't up. */
2243 /* The output of 'dest', 'mask', and 'gateway' were given in
2244 * network byte order, so we don't need any endian
2245 * conversions here. */
2246 if ((dest & mask) == (host->s_addr & mask)) {
2248 /* The host is directly reachable. */
2249 next_hop->s_addr = 0;
2251 /* To reach the host, we must go through a gateway. */
2252 next_hop->s_addr = gateway;
2254 *netdev_name = xstrdup(iface);
/* Adds driver information for 'netdev' to 'sh' under the keys
 * "driver_name", "driver_version" and "firmware_version".  The values are
 * newly allocated copies of the strings obtained by
 * netdev_linux_get_drvinfo(). */
2266 netdev_linux_get_status(const struct netdev *netdev, struct shash *sh)
2269 struct netdev_dev_linux *netdev_dev =
2270 netdev_dev_linux_cast(netdev_get_dev(netdev));
2272 error = netdev_linux_get_drvinfo(netdev_dev);
2274 shash_add(sh, "driver_name", xstrdup(netdev_dev->drvinfo.driver));
2275 shash_add(sh, "driver_version", xstrdup(netdev_dev->drvinfo.version));
2276 shash_add(sh, "firmware_version", xstrdup(netdev_dev->drvinfo.fw_version));
/* Status for internal devices: adds a fixed "driver_name" of "openvswitch"
 * (as a newly allocated string) to 'sh'.  'netdev' is unused. */
2282 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED, struct shash *sh)
2284 shash_add(sh, "driver_name", xstrdup("openvswitch"));
2288 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2289 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2290 * returns 0. Otherwise, it returns a positive errno value; in particular,
2291 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
/* The request is an arpreq with protocol address 'ip' (AF_INET), hardware
 * family ARPHRD_ETHER and the device name of 'netdev', submitted with
 * SIOCGARP on 'af_inet_sock'.  Failures other than ENXIO are logged. */
2293 netdev_linux_arp_lookup(const struct netdev *netdev,
2294 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2297 struct sockaddr_in sin;
2300 memset(&r, 0, sizeof r);
2301 memset(&sin, 0, sizeof sin);
2302 sin.sin_family = AF_INET;
2303 sin.sin_addr.s_addr = ip;
2305 memcpy(&r.arp_pa, &sin, sizeof sin);
2306 r.arp_ha.sa_family = ARPHRD_ETHER;
2308 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2309 COVERAGE_INC(netdev_arp_lookup);
2310 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2312 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2313 } else if (retval != ENXIO) {
2314 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2315 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
/* Translates netdev flags 'nd' for use as interface (IFF_*) flags; the
 * caller combines the result with 'ifi_flags'.  NETDEV_UP and NETDEV_PROMISC
 * are examined (the statements that set the corresponding bits are not
 * visible here). */
2321 nd_to_iff_flags(enum netdev_flags nd)
2324 if (nd & NETDEV_UP) {
2327 if (nd & NETDEV_PROMISC) {
/* Translates interface flags 'iff' into netdev flags.  IFF_PROMISC maps to
 * NETDEV_PROMISC; any other mappings are on lines not visible here. */
2334 iff_to_nd_flags(int iff)
2336 enum netdev_flags nd = 0;
2340 if (iff & IFF_PROMISC) {
2341 nd |= NETDEV_PROMISC;
/* Turns the flags in 'off' off and the flags in 'on' on for 'netdev',
 * storing the previous flags in '*old_flagsp'.
 *
 * The previous flags come from the cached 'ifi_flags'.  The device is
 * touched only when the computed flags differ from the cached ones; in that
 * case the cache is re-read with get_flags() after set_flags(). */
2347 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2348 enum netdev_flags on, enum netdev_flags *old_flagsp)
2350 struct netdev_dev_linux *netdev_dev;
2351 int old_flags, new_flags;
2354 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2355 old_flags = netdev_dev->ifi_flags;
2356 *old_flagsp = iff_to_nd_flags(old_flags);
2357 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2358 if (new_flags != old_flags) {
2359 error = set_flags(netdev, new_flags);
2360 get_flags(&netdev_dev->netdev_dev, &netdev_dev->ifi_flags);
/* Returns the 'change_seq' value stored in the netdev_dev_linux that backs
 * 'netdev'. */
2366 netdev_linux_change_seq(const struct netdev *netdev)
2368 return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
/* Template for the 'struct netdev_class' definitions that follow.  Most
 * entries are the netdev_linux_* functions listed here; the macro parameters
 * (NAME, CREATE, GET_STATS, SET_STATS, GET_FEATURES, GET_STATUS) are
 * presumably substituted on lines that are not visible in this listing.
 * The order of the entries must match the member order of the class
 * structure. */
2371 #define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS, \
2372 GET_FEATURES, GET_STATUS) \
2376 netdev_linux_init, \
2378 netdev_linux_wait, \
2381 netdev_linux_destroy, \
2382 NULL, /* get_config */ \
2383 NULL, /* set_config */ \
2385 netdev_linux_open, \
2386 netdev_linux_close, \
2388 netdev_linux_listen, \
2389 netdev_linux_recv, \
2390 netdev_linux_recv_wait, \
2391 netdev_linux_drain, \
2393 netdev_linux_send, \
2394 netdev_linux_send_wait, \
2396 netdev_linux_set_etheraddr, \
2397 netdev_linux_get_etheraddr, \
2398 netdev_linux_get_mtu, \
2399 netdev_linux_set_mtu, \
2400 netdev_linux_get_ifindex, \
2401 netdev_linux_get_carrier, \
2402 netdev_linux_get_carrier_resets, \
2403 netdev_linux_set_miimon_interval, \
2408 netdev_linux_set_advertisements, \
2410 netdev_linux_set_policing, \
2411 netdev_linux_get_qos_types, \
2412 netdev_linux_get_qos_capabilities, \
2413 netdev_linux_get_qos, \
2414 netdev_linux_set_qos, \
2415 netdev_linux_get_queue, \
2416 netdev_linux_set_queue, \
2417 netdev_linux_delete_queue, \
2418 netdev_linux_get_queue_stats, \
2419 netdev_linux_dump_queues, \
2420 netdev_linux_dump_queue_stats, \
2422 netdev_linux_get_in4, \
2423 netdev_linux_set_in4, \
2424 netdev_linux_get_in6, \
2425 netdev_linux_add_router, \
2426 netdev_linux_get_next_hop, \
2428 netdev_linux_arp_lookup, \
2430 netdev_linux_update_flags, \
2432 netdev_linux_change_seq \
/* Class for ordinary Linux network devices: created by
 * netdev_linux_create(), stats from netdev_linux_get_stats(), no set_stats,
 * features and status read from the device. */
2435 const struct netdev_class netdev_linux_class =
2438 netdev_linux_create,
2439 netdev_linux_get_stats,
2440 NULL, /* set_stats */
2441 netdev_linux_get_features,
2442 netdev_linux_get_status);
/* Class for tap devices: created by netdev_linux_create_tap(), stats from
 * netdev_tap_get_stats(), no set_stats, features and status as for ordinary
 * devices. */
2444 const struct netdev_class netdev_tap_class =
2447 netdev_linux_create_tap,
2448 netdev_tap_get_stats,
2449 NULL, /* set_stats */
2450 netdev_linux_get_features,
2451 netdev_linux_get_status);
/* Class for internal devices: stats are read from and written to the vport
 * layer, there is no get_features, and status reports a fixed driver
 * name. */
2453 const struct netdev_class netdev_internal_class =
2456 netdev_linux_create,
2457 netdev_internal_get_stats,
2458 netdev_vport_set_stats,
2459 NULL, /* get_features */
2460 netdev_internal_get_status);
2462 /* HTB traffic control class. */
/* Presumably the number of queues the HTB implementation exposes (compare
 * the 'n_queues' member of tc_ops) -- confirm at the point of use. */
2464 #define HTB_N_QUEUES 0xf000
/* Member of the HTB qdisc state (presumably 'struct htb'; the enclosing
 * lines are not visible in this listing). */
2468 unsigned int max_rate; /* In bytes/s. */
/* Per-queue HTB parameters (presumably the members of 'struct htb_class',
 * the type taken by htb_setup_class__(); the enclosing lines are not visible
 * in this listing).  The embedded tc_queue comes first. */
2472 struct tc_queue tc_queue;
2473 unsigned int min_rate; /* In bytes/s. */
2474 unsigned int max_rate; /* In bytes/s. */
2475 unsigned int burst; /* In bytes. */
2476 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the 'struct htb' that embeds the tc currently attached to
 * 'netdev'.  No check is made here that the attached tc is in fact an HTB
 * one. */
2480 htb_get__(const struct netdev *netdev)
2482 struct netdev_dev_linux *netdev_dev =
2483 netdev_dev_linux_cast(netdev_get_dev(netdev));
2484 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
/* Allocates an HTB tc object with 'max_rate', initializes it with
 * 'tc_ops_htb' and attaches it to 'netdev' as its tc.  Only the in-memory
 * state is set up here; no kernel request is made in the visible lines.
 *
 * NOTE(review): 'max_rate' arrives as uint64_t; if the destination member is
 * narrower, large values are truncated by the assignment -- confirm. */
2488 htb_install__(struct netdev *netdev, uint64_t max_rate)
2490 struct netdev_dev_linux *netdev_dev =
2491 netdev_dev_linux_cast(netdev_get_dev(netdev));
2494 htb = xmalloc(sizeof *htb);
2495 tc_init(&htb->tc, &tc_ops_htb);
2496 htb->max_rate = max_rate;
2498 netdev_dev->tc = &htb->tc;
2501 /* Create an HTB qdisc.
2503 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
/* Any existing qdisc is deleted first (the result of that call is not
 * examined).  The RTM_NEWQDISC request uses handle 1:0 under TC_H_ROOT, kind
 * "htb", and a nested TCA_OPTIONS attribute carrying a tc_htb_glob with
 * rate2quantum 10 (other members are set on lines not visible here).
 * Returns the result of tc_transact(). */
2505 htb_setup_qdisc__(struct netdev *netdev)
2508 struct tc_htb_glob opt;
2509 struct ofpbuf request;
2510 struct tcmsg *tcmsg;
2512 tc_del_qdisc(netdev);
2514 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2515 NLM_F_EXCL | NLM_F_CREATE, &request);
2519 tcmsg->tcm_handle = tc_make_handle(1, 0);
2520 tcmsg->tcm_parent = TC_H_ROOT;
2522 nl_msg_put_string(&request, TCA_KIND, "htb");
2524 memset(&opt, 0, sizeof opt);
2525 opt.rate2quantum = 10;
2529 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2530 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2531 nl_msg_end_nested(&request, opt_offset);
2533 return tc_transact(&request, NULL);
2536 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2537 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
/* The device MTU is needed to fill in the rate tables and buffer sizes, so a
 * device without one is reported with a warning.  The RTM_NEWTCLASS request
 * carries the tc_htb_opt as TCA_HTB_PARMS plus the rate and ceil tables;
 * a failed transaction is logged with the full class parameters. */
2539 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2540 unsigned int parent, struct htb_class *class)
2543 struct tc_htb_opt opt;
2544 struct ofpbuf request;
2545 struct tcmsg *tcmsg;
2549 error = netdev_get_mtu(netdev, &mtu);
2551 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2552 netdev_get_name(netdev));
2556 memset(&opt, 0, sizeof opt);
2557 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2558 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
/* The same 'burst' is used for both the rate and the ceil buffers. */
2559 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2560 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2561 opt.prio = class->priority;
2563 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2567 tcmsg->tcm_handle = handle;
2568 tcmsg->tcm_parent = parent;
2570 nl_msg_put_string(&request, TCA_KIND, "htb");
2571 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2572 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2573 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2574 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2575 nl_msg_end_nested(&request, opt_offset);
2577 error = tc_transact(&request, NULL);
2579 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2580 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2581 netdev_get_name(netdev),
2582 tc_get_major(handle), tc_get_minor(handle),
2583 tc_get_major(parent), tc_get_minor(parent),
2584 class->min_rate, class->max_rate,
2585 class->burst, class->priority, strerror(error));
2590 /* Parses Netlink attributes in 'nl_options' for HTB parameters and stores a
2591 * description of them into 'class'. The description complies with the
2592 * specification given in the vswitch database documentation for linux-htb. */
/* Decodes the nested HTB attributes in 'nl_options' into 'class'.
 * TCA_HTB_PARMS is mandatory and must hold at least a struct tc_htb_opt. */
2595 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2597 static const struct nl_policy tca_htb_policy[] = {
2598 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2599 .min_len = sizeof(struct tc_htb_opt) },
2602 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2603 const struct tc_htb_opt *htb;
2605 if (!nl_parse_nested(nl_options, tca_htb_policy,
2606 attrs, ARRAY_SIZE(tca_htb_policy))) {
2607 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
/* rate -> min_rate, ceil -> max_rate, prio -> priority.  'buffer' is
 * converted back to a byte count for 'burst' using the 'rate' value. */
2611 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2612 class->min_rate = htb->rate.rate;
2613 class->max_rate = htb->ceil.rate;
2614 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2615 class->priority = htb->prio;
/* Parses class message 'tcmsg' with tc_parse_class().  On success, if
 * 'queue_id' is nonnull and the handle is 1:<minor> with <minor> in
 * 1...HTB_N_QUEUES, stores <minor> - 1 in '*queue_id'; if 'options' is nonnull,
 * decodes the HTB parameters into it.  'stats' is handed straight to
 * tc_parse_class(). */
2620 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2621 struct htb_class *options,
2622 struct netdev_queue_stats *stats)
2624 struct nlattr *nl_options;
2625 unsigned int handle;
2628 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2629 if (!error && queue_id) {
2630 unsigned int major = tc_get_major(handle);
2631 unsigned int minor = tc_get_minor(handle);
/* Queue IDs are zero-based while class minors start at 1. */
2632 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2633 *queue_id = minor - 1;
2638 if (!error && options) {
2639 error = htb_parse_tca_options__(nl_options, options);
/* Fills 'hc' with the parameters of the HTB root class from 'details'.
 *
 * "max-rate" is given in bits/s and stored in bytes/s (hence the division by
 * 8).  When it is absent or evaluates to 0, the rate is derived from the
 * device's current features instead.  min_rate is set equal to max_rate. */
2645 htb_parse_qdisc_details__(struct netdev *netdev,
2646 const struct shash *details, struct htb_class *hc)
2648 const char *max_rate_s;
2650 max_rate_s = shash_find_data(details, "max-rate");
2651 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2652 if (!hc->max_rate) {
/* Pass the address of 'current', the variable consumed on the next line, so
 * that netdev_get_features() can fill it in. */
2655 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2656 hc->max_rate = netdev_features_to_bps(current) / 8;
2658 hc->min_rate = hc->max_rate;
/* Fills 'hc' from the per-queue settings in 'details' ("min-rate",
 * "max-rate", "burst", "priority").  The numeric strings are divided by 8 on
 * the way in (bits -> bytes) except for "priority".  Rates are clamped against
 * the device MTU and the qdisc-wide htb->max_rate as shown below. */
2664 htb_parse_class_details__(struct netdev *netdev,
2665 const struct shash *details, struct htb_class *hc)
2667 const struct htb *htb = htb_get__(netdev);
2668 const char *min_rate_s = shash_find_data(details, "min-rate");
2669 const char *max_rate_s = shash_find_data(details, "max-rate");
2670 const char *burst_s = shash_find_data(details, "burst");
2671 const char *priority_s = shash_find_data(details, "priority");
2674 error = netdev_get_mtu(netdev, &mtu);
2676 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2677 netdev_get_name(netdev));
2681 /* HTB requires at least an mtu sized min-rate to send any traffic even
2682 * on uncongested links. */
2683 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2684 hc->min_rate = MAX(hc->min_rate, mtu);
2685 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
/* max_rate ends up within [min_rate, htb->max_rate]. */
2688 hc->max_rate = (max_rate_s
2689 ? strtoull(max_rate_s, NULL, 10) / 8
2691 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2692 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
/* NOTE(review): the lines that follow read as the body of a block comment
 * whose opening line is not present here -- confirm against the full file. */
2696 * According to hints in the documentation that I've read, it is important
2697 * that 'burst' be at least as big as the largest frame that might be
2698 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2699 * but having it a bit too small is a problem. Since netdev_get_mtu()
2700 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2701 * the MTU. We actually add 64, instead of 14, as a guard against
2702 * additional headers get tacked on somewhere that we're not aware of. */
2703 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2704 hc->burst = MAX(hc->burst, mtu + 64);
/* A missing "priority" defaults to 0. */
2707 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Asks the kernel about class 'handle' (child of 'parent') on 'netdev' and
 * decodes the reply into 'options' and 'stats' via htb_parse_tcmsg__(); no
 * queue ID is requested.  The reply buffer is deleted after parsing. */
2713 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2714 unsigned int parent, struct htb_class *options,
2715 struct netdev_queue_stats *stats)
2717 struct ofpbuf *reply;
2720 error = tc_query_class(netdev, handle, parent, &reply);
2722 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2723 ofpbuf_delete(reply);
/* tc_install hook for HTB.  Visible sequence: create the qdisc, derive the
 * root class parameters from 'details', create class 1:fffe under 1:0, then
 * record the resulting max_rate with htb_install__(). */
2729 htb_tc_install(struct netdev *netdev, const struct shash *details)
2733 error = htb_setup_qdisc__(netdev);
2735 struct htb_class hc;
2737 htb_parse_qdisc_details__(netdev, details, &hc);
2738 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2739 tc_make_handle(1, 0), &hc);
2741 htb_install__(netdev, hc.max_rate);
/* Converts 'queue', which must be the 'tc_queue' member of a struct
 * htb_class, to a pointer to that htb_class. */
2747 static struct htb_class *
2748 htb_class_cast__(const struct tc_queue *queue)
2750 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Records the parameters in 'hc' for queue 'queue_id' in 'netdev''s HTB
 * queue map.  An entry found by tc_find_queue__() is reused; the allocation
 * and hmap_insert() below create one otherwise. */
2754 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2755 const struct htb_class *hc)
2757 struct htb *htb = htb_get__(netdev);
2758 size_t hash = hash_int(queue_id, 0);
2759 struct tc_queue *queue;
2760 struct htb_class *hcp;
2762 queue = tc_find_queue__(netdev, queue_id, hash);
2764 hcp = htb_class_cast__(queue);
2766 hcp = xmalloc(sizeof *hcp);
2767 queue = &hcp->tc_queue;
2768 queue->queue_id = queue_id;
2769 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
/* Only the four tunables are copied; the embedded tc_queue is left alone. */
2772 hcp->min_rate = hc->min_rate;
2773 hcp->max_rate = hc->max_rate;
2774 hcp->burst = hc->burst;
2775 hcp->priority = hc->priority;
/* tc_load hook for HTB: rebuilds the in-memory state from what the kernel
 * already has configured.  'nlmsg' is unused. */
2779 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2782 struct nl_dump dump;
2783 struct htb_class hc;
2785 /* Get qdisc options. */
/* The return value of the query is not examined; whatever hc.max_rate holds
 * afterwards is what gets installed. */
2787 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2788 htb_install__(netdev, hc.max_rate);
/* Walk every class reported by the dump and cache those that parse as HTB
 * queues. */
2791 if (!start_queue_dump(netdev, &dump)) {
2794 while (nl_dump_next(&dump, &msg)) {
2795 unsigned int queue_id;
2797 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2798 htb_update_queue__(netdev, queue_id, &hc);
2801 nl_dump_done(&dump);
/* tc_destroy hook for HTB: walks the queue map of the struct htb that embeds
 * 'tc' and unlinks every htb_class from it.  The _SAFE iterator is used
 * because entries are removed during the walk. */
2807 htb_tc_destroy(struct tc *tc)
2809 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2810 struct htb_class *hc, *next;
2812 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2813 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* qdisc_get hook for HTB: adds "max-rate" to 'details' as a newly allocated
 * decimal string.  The cached value is multiplied by 8 (bytes/s -> bits/s). */
2821 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2823 const struct htb *htb = htb_get__(netdev);
2824 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
/* qdisc_set hook for HTB: re-derives the root class parameters from
 * 'details', reapplies them to class 1:fffe (parent 1:0) and stores the new
 * max_rate in the cached struct htb. */
2829 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2831 struct htb_class hc;
2834 htb_parse_qdisc_details__(netdev, details, &hc);
2835 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2836 tc_make_handle(1, 0), &hc);
2838 htb_get__(netdev)->max_rate = hc.max_rate;
/* class_get hook for HTB: reports the cached settings of 'queue' in
 * 'details'.  Rates and burst are multiplied by 8 (bytes -> bits); "max-rate"
 * is included only when it differs from "min-rate".  'netdev' is unused. */
2844 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2845 const struct tc_queue *queue, struct shash *details)
2847 const struct htb_class *hc = htb_class_cast__(queue);
2849 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2850 if (hc->min_rate != hc->max_rate) {
2851 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2853 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2855 shash_add(details, "priority", xasprintf("%u", hc->priority));
/* class_set hook for HTB: parses 'details', configures kernel class
 * 1:<queue_id + 1> under 1:fffe, then caches the settings with
 * htb_update_queue__(). */
2861 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2862 const struct shash *details)
2864 struct htb_class hc;
2867 error = htb_parse_class_details__(netdev, details, &hc);
2872 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2873 tc_make_handle(1, 0xfffe), &hc);
2878 htb_update_queue__(netdev, queue_id, &hc);
/* class_delete hook for HTB: deletes kernel class 1:<queue_id + 1> and
 * unlinks the corresponding htb_class from the cached queue map. */
2883 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2885 struct htb_class *hc = htb_class_cast__(queue);
2886 struct htb *htb = htb_get__(netdev);
2889 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2891 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* class_get_stats hook for HTB: queries class 1:<queue_id + 1> (parent
 * 1:fffe) and fills in 'stats'; the class options are not requested. */
2898 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2899 struct netdev_queue_stats *stats)
2901 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2902 tc_make_handle(1, 0xfffe), NULL, stats);
/* class_dump_stats hook for HTB: extracts the handle and statistics from one
 * dumped class message and, when the handle is 1:<minor> with <minor> in
 * 1...HTB_N_QUEUES, reports them through 'cb' as queue <minor> - 1.  'netdev'
 * is unused. */
2906 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2907 const struct ofpbuf *nlmsg,
2908 netdev_dump_queue_stats_cb *cb, void *aux)
2910 struct netdev_queue_stats stats;
2911 unsigned int handle, major, minor;
2914 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2919 major = tc_get_major(handle);
2920 minor = tc_get_minor(handle);
2921 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2922 (*cb)(minor - 1, &stats, aux);
/* Operations table for the HTB implementation: kernel qdisc name "htb",
 * OVS QoS type "linux-htb", HTB_N_QUEUES queues; the remaining visible entries
 * are the statistics hooks. */
2927 static const struct tc_ops tc_ops_htb = {
2928 "htb", /* linux_name */
2929 "linux-htb", /* ovs_name */
2930 HTB_N_QUEUES, /* n_queues */
2939 htb_class_get_stats,
2940 htb_class_dump_stats
2943 /* "linux-hfsc" traffic control class. */
/* Largest HFSC class minor accepted as a queue (queue_id = minor - 1); also
 * advertised as n_queues in tc_ops_hfsc. */
2945 #define HFSC_N_QUEUES 0xf000
/* Embedded node that hfsc_class_cast__() maps back to its hfsc_class. */
2953 struct tc_queue tc_queue;
/* Returns the struct hfsc that embeds the tc currently attached to
 * 'netdev''s device.  Only meaningful when netdev_dev->tc really is the 'tc'
 * member of a struct hfsc; nothing here checks that. */
2958 static struct hfsc *
2959 hfsc_get__(const struct netdev *netdev)
2961 struct netdev_dev_linux *netdev_dev;
2962 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2963 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
/* Converts 'queue', which must be the 'tc_queue' member of a struct
 * hfsc_class, to a pointer to that hfsc_class. */
2966 static struct hfsc_class *
2967 hfsc_class_cast__(const struct tc_queue *queue)
2969 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates a struct hfsc, initializes its tc with tc_ops_hfsc, records
 * 'max_rate' and attaches it to 'netdev''s device as netdev_dev->tc. */
2973 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2975 struct netdev_dev_linux * netdev_dev;
2978 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2979 hfsc = xmalloc(sizeof *hfsc);
2980 tc_init(&hfsc->tc, &tc_ops_hfsc);
2981 hfsc->max_rate = max_rate;
2982 netdev_dev->tc = &hfsc->tc;
/* Records the rates in 'hc' for queue 'queue_id' in 'netdev''s HFSC queue
 * map.  An entry found by tc_find_queue__() is reused; the allocation and
 * hmap_insert() below create one otherwise. */
2986 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2987 const struct hfsc_class *hc)
2991 struct hfsc_class *hcp;
2992 struct tc_queue *queue;
2994 hfsc = hfsc_get__(netdev);
2995 hash = hash_int(queue_id, 0);
2997 queue = tc_find_queue__(netdev, queue_id, hash);
2999 hcp = hfsc_class_cast__(queue);
3001 hcp = xmalloc(sizeof *hcp);
3002 queue = &hcp->tc_queue;
3003 queue->queue_id = queue_id;
3004 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
/* Only the two rates are copied; the embedded tc_queue is left alone. */
3007 hcp->min_rate = hc->min_rate;
3008 hcp->max_rate = hc->max_rate;
/* Decodes the nested HFSC attributes in 'nl_options' into 'class'.
 *
 * Three service curves (RSC, FSC, USC) are read, each required to be at
 * least a struct tc_service_curve.  Only configurations that this file itself
 * generates are accepted: every curve linear (m1 == 0 and d == 0), RSC and FSC
 * with the same m2, and RSC's m2 no greater than USC's.  On acceptance, FSC's
 * m2 becomes min_rate and USC's m2 becomes max_rate. */
3012 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3014 const struct tc_service_curve *rsc, *fsc, *usc;
3015 static const struct nl_policy tca_hfsc_policy[] = {
3017 .type = NL_A_UNSPEC,
3019 .min_len = sizeof(struct tc_service_curve),
3022 .type = NL_A_UNSPEC,
3024 .min_len = sizeof(struct tc_service_curve),
3027 .type = NL_A_UNSPEC,
3029 .min_len = sizeof(struct tc_service_curve),
3032 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3034 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3035 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3036 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3040 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3041 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3042 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
/* A nonzero m1 or d on any curve means a two-piece (non-linear) curve. */
3044 if (rsc->m1 != 0 || rsc->d != 0 ||
3045 fsc->m1 != 0 || fsc->d != 0 ||
3046 usc->m1 != 0 || usc->d != 0) {
3047 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3048 "Non-linear service curves are not supported.");
/* hfsc_setup_class__() sends the same curve for RSC and FSC, so a mismatch
 * means the class was configured by something else. */
3052 if (rsc->m2 != fsc->m2) {
3053 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3054 "Real-time service curves are not supported ");
3058 if (rsc->m2 > usc->m2) {
3059 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3060 "Min-rate service curve is greater than "
3061 "the max-rate service curve.");
3065 class->min_rate = fsc->m2;
3066 class->max_rate = usc->m2;
/* Parses class message 'tcmsg' with tc_parse_class().  A handle of the form
 * 1:<minor> with <minor> in 1...HFSC_N_QUEUES yields queue ID <minor> - 1 in
 * '*queue_id'; the class options are decoded into 'options' by
 * hfsc_parse_tca_options__().  'stats' is handed straight to
 * tc_parse_class(). */
3071 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3072 struct hfsc_class *options,
3073 struct netdev_queue_stats *stats)
3076 unsigned int handle;
3077 struct nlattr *nl_options;
3079 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3085 unsigned int major, minor;
3087 major = tc_get_major(handle);
3088 minor = tc_get_minor(handle);
/* Queue IDs are zero-based while class minors start at 1. */
3089 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3090 *queue_id = minor - 1;
3097 error = hfsc_parse_tca_options__(nl_options, options);
/* Asks the kernel about class 'handle' (child of 'parent') on 'netdev' and
 * decodes the reply into 'options' and 'stats' via hfsc_parse_tcmsg__(); no
 * queue ID is requested.  The reply buffer is deleted after parsing. */
3104 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3105 unsigned int parent, struct hfsc_class *options,
3106 struct netdev_queue_stats *stats)
3109 struct ofpbuf *reply;
3111 error = tc_query_class(netdev, handle, parent, &reply);
3116 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3117 ofpbuf_delete(reply);
/* Fills 'class' with the parameters of the HFSC root class from 'details'.
 *
 * "max-rate" is given in bits/s and converted to bytes/s (hence the division
 * by 8).  The visible fallback derives the rate from the device's current
 * features.  min_rate and max_rate are both set to the result. */
3122 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
3123 struct hfsc_class *class)
3126 const char *max_rate_s;
3128 max_rate_s = shash_find_data(details, "max-rate");
3129 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
/* Pass the address of 'current', the variable consumed on the next line, so
 * that netdev_get_features() can fill it in. */
3134 netdev_get_features(netdev, &current, NULL, NULL, NULL);
3135 max_rate = netdev_features_to_bps(current) / 8;
3138 class->min_rate = max_rate;
3139 class->max_rate = max_rate;
/* Fills 'class' from the per-queue "min-rate" and "max-rate" settings in
 * 'details' (bits/s, divided by 8 into bytes/s).  min_rate is forced to at
 * least 1 and at most the qdisc-wide hfsc->max_rate; max_rate ends up within
 * [min_rate, hfsc->max_rate]. */
3143 hfsc_parse_class_details__(struct netdev *netdev,
3144 const struct shash *details,
3145 struct hfsc_class * class)
3147 const struct hfsc *hfsc;
3148 uint32_t min_rate, max_rate;
3149 const char *min_rate_s, *max_rate_s;
3151 hfsc = hfsc_get__(netdev);
3152 min_rate_s = shash_find_data(details, "min-rate");
3153 max_rate_s = shash_find_data(details, "max-rate");
/* The strtoull() result is narrowed to uint32_t on assignment. */
3155 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3156 min_rate = MAX(min_rate, 1);
3157 min_rate = MIN(min_rate, hfsc->max_rate);
3159 max_rate = (max_rate_s
3160 ? strtoull(max_rate_s, NULL, 10) / 8
3162 max_rate = MAX(max_rate, min_rate);
3163 max_rate = MIN(max_rate, hfsc->max_rate);
3165 class->min_rate = min_rate;
3166 class->max_rate = max_rate;
3171 /* Create an HFSC qdisc.
3173 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3175 hfsc_setup_qdisc__(struct netdev * netdev)
3177 struct tcmsg *tcmsg;
3178 struct ofpbuf request;
3179 struct tc_hfsc_qopt opt;
/* Called before the new qdisc is requested; its result is not examined
 * here.  Presumably removes whatever root qdisc is present. */
3181 tc_del_qdisc(netdev);
/* NLM_F_EXCL | NLM_F_CREATE: create only, fail if one already exists. */
3183 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3184 NLM_F_EXCL | NLM_F_CREATE, &request);
/* Handle 1:0 attached at the device root. */
3190 tcmsg->tcm_handle = tc_make_handle(1, 0);
3191 tcmsg->tcm_parent = TC_H_ROOT;
3193 memset(&opt, 0, sizeof opt);
/* Unlike HTB, the HFSC qdisc options are placed directly in TCA_OPTIONS
 * rather than inside a nested attribute. */
3196 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3197 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3199 return tc_transact(&request, NULL);
3202 /* Create an HFSC class.
3204 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3205 * sc rate <min_rate> ul rate <max_rate>" */
3207 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3208 unsigned int parent, struct hfsc_class *class)
3212 struct tcmsg *tcmsg;
3213 struct ofpbuf request;
3214 struct tc_service_curve min, max;
3216 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3222 tcmsg->tcm_handle = handle;
3223 tcmsg->tcm_parent = parent;
/* 'min' carries min_rate and 'max' carries max_rate in their m2 slopes. */
3227 min.m2 = class->min_rate;
3231 max.m2 = class->max_rate;
/* 'min' is sent as both the real-time (RSC) and fair-share (FSC) curve;
 * 'max' is sent as the upper-limit (USC) curve. */
3233 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3234 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3235 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3236 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3237 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3238 nl_msg_end_nested(&request, opt_offset);
3240 error = tc_transact(&request, NULL);
/* Rate-limited diagnostic naming the class, its parent and both rates. */
3242 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3243 "min-rate %ubps, max-rate %ubps (%s)",
3244 netdev_get_name(netdev),
3245 tc_get_major(handle), tc_get_minor(handle),
3246 tc_get_major(parent), tc_get_minor(parent),
3247 class->min_rate, class->max_rate, strerror(error));
/* tc_install hook for HFSC.  Visible sequence: create the qdisc, derive the
 * root class parameters from 'details', create class 1:fffe under 1:0, then
 * record the resulting max_rate with hfsc_install__(). */
3254 hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3257 struct hfsc_class class;
3259 error = hfsc_setup_qdisc__(netdev);
3265 hfsc_parse_qdisc_details__(netdev, details, &class);
3266 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3267 tc_make_handle(1, 0), &class);
3273 hfsc_install__(netdev, class.max_rate);
/* tc_load hook for HFSC: rebuilds the in-memory state from what the kernel
 * already has configured.  'nlmsg' is unused. */
3278 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3281 struct nl_dump dump;
3282 struct hfsc_class hc;
/* The return value of the query is not examined; whatever hc.max_rate holds
 * afterwards is what gets installed. */
3285 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3286 hfsc_install__(netdev, hc.max_rate);
/* Walk every class reported by the dump and cache those that parse as HFSC
 * queues. */
3288 if (!start_queue_dump(netdev, &dump)) {
3292 while (nl_dump_next(&dump, &msg)) {
3293 unsigned int queue_id;
3295 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3296 hfsc_update_queue__(netdev, queue_id, &hc);
3300 nl_dump_done(&dump);
/* tc_destroy hook for HFSC: walks the queue map of the struct hfsc that
 * embeds 'tc' and unlinks every hfsc_class from it.  The _SAFE iterator is
 * used because entries are removed during the walk. */
3305 hfsc_tc_destroy(struct tc *tc)
3308 struct hfsc_class *hc, *next;
3310 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3312 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3313 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* qdisc_get hook for HFSC: adds "max-rate" to 'details' as a newly allocated
 * decimal string holding 8 times the cached hfsc->max_rate. */
3322 hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3324 const struct hfsc *hfsc;
3325 hfsc = hfsc_get__(netdev);
3326 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
/* qdisc_set hook for HFSC: re-derives the root class parameters from
 * 'details', reapplies them to class 1:fffe (parent 1:0) and stores the new
 * max_rate in the cached struct hfsc. */
3331 hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3334 struct hfsc_class class;
3336 hfsc_parse_qdisc_details__(netdev, details, &class);
3337 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3338 tc_make_handle(1, 0), &class);
3341 hfsc_get__(netdev)->max_rate = class.max_rate;
/* class_get hook for HFSC: reports the cached rates of 'queue' in 'details',
 * multiplied by 8.  "max-rate" is included only when it differs from
 * "min-rate".  'netdev' is unused. */
3348 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3349 const struct tc_queue *queue, struct shash *details)
3351 const struct hfsc_class *hc;
3353 hc = hfsc_class_cast__(queue);
3354 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3355 if (hc->min_rate != hc->max_rate) {
3356 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
/* class_set hook for HFSC: parses 'details', configures kernel class
 * 1:<queue_id + 1> under 1:fffe, then caches the settings with
 * hfsc_update_queue__(). */
3362 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3363 const struct shash *details)
3366 struct hfsc_class class;
3368 error = hfsc_parse_class_details__(netdev, details, &class);
3373 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3374 tc_make_handle(1, 0xfffe), &class);
3379 hfsc_update_queue__(netdev, queue_id, &class);
/* class_delete hook for HFSC: deletes kernel class 1:<queue_id + 1> and
 * unlinks the corresponding hfsc_class from the cached queue map. */
3384 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3388 struct hfsc_class *hc;
3390 hc = hfsc_class_cast__(queue);
3391 hfsc = hfsc_get__(netdev);
3393 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3395 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* class_get_stats hook for HFSC: queries class 1:<queue_id + 1> (parent
 * 1:fffe) and fills in 'stats'; the class options are not requested. */
3402 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3403 struct netdev_queue_stats *stats)
3405 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3406 tc_make_handle(1, 0xfffe), NULL, stats);
/* class_dump_stats hook for HFSC: extracts the handle and statistics from one
 * dumped class message and, when the handle is 1:<minor> with <minor> in
 * 1...HFSC_N_QUEUES, reports them through 'cb' as queue <minor> - 1.  'netdev'
 * is unused. */
3410 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3411 const struct ofpbuf *nlmsg,
3412 netdev_dump_queue_stats_cb *cb, void *aux)
3414 struct netdev_queue_stats stats;
3415 unsigned int handle, major, minor;
3418 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3423 major = tc_get_major(handle);
3424 minor = tc_get_minor(handle);
3425 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3426 (*cb)(minor - 1, &stats, aux);
/* Operations table for the HFSC implementation: kernel qdisc name "hfsc",
 * OVS QoS type "linux-hfsc", HFSC_N_QUEUES queues, and every hook provided by
 * the hfsc_* functions in this section. */
3431 static const struct tc_ops tc_ops_hfsc = {
3432 "hfsc", /* linux_name */
3433 "linux-hfsc", /* ovs_name */
3434 HFSC_N_QUEUES, /* n_queues */
3435 hfsc_tc_install, /* tc_install */
3436 hfsc_tc_load, /* tc_load */
3437 hfsc_tc_destroy, /* tc_destroy */
3438 hfsc_qdisc_get, /* qdisc_get */
3439 hfsc_qdisc_set, /* qdisc_set */
3440 hfsc_class_get, /* class_get */
3441 hfsc_class_set, /* class_set */
3442 hfsc_class_delete, /* class_delete */
3443 hfsc_class_get_stats, /* class_get_stats */
3444 hfsc_class_dump_stats /* class_dump_stats */
3447 /* "linux-default" traffic control class.
3449 * This class represents the default, unnamed Linux qdisc. It corresponds to
3450 * the "" (empty string) QoS type in the OVS database. */
/* Attaches a tc initialized with tc_ops_default to 'netdev''s device.
 *
 * 'tc' has static storage duration, so its value persists across calls.
 * NOTE(review): presumably it is allocated once and then shared by every
 * device using the default qdisc; the condition guarding the allocation is
 * not visible here -- confirm. */
3453 default_install__(struct netdev *netdev)
3455 struct netdev_dev_linux *netdev_dev =
3456 netdev_dev_linux_cast(netdev_get_dev(netdev));
3457 static struct tc *tc;
3460 tc = xmalloc(sizeof *tc);
3461 tc_init(tc, &tc_ops_default);
3463 netdev_dev->tc = tc;
/* tc_install hook for the default qdisc: ignores 'details' and delegates to
 * default_install__(). */
3467 default_tc_install(struct netdev *netdev,
3468 const struct shash *details OVS_UNUSED)
3470 default_install__(netdev);
/* tc_load hook for the default qdisc: ignores 'nlmsg' and delegates to
 * default_install__(). */
3475 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3477 default_install__(netdev);
/* Operations table for the default qdisc.  It has no kernel qdisc name, and
 * none of the destroy, qdisc or class hooks listed below is implemented. */
3481 static const struct tc_ops tc_ops_default = {
3482 NULL, /* linux_name */
3487 NULL, /* tc_destroy */
3488 NULL, /* qdisc_get */
3489 NULL, /* qdisc_set */
3490 NULL, /* class_get */
3491 NULL, /* class_set */
3492 NULL, /* class_delete */
3493 NULL, /* class_get_stats */
3494 NULL /* class_dump_stats */
3497 /* "linux-other" traffic control class. */
/* tc_load hook for "linux-other": attaches a tc initialized with tc_ops_other
 * to 'netdev''s device.  'nlmsg' is unused.
 *
 * 'tc' has static storage duration, so its value persists across calls.
 * NOTE(review): presumably allocated once and shared; the condition guarding
 * the allocation is not visible here -- confirm. */
3502 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3504 struct netdev_dev_linux *netdev_dev =
3505 netdev_dev_linux_cast(netdev_get_dev(netdev));
3506 static struct tc *tc;
3509 tc = xmalloc(sizeof *tc);
3510 tc_init(tc, &tc_ops_other);
3512 netdev_dev->tc = tc;
/* Operations table for "linux-other".  It has no kernel qdisc name and cannot
 * be installed (tc_install is NULL); none of the destroy, qdisc or class hooks
 * listed below is implemented either. */
3516 static const struct tc_ops tc_ops_other = {
3517 NULL, /* linux_name */
3518 "linux-other", /* ovs_name */
3520 NULL, /* tc_install */
3522 NULL, /* tc_destroy */
3523 NULL, /* qdisc_get */
3524 NULL, /* qdisc_set */
3525 NULL, /* class_get */
3526 NULL, /* class_set */
3527 NULL, /* class_delete */
3528 NULL, /* class_get_stats */
3529 NULL /* class_dump_stats */
3532 /* Traffic control. */
3534 /* Number of kernel "tc" ticks per second. */
/* Derived from the first three fields of /proc/net/psched as a * c / b, and
 * used by tc_ticks_to_bytes() and tc_bytes_to_ticks(). */
3535 static double ticks_per_s;
3537 /* Number of kernel "jiffies" per second. This is used for the purpose of
3538 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3539 * one jiffy's worth of data.
3541 * There are two possibilities here:
3543 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3544 * approximate range of 100 to 1024. That means that we really need to
3545 * make sure that the qdisc can buffer that much data.
3547 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3548 * has finely granular timers and there's no need to fudge additional room
3549 * for buffers. (There's no extra effort needed to implement that: the
3550 * large 'buffer_hz' is used as a divisor, so practically any number will
3551 * come out as 0 in the division. Small integer results in the case of
3552 * really high dividends won't have any real effect anyhow.) */
/* Divisor used by tc_buffer_per_jiffy(). */
3554 static unsigned int buffer_hz;
3556 /* Returns tc handle 'major':'minor'. */
/* 'major' is shifted into the upper 16 bits before TC_H_MAKE combines it
 * with 'minor'. */
3558 tc_make_handle(unsigned int major, unsigned int minor)
3560 return TC_H_MAKE(major << 16, minor);
3563 /* Returns the major number from 'handle'. */
/* Inverse of the shift in tc_make_handle(): the TC_H_MAJ part moved down by
 * 16 bits. */
3565 tc_get_major(unsigned int handle)
3567 return TC_H_MAJ(handle) >> 16;
3570 /* Returns the minor number from 'handle'. */
/* The TC_H_MIN part needs no shift. */
3572 tc_get_minor(unsigned int handle)
3574 return TC_H_MIN(handle);
/* Begins a traffic-control Netlink request for 'netdev' in 'request'.
 *
 * Looks up the device's ifindex, initializes 'request' with a 512-byte initial
 * size, appends a Netlink header of the given 'type' with NLM_F_REQUEST ORed
 * into 'flags', and appends a zeroed struct tcmsg carrying AF_UNSPEC and the
 * ifindex.  Returns a pointer to that tcmsg so the caller can complete it. */
3577 static struct tcmsg *
3578 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3579 struct ofpbuf *request)
3581 struct tcmsg *tcmsg;
3585 error = get_ifindex(netdev, &ifindex);
3590 ofpbuf_init(request, 512);
3591 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3592 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3593 tcmsg->tcm_family = AF_UNSPEC;
3594 tcmsg->tcm_ifindex = ifindex;
3595 /* Caller should fill in tcmsg->tcm_handle. */
3596 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' over rtnl_sock, storing any reply in '*replyp' as
 * nl_sock_transact() does, and then uninitializes 'request' whatever the
 * outcome, so callers must not reuse or release it afterwards. */
3602 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3604 int error = nl_sock_transact(rtnl_sock, request, replyp);
3605 ofpbuf_uninit(request);
3609 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3610 * policing configuration.
3612 * This function is equivalent to running the following when 'add' is true:
3613 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3615 * This function is equivalent to running the following when 'add' is false:
3616 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3618 * The configuration and stats may be seen with the following command:
3619 * /sbin/tc -s qdisc show dev <devname>
3621 * Returns 0 if successful, otherwise a positive errno value. */
/* Builds and sends the RTM_NEWQDISC ('add' true) or RTM_DELQDISC ('add'
 * false) request for the "ingress" qdisc with handle ffff:0 on 'netdev'. */
3624 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3626 struct ofpbuf request;
3627 struct tcmsg *tcmsg;
/* Creation is exclusive (fails if the qdisc exists); deletion needs no
 * extra flags. */
3629 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3630 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3632 tcmsg = tc_make_request(netdev, type, flags, &request);
3636 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3637 tcmsg->tcm_parent = TC_H_INGRESS;
3638 nl_msg_put_string(&request, TCA_KIND, "ingress");
/* An empty TCA_OPTIONS attribute is still included. */
3639 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3641 error = tc_transact(&request, NULL);
3643 /* If we're deleting the qdisc, don't worry about some of the
3644 * error conditions. */
3645 if (!add && (error == ENOENT || error == EINVAL)) {
3654 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size of 'kbits_burst'.
3657 * This function is equivalent to running:
3658 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3659 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3662 * The configuration and stats may be seen with the following command:
3663 * /sbin/tc -s filter show dev <devname> parent ffff:
3665 * Returns 0 if successful, otherwise a positive errno value. */
/* Installs a "basic" filter with a police action on the ingress qdisc
 * (parent ffff:) of 'netdev'.  Packets exceeding the policer are dropped
 * (TC_POLICE_SHOT).  The filter is created with priority 49 for all protocols
 * (ETH_P_ALL), and creation is exclusive. */
3668 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3670 struct tc_police tc_police;
3671 struct ofpbuf request;
3672 struct tcmsg *tcmsg;
3673 size_t basic_offset;
3674 size_t police_offset;
3678 memset(&tc_police, 0, sizeof tc_police);
3679 tc_police.action = TC_POLICE_SHOT;
3680 tc_police.mtu = mtu;
/* Convert kbit/s to bytes/s exactly: 1000 bits / 8 bits per byte = 125 bytes
 * per kilobit.  Dividing by 8 before multiplying by 1000 would round the
 * rate down to a multiple of 8 kbit/s and turn rates below 8 kbit/s into 0. */
3681 tc_fill_rate(&tc_police.rate, kbits_rate * 125, mtu);
/* The burst is expressed as the time, in ticks, needed to send
 * 'kbits_burst' * 1024 at the configured rate. */
3682 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3683 kbits_burst * 1024);
3685 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3686 NLM_F_EXCL | NLM_F_CREATE, &request);
/* tcm_info packs the filter priority (49) with the protocol in network byte
 * order. */
3690 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3691 tcmsg->tcm_info = tc_make_handle(49,
3692 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
/* TCA_OPTIONS > TCA_BASIC_POLICE > { TCA_POLICE_TBF, TCA_POLICE_RATE }. */
3694 nl_msg_put_string(&request, TCA_KIND, "basic");
3695 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3696 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3697 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3698 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3699 nl_msg_end_nested(&request, police_offset);
3700 nl_msg_end_nested(&request, basic_offset);
3702 error = tc_transact(&request, NULL);
3713 /* The values in psched are not individually very meaningful, but they are
3714 * important. The tables below show some values seen in the wild.
3718 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3719 * (Before that, there are hints that it was 1000000000.)
3721 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3725 * -----------------------------------
3726 * [1] 000c8000 000f4240 000f4240 00000064
3727 * [2] 000003e8 00000400 000f4240 3b9aca00
3728 * [3] 000003e8 00000400 000f4240 3b9aca00
3729 * [4] 000003e8 00000400 000f4240 00000064
3730 * [5] 000003e8 00000040 000f4240 3b9aca00
3731 * [6] 000003e8 00000040 000f4240 000000f9
3733 * a b c d ticks_per_s buffer_hz
3734 * ------- --------- ---------- ------------- ----------- -------------
3735 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3736 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3737 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3738 * [4] 1,000 1,024 1,000,000 100 976,562 100
3739 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3740 * [6] 1,000 64 1,000,000 249 15,625,000 249
3742 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3743 * [2] 2.6.26-1-686-bigmem from Debian lenny
3744 * [3] 2.6.26-2-sparc64 from Debian lenny
3745 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3746 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3747 * [6] 2.6.34 from kernel.org on KVM */
/* Reads the four hexadecimal fields of /proc/net/psched and derives
 * 'ticks_per_s' from the first three.  Failures to open or read the file, and
 * parameter values the code considers invalid or unexpected, are logged as
 * warnings. */
3749 static const char fn[] = "/proc/net/psched";
3750 unsigned int a, b, c, d;
3756 stream = fopen(fn, "r");
3758 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
/* All four fields must be present for the file to be usable. */
3762 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3763 VLOG_WARN("%s: read failed", fn);
3767 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3771 VLOG_WARN("%s: invalid scheduler parameters", fn);
/* Computed in double so that a * c cannot overflow an integer type. */
3775 ticks_per_s = (double) a * c / b;
3779 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3782 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3785 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3786 * rate of 'rate' bytes per second. */
3788 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
/* Form the product in floating point.  'rate' * 'ticks' easily exceeds the
 * range of unsigned int (e.g. 125,000,000 bytes/s over 125,000 ticks), and an
 * unsigned multiplication would silently wrap before the division. */
3793 return ((double) rate * ticks) / ticks_per_s;
3796 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3797 * rate of 'rate' bytes per second. */
3799 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
/* A 'rate' of 0 yields 0 rather than dividing by zero.  The cast applies to
 * 'ticks_per_s' alone, so its fractional part is dropped before the 64-bit
 * multiplication by 'size'. */
3804 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3807 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3808 * a transmission rate of 'rate' bytes per second. */
3810 tc_buffer_per_jiffy(unsigned int rate)
/* Integer division: when 'buffer_hz' exceeds 'rate' the result is 0. */
3815 return rate / buffer_hz;
3818 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3819 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3820 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3821 * stores NULL into it if it is absent.
3823 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns 'msg'.
3826 * Returns 0 if successful, otherwise a positive errno value. */
/* Attributes are parsed starting just past the Netlink header and the
 * struct tcmsg.  TCA_KIND (a string) is mandatory; TCA_OPTIONS (nested) is
 * optional. */
3828 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3829 struct nlattr **options)
3831 static const struct nl_policy tca_policy[] = {
3832 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3833 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3835 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3837 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3838 tca_policy, ta, ARRAY_SIZE(ta))) {
3839 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3844 *kind = nl_attr_get_string(ta[TCA_KIND]);
3848 *options = ta[TCA_OPTIONS];
3863 /* Given Netlink 'msg' that describes a class, extracts its class handle
3864 * (tcm_handle) into '*handlep', its TCA_OPTIONS attribute
3865 * into '*options', and its queue statistics into '*stats'. Any of the output
3866 * arguments may be null.
3868 * Returns 0 if successful, otherwise a positive errno value. */
3870 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3871 struct nlattr **options, struct netdev_queue_stats *stats)
/* Both TCA_OPTIONS and TCA_STATS2 are mandatory nested attributes. */
3873 static const struct nl_policy tca_policy[] = {
3874 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3875 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3877 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3879 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3880 tca_policy, ta, ARRAY_SIZE(ta))) {
3881 VLOG_WARN_RL(&rl, "failed to parse class message");
/* The struct tcmsg sits immediately after the Netlink header. */
3886 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3887 *handlep = tc->tcm_handle;
3891 *options = ta[TCA_OPTIONS];
3895 const struct gnet_stats_queue *gsq;
3896 struct gnet_stats_basic gsb;
/* Within TCA_STATS2, both the basic and the queue statistics are required. */
3898 static const struct nl_policy stats_policy[] = {
3899 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3900 .min_len = sizeof gsb },
3901 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3902 .min_len = sizeof *gsq },
3904 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3906 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3907 sa, ARRAY_SIZE(sa))) {
3908 VLOG_WARN_RL(&rl, "failed to parse class stats");
3912 /* Alignment issues screw up the length of struct gnet_stats_basic on
3913 * some arch/bitsize combinations. Newer versions of Linux have a
3914 * struct gnet_stats_basic_packed, but we can't depend on that. The
3915 * easiest thing to do is just to make a copy. */
3916 memset(&gsb, 0, sizeof gsb);
3917 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3918 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3919 stats->tx_bytes = gsb.bytes;
3920 stats->tx_packets = gsb.packets;
/* Dropped packets are reported as transmit errors. */
3922 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3923 stats->tx_errors = gsq->drops;
/* NOTE(review): presumably the failure path, so that callers never see
 * uninitialized statistics -- confirm against the elided lines. */
3933 memset(stats, 0, sizeof *stats);
3938 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3941 tc_query_class(const struct netdev *netdev,
3942 unsigned int handle, unsigned int parent,
3943 struct ofpbuf **replyp)
3945 struct ofpbuf request;
3946 struct tcmsg *tcmsg;
/* Build an RTM_GETTCLASS request with NLM_F_ECHO set, then fill in the
 * handle and parent that identify the class being asked about. */
3949 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3953 tcmsg->tcm_handle = handle;
3954 tcmsg->tcm_parent = parent;
/* Send the request; the reply is handed back through 'replyp'. */
3956 error = tc_transact(&request, replyp);
/* The warning prints both the class and its parent in major:minor form. */
3958 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3959 netdev_get_name(netdev),
3960 tc_get_major(handle), tc_get_minor(handle),
3961 tc_get_major(parent), tc_get_minor(parent),
3967 /* Equivalent to "tc class del dev <name> handle <handle>". */
3969 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3971 struct ofpbuf request;
3972 struct tcmsg *tcmsg;
/* Build an RTM_DELTCLASS request with no extra Netlink flags. */
3975 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
/* The class is identified by its handle alone; the parent is left as 0. */
3979 tcmsg->tcm_handle = handle;
3980 tcmsg->tcm_parent = 0;
/* No reply buffer is requested (NULL); only the error code is kept. */
3982 error = tc_transact(&request, NULL);
/* The warning prints the class handle in major:minor form. */
3984 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3985 netdev_get_name(netdev),
3986 tc_get_major(handle), tc_get_minor(handle),
3992 /* Equivalent to "tc qdisc del dev <name> root". */
3994 tc_del_qdisc(struct netdev *netdev)
3996 struct netdev_dev_linux *netdev_dev =
3997 netdev_dev_linux_cast(netdev_get_dev(netdev));
3998 struct ofpbuf request;
3999 struct tcmsg *tcmsg;
/* Ask the kernel to delete the qdisc with handle 1:0 whose parent is
 * TC_H_ROOT.  No reply buffer is requested. */
4002 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
4006 tcmsg->tcm_handle = tc_make_handle(1, 0);
4007 tcmsg->tcm_parent = TC_H_ROOT;
4009 error = tc_transact(&request, NULL);
4010 if (error == EINVAL) {
4011 /* EINVAL probably means that the default qdisc was in use, in which
4012 * case we've accomplished our purpose. */
/* When there is no error and tc state is cached for this device, discard
 * it: call the ops' 'tc_destroy' hook if one is provided, then clear the
 * cached pointer. */
4015 if (!error && netdev_dev->tc) {
4016 if (netdev_dev->tc->ops->tc_destroy) {
4017 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
4019 netdev_dev->tc = NULL;
4024 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4025 * kernel to determine what they are. Returns 0 if successful, otherwise a
4026 * positive errno value. */
4028 tc_query_qdisc(const struct netdev *netdev)
4030 struct netdev_dev_linux *netdev_dev =
4031 netdev_dev_linux_cast(netdev_get_dev(netdev));
4032 struct ofpbuf request, *qdisc;
4033 const struct tc_ops *ops;
4034 struct tcmsg *tcmsg;
/* A non-null 'netdev_dev->tc' means tc state is already cached for this
 * device. */
4038 if (netdev_dev->tc) {
4042 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4043 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4044 * 2.6.35 without that fix backported to it.
4046 * To avoid the OOPS, we must not make a request that would attempt to dump
4047 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4048 * few others. There are a few ways that I can see to do this, but most of
4049 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4050 * technique chosen here is to assume that any non-default qdisc that we
4051 * create will have a class with handle 1:0. The built-in qdiscs only have
4052 * a class with handle 0:0.
4054 * We could check for Linux 2.6.35+ and use a more straightforward method
4056 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
4060 tcmsg->tcm_handle = tc_make_handle(1, 0);
4061 tcmsg->tcm_parent = 0;
4063 /* Figure out what tc class to instantiate. */
4064 error = tc_transact(&request, &qdisc);
/* Only the qdisc's kind string is needed from the reply, hence NULL for
 * the options argument. */
4068 error = tc_parse_qdisc(qdisc, &kind, NULL);
4070 ops = &tc_ops_other;
/* Map the kernel's qdisc name to an implementation.  A name that is not
 * recognized is logged through its own rate limiter 'rl2' and handled by
 * 'tc_ops_other'. */
4072 ops = tc_lookup_linux_name(kind);
4074 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4075 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4077 ops = &tc_ops_other;
4080 } else if (error == ENOENT) {
4081 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4082 * other entity that doesn't have a handle 1:0. We will assume
4083 * that it's the system default qdisc. */
4084 ops = &tc_ops_default;
4087 /* Who knows? Maybe the device got deleted. */
4088 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4089 netdev_get_name(netdev), strerror(error));
4090 ops = &tc_ops_other;
4093 /* Instantiate it. */
/* The cast removes the const qualifier from 'netdev' before it is passed
 * to 'tc_load'.  The assertion requires that 'tc_load' leaves
 * 'netdev_dev->tc' non-null exactly when it returns 0. */
4094 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
4095 assert((load_error == 0) == (netdev_dev->tc != NULL));
/* The reply is released after 'tc_load' has been given the chance to read
 * it. */
4096 ofpbuf_delete(qdisc);
/* An error from the query takes precedence over one from 'tc_load'. */
4098 return error ? error : load_error;
4101 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4102 approximate the time to transmit packets of various lengths. For an MTU of
4103 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4104 represents two possible packet lengths; for an MTU of 513 through 1024, four
4105 possible lengths; and so on.
4107 Returns, for the specified 'mtu', the number of bits that packet lengths
4108 need to be shifted right to fit within such a 256-entry table. */
4110 tc_calc_cell_log(unsigned int mtu)
/* Replaces 'mtu' with ETH_PAYLOAD_MAX; presumably a default for callers
 * that did not supply a usable MTU (confirm the guarding condition). */
4115 mtu = ETH_PAYLOAD_MAX;
/* Account for the Ethernet and VLAN headers in addition to the payload. */
4117 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
/* 'cell_log' counts the loop iterations taken while 'mtu' is still 256 or
 * more. */
4119 for (cell_log = 0; mtu >= 256; cell_log++) {
4126 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4129 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
/* Start from an all-zero ratespec so that every field not assigned
 * explicitly, including the two named in the commented-out lines below,
 * is 0. */
4131 memset(rate, 0, sizeof *rate);
4132 rate->cell_log = tc_calc_cell_log(mtu);
4133 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4134 /* rate->cell_align = 0; */ /* distro headers. */
/* 'mpu' serves as the lower bound on packet size when tc_put_rtab() builds
 * the table for this ratespec. */
4135 rate->mpu = ETH_TOTAL_MIN;
4139 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4140 * attribute of the specified "type".
4142 * See tc_calc_cell_log() above for a description of "rtab"s. */
4144 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
/* Adds an attribute of 'type' with a TC_RTAB_SIZE-byte payload to 'msg'
 * (presumably left uninitialized, going by the helper's name); the loop
 * below writes every entry of it. */
4149 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4150 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
/* Entry 'i' describes a packet of (i + 1) << cell_log bytes, raised to
 * 'mpu' when that would be smaller. */
4151 unsigned packet_size = (i + 1) << rate->cell_log;
4152 if (packet_size < rate->mpu) {
4153 packet_size = rate->mpu;
/* The stored value is the tick count for that packet size at 'rate->rate'. */
4155 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
4159 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
4160 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
4161 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
4164 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
/* The burst is never taken to be smaller than the tc_buffer_per_jiffy()
 * amount for 'Bps' plus one 'mtu'. */
4166 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
/* NOTE(review): MAX() here compares a uint64_t with an unsigned int, so the
 * result can exceed the range of unsigned int; confirm that
 * tc_bytes_to_ticks() accepts the full range of 'burst_bytes'. */
4167 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
4170 /* Linux-only functions declared in netdev-linux.h */
4172 /* Returns a fd for an AF_INET socket or a negative errno value. */
4174 netdev_linux_get_af_inet_sock(void)
/* A nonzero result from netdev_linux_init() is negated and returned;
 * otherwise the file-level 'af_inet_sock' descriptor is returned. */
4176 int error = netdev_linux_init();
4177 return error ? -error : af_inet_sock;
4180 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4181 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4183 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4184 const char *flag_name, bool enable)
4186 const char *netdev_name = netdev_get_name(netdev);
4187 struct ethtool_value evalue;
/* Step 1: read the current flags with ETHTOOL_GFLAGS.  'evalue' is cast
 * because netdev_linux_do_ethtool() is declared to take a
 * struct ethtool_cmd *. */
4191 memset(&evalue, 0, sizeof evalue);
4192 error = netdev_linux_do_ethtool(netdev_name,
4193 (struct ethtool_cmd *)&evalue,
4194 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* Step 2: set or clear 'flag' in the value just read, remember the
 * intended result in 'new_flags', and write it with ETHTOOL_SFLAGS. */
4199 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4200 error = netdev_linux_do_ethtool(netdev_name,
4201 (struct ethtool_cmd *)&evalue,
4202 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
/* Step 3: read the flags back to check what the device actually holds. */
4207 memset(&evalue, 0, sizeof evalue);
4208 error = netdev_linux_do_ethtool(netdev_name,
4209 (struct ethtool_cmd *)&evalue,
4210 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* If the flags read back differ from the ones requested, the change did
 * not take effect, so log it. */
4215 if (new_flags != evalue.data) {
4216 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4217 "device %s failed", enable ? "enable" : "disable",
4218 flag_name, netdev_name);
4225 /* Utility functions. */
4227 /* Copies 'src' into 'dst', performing format conversion in the process. */
4229 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4230 const struct rtnl_link_stats *src)
/* Field-by-field copy: each member of 'dst' assigned below takes the value
 * of the identically named member of 'src'. */
4232 dst->rx_packets = src->rx_packets;
4233 dst->tx_packets = src->tx_packets;
4234 dst->rx_bytes = src->rx_bytes;
4235 dst->tx_bytes = src->tx_bytes;
4236 dst->rx_errors = src->rx_errors;
4237 dst->tx_errors = src->tx_errors;
4238 dst->rx_dropped = src->rx_dropped;
4239 dst->tx_dropped = src->tx_dropped;
4240 dst->multicast = src->multicast;
4241 dst->collisions = src->collisions;
4242 dst->rx_length_errors = src->rx_length_errors;
4243 dst->rx_over_errors = src->rx_over_errors;
4244 dst->rx_crc_errors = src->rx_crc_errors;
4245 dst->rx_frame_errors = src->rx_frame_errors;
4246 dst->rx_fifo_errors = src->rx_fifo_errors;
4247 dst->rx_missed_errors = src->rx_missed_errors;
4248 dst->tx_aborted_errors = src->tx_aborted_errors;
4249 dst->tx_carrier_errors = src->tx_carrier_errors;
4250 dst->tx_fifo_errors = src->tx_fifo_errors;
4251 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4252 dst->tx_window_errors = src->tx_window_errors;
4256 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4258 /* Policy for RTNLGRP_LINK messages.
4260 * There are *many* more fields in these messages, but currently we only
4261 * care about these fields. */
/* NOTE(review): IFLA_IFNAME is required by this policy, but only IFLA_STATS
 * appears to be read below; confirm the requirement is intentional. */
4262 static const struct nl_policy rtnlgrp_link_policy[] = {
4263 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4264 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4265 .min_len = sizeof(struct rtnl_link_stats) },
4268 struct ofpbuf request;
4269 struct ofpbuf *reply;
4270 struct ifinfomsg *ifi;
4271 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
/* Build an RTM_GETLINK request for the link whose index is 'ifindex' and
 * send it over 'rtnl_sock'.  The request buffer is released as soon as the
 * transaction returns, whatever its outcome. */
4274 ofpbuf_init(&request, 0);
4275 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4276 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4277 ifi->ifi_family = PF_UNSPEC;
4278 ifi->ifi_index = ifindex;
4279 error = nl_sock_transact(rtnl_sock, &request, &reply);
4280 ofpbuf_uninit(&request);
/* The attributes follow the Netlink header and the struct ifinfomsg.
 * 'reply' is deleted on the parse-failure path. */
4285 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4286 rtnlgrp_link_policy,
4287 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4288 ofpbuf_delete(reply);
/* IFLA_STATS is optional in the policy, so its absence has to be checked
 * for explicitly. */
4292 if (!attrs[IFLA_STATS]) {
4293 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4294 ofpbuf_delete(reply);
/* Convert the attribute payload, read as a struct rtnl_link_stats, into
 * '*stats'; the policy's 'min_len' presumably guarantees that the payload
 * is large enough.  'reply' is deleted once the values have been copied
 * out of it. */
4298 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4300 ofpbuf_delete(reply);
4306 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
/* Statistics are read from this file one line at a time. */
4308 static const char fn[] = "/proc/net/dev";
4313 stream = fopen(fn, "r");
4315 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4320 while (fgets(line, sizeof line, stream)) {
/* X64 is the scanf conversion for one uint64_t.  Each "%*u" in the format
 * below reads an unsigned field and discards it.  A line is accepted only
 * if exactly 15 items are converted. */
4323 #define X64 "%"SCNu64
4326 X64 X64 X64 X64 X64 X64 X64 "%*u"
4327 X64 X64 X64 X64 X64 X64 X64 "%*u",
4333 &stats->rx_fifo_errors,
4334 &stats->rx_frame_errors,
4340 &stats->tx_fifo_errors,
4342 &stats->tx_carrier_errors) != 15) {
4343 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4344 } else if (!strcmp(devname, netdev_name)) {
/* For the matching device, these counters are set to UINT64_MAX rather
 * than to a parsed value; presumably that marks them as unavailable from
 * this source (confirm against the netdev_stats contract). */
4345 stats->rx_length_errors = UINT64_MAX;
4346 stats->rx_over_errors = UINT64_MAX;
4347 stats->rx_crc_errors = UINT64_MAX;
4348 stats->rx_missed_errors = UINT64_MAX;
4349 stats->tx_aborted_errors = UINT64_MAX;
4350 stats->tx_heartbeat_errors = UINT64_MAX;
4351 stats->tx_window_errors = UINT64_MAX;
/* Logged when the file yielded no statistics for 'netdev_name'. */
4357 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
4363 get_flags(const struct netdev_dev *dev, unsigned int *flags)
/* Issues SIOCGIFFLAGS for the device named 'dev->name' and copies the
 * resulting 'ifr_flags' into '*flags'.
 * NOTE(review): 'ifr_flags' is a short in the system's struct ifreq, so a
 * value with bit 15 set would sign-extend when converted to unsigned int;
 * confirm whether callers can observe that. */
4369 error = netdev_linux_do_ioctl(dev->name, &ifr, SIOCGIFFLAGS,
4372 *flags = ifr.ifr_flags;
4378 set_flags(struct netdev *netdev, unsigned int flags)
/* Stores 'flags' in the request and issues SIOCSIFFLAGS for 'netdev''s
 * name; the ioctl helper's result is returned as-is. */
4382 ifr.ifr_flags = flags;
4383 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
4388 do_get_ifindex(const char *netdev_name)
/* Looks up the interface index for 'netdev_name' with SIOCGIFINDEX on the
 * file-level 'af_inet_sock'.  Every call is counted in the
 * netdev_get_ifindex coverage counter. */
4392 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4393 COVERAGE_INC(netdev_get_ifindex);
4394 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4395 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4396 netdev_name, strerror(errno));
/* The index filled in by the ioctl. */
4399 return ifr.ifr_ifindex;
4403 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4405 struct netdev_dev_linux *netdev_dev =
4406 netdev_dev_linux_cast(netdev_get_dev(netdev_));
/* The ifindex is cached in 'netdev_dev'.  The VALID_IFINDEX bit of
 * 'cache_valid' records whether the cached value may be used; when it is
 * clear, the index is looked up and the cache is filled. */
4408 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4409 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4413 netdev_dev->cache_valid |= VALID_IFINDEX;
4414 netdev_dev->ifindex = ifindex;
/* The caller always receives the cached value. */
4416 *ifindexp = netdev_dev->ifindex;
4421 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
/* Queries the hardware address of 'netdev_name' with SIOCGIFHWADDR on the
 * file-level 'af_inet_sock', starting from a zeroed request. */
4426 memset(&ifr, 0, sizeof ifr);
4427 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4428 COVERAGE_INC(netdev_get_hwaddr);
4429 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4430 /* ENODEV probably means that a vif disappeared asynchronously and
4431 * hasn't been removed from the database yet, so reduce the log level
4432 * to INFO for that case. */
4433 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4434 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4435 netdev_name, strerror(errno));
/* A family other than AF_UNSPEC or ARPHRD_ETHER is reported with a
 * warning.
 * NOTE(review): for SIOCGIFHWADDR the family field holds an ARPHRD_* value,
 * so comparing it with the AF_* constant AF_UNSPEC mixes two namespaces;
 * confirm that this is the intended check. */
4438 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4439 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4440 VLOG_WARN("%s device has unknown hardware address family %d",
4441 netdev_name, hwaddr_family);
/* Copies the first ETH_ADDR_LEN bytes of the returned address into 'ea'. */
4443 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4448 set_etheraddr(const char *netdev_name,
4449 const uint8_t mac[ETH_ADDR_LEN])
/* Builds a zeroed request that names the device, marks the address family
 * as ARPHRD_ETHER and carries the ETH_ADDR_LEN bytes of 'mac', then
 * applies it with SIOCSIFHWADDR on the file-level 'af_inet_sock'. */
4453 memset(&ifr, 0, sizeof ifr);
4454 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4455 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4456 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4457 COVERAGE_INC(netdev_set_hwaddr);
4458 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
/* Failures are logged at error level, without rate limiting. */
4459 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4460 netdev_name, strerror(errno));
4467 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4468 int cmd, const char *cmd_name)
/* The ethtool request is handed to the SIOCETHTOOL ioctl through
 * 'ifr_data'; the rest of the request is zeroed apart from the device
 * name. */
4472 memset(&ifr, 0, sizeof ifr);
4473 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4474 ifr.ifr_data = (caddr_t) ecmd;
/* Every call is counted in the netdev_ethtool coverage counter. */
4477 COVERAGE_INC(netdev_ethtool);
4478 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
/* Any failure other than EOPNOTSUPP is logged, using 'cmd_name' to
 * identify the command. */
4481 if (errno != EOPNOTSUPP) {
4482 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4483 "failed: %s", cmd_name, name, strerror(errno));
4485 /* The device doesn't support this operation. That's pretty
4486 * common, so there's no point in logging anything. */
4493 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4494 const char *cmd_name)
/* Only 'ifr_name' is filled in here; any other field of '*ifr' that 'cmd'
 * reads is left as the caller set it.  The ioctl is issued on the
 * file-level 'af_inet_sock', and a failure is logged at debug level with
 * 'cmd_name' identifying the command. */
4496 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4497 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4498 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4506 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4507 int cmd, const char *cmd_name)
/* Marks the request's address as AF_INET and runs 'cmd' for 'netdev''s
 * name through the common ioctl helper. */
4512 ifr.ifr_addr.sa_family = AF_INET;
4513 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
/* The generic socket address in the reply is read as a struct sockaddr_in
 * and its address member is copied into '*ip'. */
4515 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4516 *ip = sin->sin_addr;
4521 /* Returns an AF_PACKET raw socket or a negative errno value. */
4523 af_packet_sock(void)
4525 static int sock = INT_MIN;
4527 if (sock == INT_MIN) {
4528 sock = socket(AF_PACKET, SOCK_RAW, 0);
4530 set_nonblocking(sock);
4533 VLOG_ERR("failed to create packet socket: %s", strerror(errno));