2 * Copyright (c) 2009, 2010, 2011, 2012 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
24 #include <arpa/inet.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <linux/version.h>
37 #include <sys/types.h>
38 #include <sys/ioctl.h>
39 #include <sys/socket.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
52 #include "dpif-linux.h"
53 #include "dynamic-string.h"
54 #include "fatal-signal.h"
57 #include "netdev-provider.h"
58 #include "netdev-vport.h"
60 #include "netlink-notifier.h"
61 #include "netlink-socket.h"
63 #include "openflow/openflow.h"
65 #include "poll-loop.h"
66 #include "rtnetlink-link.h"
67 #include "socket-util.h"
73 VLOG_DEFINE_THIS_MODULE(netdev_linux);
75 COVERAGE_DEFINE(netdev_set_policing);
76 COVERAGE_DEFINE(netdev_arp_lookup);
77 COVERAGE_DEFINE(netdev_get_ifindex);
78 COVERAGE_DEFINE(netdev_get_hwaddr);
79 COVERAGE_DEFINE(netdev_set_hwaddr);
80 COVERAGE_DEFINE(netdev_ethtool);
83 /* These were introduced in Linux 2.6.14, so they might be missing if we have
85 #ifndef ADVERTISED_Pause
86 #define ADVERTISED_Pause (1 << 13)
88 #ifndef ADVERTISED_Asym_Pause
89 #define ADVERTISED_Asym_Pause (1 << 14)
92 /* These were introduced in Linux 2.6.24, so they might be missing if we
93 * have old headers. */
94 #ifndef ETHTOOL_GFLAGS
95 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
97 #ifndef ETHTOOL_SFLAGS
98 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
101 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
104 #define TC_RTAB_SIZE 1024
107 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
108 static int cache_notifier_refcount;
111 VALID_IFINDEX = 1 << 0,
112 VALID_ETHERADDR = 1 << 1,
116 VALID_POLICING = 1 << 5,
117 VALID_VPORT_STAT_ERROR = 1 << 6,
118 VALID_DRVINFO = 1 << 7,
119 VALID_FEATURES = 1 << 8,
127 /* Traffic control. */
129 /* An instance of a traffic control class. Always associated with a particular
132 * Each TC implementation subclasses this with whatever additional data it
135 const struct tc_ops *ops;
136 struct hmap queues; /* Contains "struct tc_queue"s.
137 * Read by generic TC layer.
138 * Written only by TC implementation. */
141 /* One traffic control queue.
143 * Each TC implementation subclasses this with whatever additional data it
146 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
147 unsigned int queue_id; /* OpenFlow queue ID. */
150 /* A particular kind of traffic control. Each implementation generally maps to
151 * one particular Linux qdisc class.
153 * The functions below return 0 if successful or a positive errno value on
154 * failure, except where otherwise noted. All of them must be provided, except
155 * where otherwise noted. */
157 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
158 * This is null for tc_ops_default and tc_ops_other, for which there are no
159 * appropriate values. */
160 const char *linux_name;
162 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
163 const char *ovs_name;
165 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
166 * queues. The queues are numbered 0 through n_queues - 1. */
167 unsigned int n_queues;
169 /* Called to install this TC class on 'netdev'. The implementation should
170 * make the Netlink calls required to set up 'netdev' with the right qdisc
171 * and configure it according to 'details'. The implementation may assume
172 * that the current qdisc is the default; that is, there is no need for it
173 * to delete the current qdisc before installing itself.
175 * The contents of 'details' should be documented as valid for 'ovs_name'
176 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
177 * (which is built as ovs-vswitchd.conf.db(8)).
179 * This function must return 0 if and only if it sets 'netdev->tc' to an
180 * initialized 'struct tc'.
182 * (This function is null for tc_ops_other, which cannot be installed. For
183 * other TC classes it should always be nonnull.) */
184 int (*tc_install)(struct netdev *netdev, const struct shash *details);
186 /* Called when the netdev code determines (through a Netlink query) that
187 * this TC class's qdisc is installed on 'netdev', but we didn't install
188 * it ourselves and so don't know any of the details.
190 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
191 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
192 * implementation should parse the other attributes of 'nlmsg' as
193 * necessary to determine its configuration. If necessary it should also
194 * use Netlink queries to determine the configuration of queues on
197 * This function must return 0 if and only if it sets 'netdev->tc' to an
198 * initialized 'struct tc'. */
199 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
201 /* Destroys the data structures allocated by the implementation as part of
202 * 'tc'. (This includes destroying 'tc->queues' by calling
205 * The implementation should not need to perform any Netlink calls. If
206 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
207 * (But it may not be desirable.)
209 * This function may be null if 'tc' is trivial. */
210 void (*tc_destroy)(struct tc *tc);
212 /* Retrieves details of 'netdev->tc' configuration into 'details'.
214 * The implementation should not need to perform any Netlink calls, because
215 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
216 * cached the configuration.
218 * The contents of 'details' should be documented as valid for 'ovs_name'
219 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
220 * (which is built as ovs-vswitchd.conf.db(8)).
222 * This function may be null if 'tc' is not configurable.
224 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
226 /* Reconfigures 'netdev->tc' according to 'details', performing any
227 * required Netlink calls to complete the reconfiguration.
229 * The contents of 'details' should be documented as valid for 'ovs_name'
230 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
231 * (which is built as ovs-vswitchd.conf.db(8)).
233 * This function may be null if 'tc' is not configurable.
235 int (*qdisc_set)(struct netdev *, const struct shash *details);
237 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
238 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
240 * The contents of 'details' should be documented as valid for 'ovs_name'
241 * in the "other_config" column in the "Queue" table in
242 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
244 * The implementation should not need to perform any Netlink calls, because
245 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
246 * cached the queue configuration.
248 * This function may be null if 'tc' does not have queues ('n_queues' is
250 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
251 struct shash *details);
253 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
254 * 'details', performing any required Netlink calls to complete the
255 * reconfiguration. The caller ensures that 'queue_id' is less than
258 * The contents of 'details' should be documented as valid for 'ovs_name'
259 * in the "other_config" column in the "Queue" table in
260 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
262 * This function may be null if 'tc' does not have queues or its queues are
263 * not configurable. */
264 int (*class_set)(struct netdev *, unsigned int queue_id,
265 const struct shash *details);
267 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
268 * tc_queue's within 'netdev->tc->queues'.
270 * This function may be null if 'tc' does not have queues or its queues
271 * cannot be deleted. */
272 int (*class_delete)(struct netdev *, struct tc_queue *queue);
274 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
275 * 'struct tc_queue's within 'netdev->tc->queues'.
277 * On success, initializes '*stats'.
279 * This function may be null if 'tc' does not have queues or if it cannot
280 * report queue statistics. */
281 int (*class_get_stats)(const struct netdev *netdev,
282 const struct tc_queue *queue,
283 struct netdev_queue_stats *stats);
285 /* Extracts queue stats from 'nlmsg', which is a response to a
286 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
288 * This function may be null if 'tc' does not have queues or if it cannot
289 * report queue statistics. */
290 int (*class_dump_stats)(const struct netdev *netdev,
291 const struct ofpbuf *nlmsg,
292 netdev_dump_queue_stats_cb *cb, void *aux);
296 tc_init(struct tc *tc, const struct tc_ops *ops)
299 hmap_init(&tc->queues);
303 tc_destroy(struct tc *tc)
305 hmap_destroy(&tc->queues);
308 static const struct tc_ops tc_ops_htb;
309 static const struct tc_ops tc_ops_hfsc;
310 static const struct tc_ops tc_ops_default;
311 static const struct tc_ops tc_ops_other;
313 static const struct tc_ops *tcs[] = {
314 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
315 &tc_ops_hfsc, /* Hierarchical fair service curve. */
316 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
317 &tc_ops_other, /* Some other qdisc. */
321 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
322 static unsigned int tc_get_major(unsigned int handle);
323 static unsigned int tc_get_minor(unsigned int handle);
325 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
326 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
327 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
329 static struct tcmsg *tc_make_request(const struct netdev *, int type,
330 unsigned int flags, struct ofpbuf *);
331 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
332 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
333 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
336 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
337 struct nlattr **options);
338 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
339 struct nlattr **options,
340 struct netdev_queue_stats *);
341 static int tc_query_class(const struct netdev *,
342 unsigned int handle, unsigned int parent,
343 struct ofpbuf **replyp);
344 static int tc_delete_class(const struct netdev *, unsigned int handle);
346 static int tc_del_qdisc(struct netdev *netdev);
347 static int tc_query_qdisc(const struct netdev *netdev);
349 static int tc_calc_cell_log(unsigned int mtu);
350 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
351 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
352 const struct tc_ratespec *rate);
353 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
355 struct netdev_dev_linux {
356 struct netdev_dev netdev_dev;
358 struct shash_node *shash_node;
359 unsigned int cache_valid;
360 unsigned int change_seq;
362 bool miimon; /* Link status of last poll. */
363 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
364 struct timer miimon_timer;
366 /* The following are figured out "on demand" only. They are only valid
367 * when the corresponding VALID_* bit in 'cache_valid' is set. */
369 uint8_t etheraddr[ETH_ADDR_LEN];
370 struct in_addr address, netmask;
373 unsigned int ifi_flags;
374 long long int carrier_resets;
375 uint32_t kbits_rate; /* Policing data. */
376 uint32_t kbits_burst;
377 int vport_stats_error; /* Cached error code from vport_get_stats().
378 0 or an errno value. */
379 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
380 int ether_addr_error; /* Cached error code from set/get etheraddr. */
381 int netdev_policing_error; /* Cached error code from set policing. */
382 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
383 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
385 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
386 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
387 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
388 enum netdev_features peer; /* Cached from ETHTOOL_GSET. */
390 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
394 struct tap_state tap;
398 struct netdev_linux {
399 struct netdev netdev;
403 /* Sockets used for ioctl operations. */
404 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
406 /* A Netlink routing socket that is not subscribed to any multicast groups. */
407 static struct nl_sock *rtnl_sock;
409 /* This is set pretty low because we probably won't learn anything from the
410 * additional log messages. */
411 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
413 static int netdev_linux_init(void);
415 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
416 int cmd, const char *cmd_name);
417 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
418 const char *cmd_name);
419 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
420 int cmd, const char *cmd_name);
421 static int get_flags(const struct netdev_dev *, unsigned int *flags);
422 static int set_flags(struct netdev *, unsigned int flags);
423 static int do_get_ifindex(const char *netdev_name);
424 static int get_ifindex(const struct netdev *, int *ifindexp);
425 static int do_set_addr(struct netdev *netdev,
426 int ioctl_nr, const char *ioctl_name,
427 struct in_addr addr);
428 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
429 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
430 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
431 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
432 static int af_packet_sock(void);
433 static void netdev_linux_miimon_run(void);
434 static void netdev_linux_miimon_wait(void);
437 is_netdev_linux_class(const struct netdev_class *netdev_class)
439 return netdev_class->init == netdev_linux_init;
442 static struct netdev_dev_linux *
443 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
445 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
446 assert(is_netdev_linux_class(netdev_class));
448 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
451 static struct netdev_linux *
452 netdev_linux_cast(const struct netdev *netdev)
454 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
455 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
456 assert(is_netdev_linux_class(netdev_class));
458 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
462 netdev_linux_init(void)
464 static int status = -1;
466 /* Create AF_INET socket. */
467 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
468 status = af_inet_sock >= 0 ? 0 : errno;
470 VLOG_ERR("failed to create inet socket: %s", strerror(status));
473 /* Create rtnetlink socket. */
475 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
477 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
/* Periodic processing for the Linux netdev classes: first lets the rtnetlink
 * link notifier run, then polls MII link state for devices that have
 * miimon enabled. */
static void
netdev_linux_run(void)
{
    rtnetlink_link_run();
    netdev_linux_miimon_run();
}
/* Poll-loop counterpart of netdev_linux_run(): registers the wakeups needed
 * by the rtnetlink link notifier and by the miimon timers. */
static void
netdev_linux_wait(void)
{
    rtnetlink_link_wait();
    netdev_linux_miimon_wait();
}
500 netdev_linux_get_drvinfo(struct netdev_dev_linux *netdev_dev)
505 if (netdev_dev->cache_valid & VALID_DRVINFO) {
509 memset(&netdev_dev->drvinfo, 0, sizeof netdev_dev->drvinfo);
510 error = netdev_linux_do_ethtool(netdev_dev->netdev_dev.name,
511 (struct ethtool_cmd *)&netdev_dev->drvinfo,
515 netdev_dev->cache_valid |= VALID_DRVINFO;
521 netdev_dev_linux_changed(struct netdev_dev_linux *dev,
522 unsigned int ifi_flags,
526 if (!dev->change_seq) {
530 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
531 dev->carrier_resets++;
533 dev->ifi_flags = ifi_flags;
535 dev->cache_valid &= mask;
539 netdev_dev_linux_update(struct netdev_dev_linux *dev,
540 const struct rtnetlink_link_change *change)
542 if (change->nlmsg_type == RTM_NEWLINK) {
544 netdev_dev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
546 /* Update netdev from rtnl-change msg. */
548 dev->mtu = change->mtu;
549 dev->cache_valid |= VALID_MTU;
550 dev->netdev_mtu_error = 0;
553 if (!eth_addr_is_zero(change->addr)) {
554 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
555 dev->cache_valid |= VALID_ETHERADDR;
556 dev->ether_addr_error = 0;
559 dev->ifindex = change->ifi_index;
560 dev->cache_valid |= VALID_IFINDEX;
561 dev->get_ifindex_error = 0;
564 netdev_dev_linux_changed(dev, change->ifi_flags, 0);
569 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
570 void *aux OVS_UNUSED)
572 struct netdev_dev_linux *dev;
574 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
576 const struct netdev_class *netdev_class =
577 netdev_dev_get_class(base_dev);
579 if (is_netdev_linux_class(netdev_class)) {
580 dev = netdev_dev_linux_cast(base_dev);
581 netdev_dev_linux_update(dev, change);
585 struct shash device_shash;
586 struct shash_node *node;
588 shash_init(&device_shash);
589 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
590 SHASH_FOR_EACH (node, &device_shash) {
595 get_flags(&dev->netdev_dev, &flags);
596 netdev_dev_linux_changed(dev, flags, 0);
598 shash_destroy(&device_shash);
603 cache_notifier_ref(void)
605 if (!cache_notifier_refcount) {
606 assert(!netdev_linux_cache_notifier);
608 netdev_linux_cache_notifier =
609 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
611 if (!netdev_linux_cache_notifier) {
615 cache_notifier_refcount++;
621 cache_notifier_unref(void)
623 assert(cache_notifier_refcount > 0);
624 if (!--cache_notifier_refcount) {
625 assert(netdev_linux_cache_notifier);
626 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
627 netdev_linux_cache_notifier = NULL;
631 /* Creates system and internal devices. */
633 netdev_linux_create(const struct netdev_class *class, const char *name,
634 struct netdev_dev **netdev_devp)
636 struct netdev_dev_linux *netdev_dev;
639 error = cache_notifier_ref();
644 netdev_dev = xzalloc(sizeof *netdev_dev);
645 netdev_dev->change_seq = 1;
646 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
647 get_flags(&netdev_dev->netdev_dev, &netdev_dev->ifi_flags);
649 *netdev_devp = &netdev_dev->netdev_dev;
653 /* For most types of netdevs we open the device for each call of
654 * netdev_open(). However, this is not the case with tap devices,
655 * since it is only possible to open the device once. In this
656 * situation we share a single file descriptor, and consequently
657 * buffers, across all readers. Therefore once data is read it will
658 * be unavailable to other reads for tap devices. */
660 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
661 const char *name, struct netdev_dev **netdev_devp)
663 struct netdev_dev_linux *netdev_dev;
664 struct tap_state *state;
665 static const char tap_dev[] = "/dev/net/tun";
669 netdev_dev = xzalloc(sizeof *netdev_dev);
670 state = &netdev_dev->state.tap;
672 error = cache_notifier_ref();
677 /* Open tap device. */
678 state->fd = open(tap_dev, O_RDWR);
681 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
682 goto error_unref_notifier;
685 /* Create tap device. */
686 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
687 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
688 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
689 VLOG_WARN("%s: creating tap device failed: %s", name,
692 goto error_unref_notifier;
695 /* Make non-blocking. */
696 error = set_nonblocking(state->fd);
698 goto error_unref_notifier;
701 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
702 *netdev_devp = &netdev_dev->netdev_dev;
705 error_unref_notifier:
706 cache_notifier_unref();
713 destroy_tap(struct netdev_dev_linux *netdev_dev)
715 struct tap_state *state = &netdev_dev->state.tap;
717 if (state->fd >= 0) {
722 /* Destroys the netdev device 'netdev_dev_'. */
724 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
726 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
727 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
729 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
730 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
733 if (class == &netdev_tap_class) {
734 destroy_tap(netdev_dev);
738 cache_notifier_unref();
742 netdev_linux_open(struct netdev_dev *netdev_dev_, struct netdev **netdevp)
744 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
745 struct netdev_linux *netdev;
746 enum netdev_flags flags;
749 /* Allocate network device. */
750 netdev = xzalloc(sizeof *netdev);
752 netdev_init(&netdev->netdev, netdev_dev_);
754 /* Verify that the device really exists, by attempting to read its flags.
755 * (The flags might be cached, in which case this won't actually do an
758 * Don't do this for "internal" netdevs, though, because those have to be
759 * created as netdev objects before they exist in the kernel, because
760 * creating them in the kernel happens by passing a netdev object to
761 * dpif_port_add(). */
762 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
763 error = netdev_get_flags(&netdev->netdev, &flags);
764 if (error == ENODEV) {
769 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
770 !netdev_dev->state.tap.opened) {
772 /* We assume that the first user of the tap device is the primary user
773 * and give them the tap FD. Subsequent users probably just expect
774 * this to be a system device so open it normally to avoid send/receive
775 * directions appearing to be reversed. */
776 netdev->fd = netdev_dev->state.tap.fd;
777 netdev_dev->state.tap.opened = true;
780 *netdevp = &netdev->netdev;
784 netdev_uninit(&netdev->netdev, true);
788 /* Closes and destroys 'netdev'. */
790 netdev_linux_close(struct netdev *netdev_)
792 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
794 if (netdev->fd > 0 && strcmp(netdev_get_type(netdev_), "tap")) {
801 netdev_linux_listen(struct netdev *netdev_)
803 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
804 struct sockaddr_ll sll;
809 if (netdev->fd >= 0) {
813 /* Create file descriptor. */
814 fd = socket(PF_PACKET, SOCK_RAW, 0);
817 VLOG_ERR("failed to create raw socket (%s)", strerror(error));
821 /* Set non-blocking mode. */
822 error = set_nonblocking(fd);
827 /* Get ethernet device index. */
828 error = get_ifindex(&netdev->netdev, &ifindex);
833 /* Bind to specific ethernet device. */
834 memset(&sll, 0, sizeof sll);
835 sll.sll_family = AF_PACKET;
836 sll.sll_ifindex = ifindex;
837 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
838 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
840 VLOG_ERR("%s: failed to bind raw socket (%s)",
841 netdev_get_name(netdev_), strerror(error));
856 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
858 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
860 if (netdev->fd < 0) {
861 /* Device is not listening. */
868 retval = (netdev_->netdev_dev->netdev_class == &netdev_tap_class
869 ? read(netdev->fd, data, size)
870 : recv(netdev->fd, data, size, MSG_TRUNC));
872 return retval <= size ? retval : -EMSGSIZE;
873 } else if (errno != EINTR) {
874 if (errno != EAGAIN) {
875 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
876 strerror(errno), netdev_get_name(netdev_));
883 /* Registers with the poll loop to wake up from the next call to poll_block()
884 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
886 netdev_linux_recv_wait(struct netdev *netdev_)
888 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
889 if (netdev->fd >= 0) {
890 poll_fd_wait(netdev->fd, POLLIN);
894 /* Discards all packets waiting to be received from 'netdev'. */
896 netdev_linux_drain(struct netdev *netdev_)
898 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
899 if (netdev->fd < 0) {
901 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
903 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
904 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
908 drain_fd(netdev->fd, ifr.ifr_qlen);
911 return drain_rcvbuf(netdev->fd);
915 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
916 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
917 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
918 * the packet is too big or too small to transmit on the device.
920 * The caller retains ownership of 'buffer' in all cases.
922 * The kernel maintains a packet transmission queue, so the caller is not
923 * expected to do additional queuing of packets. */
925 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
927 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
931 if (netdev->fd < 0) {
932 /* Use our AF_PACKET socket to send to this device. */
933 struct sockaddr_ll sll;
940 sock = af_packet_sock();
945 error = get_ifindex(netdev_, &ifindex);
950 /* We don't bother setting most fields in sockaddr_ll because the
951 * kernel ignores them for SOCK_RAW. */
952 memset(&sll, 0, sizeof sll);
953 sll.sll_family = AF_PACKET;
954 sll.sll_ifindex = ifindex;
956 iov.iov_base = (void *) data;
960 msg.msg_namelen = sizeof sll;
963 msg.msg_control = NULL;
964 msg.msg_controllen = 0;
967 retval = sendmsg(sock, &msg, 0);
969 /* Use the netdev's own fd to send to this device. This is
970 * essential for tap devices, because packets sent to a tap device
971 * with an AF_PACKET socket will loop back to be *received* again
972 * on the tap device. */
973 retval = write(netdev->fd, data, size);
977 /* The Linux AF_PACKET implementation never blocks waiting for room
978 * for packets, instead returning ENOBUFS. Translate this into
979 * EAGAIN for the caller. */
980 if (errno == ENOBUFS) {
982 } else if (errno == EINTR) {
984 } else if (errno != EAGAIN) {
985 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
986 netdev_get_name(netdev_), strerror(errno));
989 } else if (retval != size) {
990 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
991 "%zu) on %s", retval, size, netdev_get_name(netdev_));
999 /* Registers with the poll loop to wake up from the next call to poll_block()
1000 * when the packet transmission queue has sufficient room to transmit a packet
1001 * with netdev_send().
1003 * The kernel maintains a packet transmission queue, so the client is not
1004 * expected to do additional queuing of packets. Thus, this function is
1005 * unlikely to ever be used. It is included for completeness. */
1007 netdev_linux_send_wait(struct netdev *netdev_)
1009 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1010 if (netdev->fd < 0) {
1011 /* Nothing to do. */
1012 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
1013 poll_fd_wait(netdev->fd, POLLOUT);
1015 /* TAP device always accepts packets.*/
1016 poll_immediate_wake();
1020 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1021 * otherwise a positive errno value. */
1023 netdev_linux_set_etheraddr(struct netdev *netdev_,
1024 const uint8_t mac[ETH_ADDR_LEN])
1026 struct netdev_dev_linux *netdev_dev =
1027 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1030 if (netdev_dev->cache_valid & VALID_ETHERADDR) {
1031 if (netdev_dev->ether_addr_error) {
1032 return netdev_dev->ether_addr_error;
1034 if (eth_addr_equals(netdev_dev->etheraddr, mac)) {
1037 netdev_dev->cache_valid &= ~VALID_ETHERADDR;
1040 error = set_etheraddr(netdev_get_name(netdev_), mac);
1041 if (!error || error == ENODEV) {
1042 netdev_dev->ether_addr_error = error;
1043 netdev_dev->cache_valid |= VALID_ETHERADDR;
1045 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
1052 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1054 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1055 uint8_t mac[ETH_ADDR_LEN])
1057 struct netdev_dev_linux *netdev_dev =
1058 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1060 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
1061 int error = get_etheraddr(netdev_get_name(netdev_),
1062 netdev_dev->etheraddr);
1064 netdev_dev->ether_addr_error = error;
1065 netdev_dev->cache_valid |= VALID_ETHERADDR;
1068 if (!netdev_dev->ether_addr_error) {
1069 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
1072 return netdev_dev->ether_addr_error;
1075 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1076 * in bytes, not including the hardware header; thus, this is typically 1500
1077 * bytes for Ethernet devices. */
1079 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1081 struct netdev_dev_linux *netdev_dev =
1082 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1083 if (!(netdev_dev->cache_valid & VALID_MTU)) {
1087 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1088 SIOCGIFMTU, "SIOCGIFMTU");
1090 netdev_dev->netdev_mtu_error = error;
1091 netdev_dev->mtu = ifr.ifr_mtu;
1092 netdev_dev->cache_valid |= VALID_MTU;
1095 if (!netdev_dev->netdev_mtu_error) {
1096 *mtup = netdev_dev->mtu;
1098 return netdev_dev->netdev_mtu_error;
1101 /* Sets the maximum transmission unit (MTU) of the given device using the
1102 * Linux networking ioctl interface.
1105 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1107 struct netdev_dev_linux *netdev_dev =
1108 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1112 if (netdev_dev->cache_valid & VALID_MTU) {
1113 if (netdev_dev->netdev_mtu_error) {
1114 return netdev_dev->netdev_mtu_error;
1116 if (netdev_dev->mtu == mtu) {
1119 netdev_dev->cache_valid &= ~VALID_MTU;
1122 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1123 SIOCSIFMTU, "SIOCSIFMTU");
1124 if (!error || error == ENODEV) {
1125 netdev_dev->netdev_mtu_error = error;
1126 netdev_dev->mtu = ifr.ifr_mtu;
1127 netdev_dev->cache_valid |= VALID_MTU;
/* Looks up the ifindex of 'netdev'.  On success the (positive) ifindex is
 * returned; on failure the result is a negative errno value. */
static int
netdev_linux_get_ifindex(const struct netdev *netdev)
{
    int ifindex;
    int error = get_ifindex(netdev, &ifindex);

    if (error) {
        return -error;
    }
    return ifindex;
}
1144 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1146 struct netdev_dev_linux *netdev_dev =
1147 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1149 if (netdev_dev->miimon_interval > 0) {
1150 *carrier = netdev_dev->miimon;
1152 *carrier = (netdev_dev->ifi_flags & IFF_RUNNING) != 0;
1158 static long long int
1159 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1161 return netdev_dev_linux_cast(netdev_get_dev(netdev))->carrier_resets;
1165 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1166 struct mii_ioctl_data *data)
1171 memset(&ifr, 0, sizeof ifr);
1172 memcpy(&ifr.ifr_data, data, sizeof *data);
1173 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1174 memcpy(data, &ifr.ifr_data, sizeof *data);
1180 netdev_linux_get_miimon(const char *name, bool *miimon)
1182 struct mii_ioctl_data data;
1187 memset(&data, 0, sizeof data);
1188 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1190 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1191 data.reg_num = MII_BMSR;
1192 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1196 *miimon = !!(data.val_out & BMSR_LSTATUS);
1198 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1201 struct ethtool_cmd ecmd;
1203 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1206 memset(&ecmd, 0, sizeof ecmd);
1207 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1210 struct ethtool_value eval;
1212 memcpy(&eval, &ecmd, sizeof eval);
1213 *miimon = !!eval.data;
1215 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1223 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1224 long long int interval)
1226 struct netdev_dev_linux *netdev_dev;
1228 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
1230 interval = interval > 0 ? MAX(interval, 100) : 0;
1231 if (netdev_dev->miimon_interval != interval) {
1232 netdev_dev->miimon_interval = interval;
1233 timer_set_expired(&netdev_dev->miimon_timer);
1240 netdev_linux_miimon_run(void)
1242 struct shash device_shash;
1243 struct shash_node *node;
1245 shash_init(&device_shash);
1246 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1247 SHASH_FOR_EACH (node, &device_shash) {
1248 struct netdev_dev_linux *dev = node->data;
1251 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1255 netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
1256 if (miimon != dev->miimon) {
1257 dev->miimon = miimon;
1258 netdev_dev_linux_changed(dev, dev->ifi_flags, 0);
1261 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1264 shash_destroy(&device_shash);
1268 netdev_linux_miimon_wait(void)
1270 struct shash device_shash;
1271 struct shash_node *node;
1273 shash_init(&device_shash);
1274 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1275 SHASH_FOR_EACH (node, &device_shash) {
1276 struct netdev_dev_linux *dev = node->data;
1278 if (dev->miimon_interval > 0) {
1279 timer_wait(&dev->miimon_timer);
1282 shash_destroy(&device_shash);
1285 /* Check whether we can use RTM_GETLINK to get network device statistics.
1286 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1289 check_for_working_netlink_stats(void)
1291 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1292 * preferable, so if that works, we'll use it. */
1293 int ifindex = do_get_ifindex("lo");
1295 VLOG_WARN("failed to get ifindex for lo, "
1296 "obtaining netdev stats from proc");
1299 struct netdev_stats stats;
1300 int error = get_stats_via_netlink(ifindex, &stats);
1302 VLOG_DBG("obtaining netdev stats via rtnetlink");
1305 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1306 "via proc (you are probably running a pre-2.6.19 "
1307 "kernel)", strerror(error));
1314 swap_uint64(uint64_t *a, uint64_t *b)
1322 get_stats_via_vport(const struct netdev *netdev_,
1323 struct netdev_stats *stats)
1325 struct netdev_dev_linux *netdev_dev =
1326 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1328 if (!netdev_dev->vport_stats_error ||
1329 !(netdev_dev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1332 error = netdev_vport_get_stats(netdev_, stats);
1334 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1335 "(%s)", netdev_get_name(netdev_), strerror(error));
1337 netdev_dev->vport_stats_error = error;
1338 netdev_dev->cache_valid |= VALID_VPORT_STAT_ERROR;
1343 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1344 struct netdev_stats *stats)
1346 static int use_netlink_stats = -1;
1349 if (use_netlink_stats < 0) {
1350 use_netlink_stats = check_for_working_netlink_stats();
1353 if (use_netlink_stats) {
1356 error = get_ifindex(netdev_, &ifindex);
1358 error = get_stats_via_netlink(ifindex, stats);
1361 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1365 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1366 netdev_get_name(netdev_), error);
1372 /* Retrieves current device stats for 'netdev-linux'. */
1374 netdev_linux_get_stats(const struct netdev *netdev_,
1375 struct netdev_stats *stats)
1377 struct netdev_dev_linux *netdev_dev =
1378 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1379 struct netdev_stats dev_stats;
1382 get_stats_via_vport(netdev_, stats);
1384 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1387 if (netdev_dev->vport_stats_error) {
1394 if (netdev_dev->vport_stats_error) {
1395 /* stats not available from OVS then use ioctl stats. */
1398 stats->rx_errors += dev_stats.rx_errors;
1399 stats->tx_errors += dev_stats.tx_errors;
1400 stats->rx_dropped += dev_stats.rx_dropped;
1401 stats->tx_dropped += dev_stats.tx_dropped;
1402 stats->multicast += dev_stats.multicast;
1403 stats->collisions += dev_stats.collisions;
1404 stats->rx_length_errors += dev_stats.rx_length_errors;
1405 stats->rx_over_errors += dev_stats.rx_over_errors;
1406 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1407 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1408 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1409 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1410 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1411 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1412 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1413 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1414 stats->tx_window_errors += dev_stats.tx_window_errors;
1419 /* Retrieves current device stats for 'netdev-tap' netdev or
1420 * netdev-internal. */
1422 netdev_tap_get_stats(const struct netdev *netdev_,
1423 struct netdev_stats *stats)
1425 struct netdev_dev_linux *netdev_dev =
1426 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1427 struct netdev_stats dev_stats;
1430 get_stats_via_vport(netdev_, stats);
1432 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1434 if (netdev_dev->vport_stats_error) {
1441 /* If this port is an internal port then the transmit and receive stats
1442 * will appear to be swapped relative to the other ports since we are the
1443 * one sending the data, not a remote computer. For consistency, we swap
1444 * them back here. This does not apply if we are getting stats from the
1445 * vport layer because it always tracks stats from the perspective of the
1447 if (netdev_dev->vport_stats_error) {
1449 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1450 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1451 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1452 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1453 stats->rx_length_errors = 0;
1454 stats->rx_over_errors = 0;
1455 stats->rx_crc_errors = 0;
1456 stats->rx_frame_errors = 0;
1457 stats->rx_fifo_errors = 0;
1458 stats->rx_missed_errors = 0;
1459 stats->tx_aborted_errors = 0;
1460 stats->tx_carrier_errors = 0;
1461 stats->tx_fifo_errors = 0;
1462 stats->tx_heartbeat_errors = 0;
1463 stats->tx_window_errors = 0;
1465 stats->rx_dropped += dev_stats.tx_dropped;
1466 stats->tx_dropped += dev_stats.rx_dropped;
1468 stats->rx_errors += dev_stats.tx_errors;
1469 stats->tx_errors += dev_stats.rx_errors;
1471 stats->multicast += dev_stats.multicast;
1472 stats->collisions += dev_stats.collisions;
1478 netdev_internal_get_stats(const struct netdev *netdev_,
1479 struct netdev_stats *stats)
1481 struct netdev_dev_linux *netdev_dev =
1482 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1484 get_stats_via_vport(netdev_, stats);
1485 return netdev_dev->vport_stats_error;
1489 netdev_linux_read_features(struct netdev_dev_linux *netdev_dev)
1491 struct ethtool_cmd ecmd;
1495 if (netdev_dev->cache_valid & VALID_FEATURES) {
1499 memset(&ecmd, 0, sizeof ecmd);
1500 error = netdev_linux_do_ethtool(netdev_dev->netdev_dev.name, &ecmd,
1501 ETHTOOL_GSET, "ETHTOOL_GSET");
1506 /* Supported features. */
1507 netdev_dev->supported = 0;
1508 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1509 netdev_dev->supported |= NETDEV_F_10MB_HD;
1511 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1512 netdev_dev->supported |= NETDEV_F_10MB_FD;
1514 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1515 netdev_dev->supported |= NETDEV_F_100MB_HD;
1517 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1518 netdev_dev->supported |= NETDEV_F_100MB_FD;
1520 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1521 netdev_dev->supported |= NETDEV_F_1GB_HD;
1523 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1524 netdev_dev->supported |= NETDEV_F_1GB_FD;
1526 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1527 netdev_dev->supported |= NETDEV_F_10GB_FD;
1529 if (ecmd.supported & SUPPORTED_TP) {
1530 netdev_dev->supported |= NETDEV_F_COPPER;
1532 if (ecmd.supported & SUPPORTED_FIBRE) {
1533 netdev_dev->supported |= NETDEV_F_FIBER;
1535 if (ecmd.supported & SUPPORTED_Autoneg) {
1536 netdev_dev->supported |= NETDEV_F_AUTONEG;
1538 if (ecmd.supported & SUPPORTED_Pause) {
1539 netdev_dev->supported |= NETDEV_F_PAUSE;
1541 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1542 netdev_dev->supported |= NETDEV_F_PAUSE_ASYM;
1545 /* Advertised features. */
1546 netdev_dev->advertised = 0;
1547 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1548 netdev_dev->advertised |= NETDEV_F_10MB_HD;
1550 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1551 netdev_dev->advertised |= NETDEV_F_10MB_FD;
1553 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1554 netdev_dev->advertised |= NETDEV_F_100MB_HD;
1556 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1557 netdev_dev->advertised |= NETDEV_F_100MB_FD;
1559 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1560 netdev_dev->advertised |= NETDEV_F_1GB_HD;
1562 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1563 netdev_dev->advertised |= NETDEV_F_1GB_FD;
1565 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1566 netdev_dev->advertised |= NETDEV_F_10GB_FD;
1568 if (ecmd.advertising & ADVERTISED_TP) {
1569 netdev_dev->advertised |= NETDEV_F_COPPER;
1571 if (ecmd.advertising & ADVERTISED_FIBRE) {
1572 netdev_dev->advertised |= NETDEV_F_FIBER;
1574 if (ecmd.advertising & ADVERTISED_Autoneg) {
1575 netdev_dev->advertised |= NETDEV_F_AUTONEG;
1577 if (ecmd.advertising & ADVERTISED_Pause) {
1578 netdev_dev->advertised |= NETDEV_F_PAUSE;
1580 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1581 netdev_dev->advertised |= NETDEV_F_PAUSE_ASYM;
1584 /* Current settings. */
1586 if (speed == SPEED_10) {
1587 netdev_dev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1588 } else if (speed == SPEED_100) {
1589 netdev_dev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1590 } else if (speed == SPEED_1000) {
1591 netdev_dev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1592 } else if (speed == SPEED_10000) {
1593 netdev_dev->current = NETDEV_F_10GB_FD;
1594 } else if (speed == 40000) {
1595 netdev_dev->current = NETDEV_F_40GB_FD;
1596 } else if (speed == 100000) {
1597 netdev_dev->current = NETDEV_F_100GB_FD;
1598 } else if (speed == 1000000) {
1599 netdev_dev->current = NETDEV_F_1TB_FD;
1601 netdev_dev->current = 0;
1604 if (ecmd.port == PORT_TP) {
1605 netdev_dev->current |= NETDEV_F_COPPER;
1606 } else if (ecmd.port == PORT_FIBRE) {
1607 netdev_dev->current |= NETDEV_F_FIBER;
1611 netdev_dev->current |= NETDEV_F_AUTONEG;
1614 /* Peer advertisements. */
1615 netdev_dev->peer = 0; /* XXX */
1618 netdev_dev->cache_valid |= VALID_FEATURES;
1619 netdev_dev->get_features_error = error;
1622 /* Stores the features supported by 'netdev' into each of '*current',
1623 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1624 * bitmap of NETDEV_* bits. Returns 0 if successful, otherwise a positive
1627 netdev_linux_get_features(const struct netdev *netdev_,
1628 enum netdev_features *current,
1629 enum netdev_features *advertised,
1630 enum netdev_features *supported,
1631 enum netdev_features *peer)
1633 struct netdev_dev_linux *netdev_dev =
1634 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1636 netdev_linux_read_features(netdev_dev);
1638 if (!netdev_dev->get_features_error) {
1639 *current = netdev_dev->current;
1640 *advertised = netdev_dev->advertised;
1641 *supported = netdev_dev->supported;
1642 *peer = netdev_dev->peer;
1644 return netdev_dev->get_features_error;
1647 /* Set the features advertised by 'netdev' to 'advertise'. */
1649 netdev_linux_set_advertisements(struct netdev *netdev,
1650 enum netdev_features advertise)
1652 struct ethtool_cmd ecmd;
1655 memset(&ecmd, 0, sizeof ecmd);
1656 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1657 ETHTOOL_GSET, "ETHTOOL_GSET");
1662 ecmd.advertising = 0;
1663 if (advertise & NETDEV_F_10MB_HD) {
1664 ecmd.advertising |= ADVERTISED_10baseT_Half;
1666 if (advertise & NETDEV_F_10MB_FD) {
1667 ecmd.advertising |= ADVERTISED_10baseT_Full;
1669 if (advertise & NETDEV_F_100MB_HD) {
1670 ecmd.advertising |= ADVERTISED_100baseT_Half;
1672 if (advertise & NETDEV_F_100MB_FD) {
1673 ecmd.advertising |= ADVERTISED_100baseT_Full;
1675 if (advertise & NETDEV_F_1GB_HD) {
1676 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1678 if (advertise & NETDEV_F_1GB_FD) {
1679 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1681 if (advertise & NETDEV_F_10GB_FD) {
1682 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1684 if (advertise & NETDEV_F_COPPER) {
1685 ecmd.advertising |= ADVERTISED_TP;
1687 if (advertise & NETDEV_F_FIBER) {
1688 ecmd.advertising |= ADVERTISED_FIBRE;
1690 if (advertise & NETDEV_F_AUTONEG) {
1691 ecmd.advertising |= ADVERTISED_Autoneg;
1693 if (advertise & NETDEV_F_PAUSE) {
1694 ecmd.advertising |= ADVERTISED_Pause;
1696 if (advertise & NETDEV_F_PAUSE_ASYM) {
1697 ecmd.advertising |= ADVERTISED_Asym_Pause;
1699 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1700 ETHTOOL_SSET, "ETHTOOL_SSET");
1703 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1704 * successful, otherwise a positive errno value. */
1706 netdev_linux_set_policing(struct netdev *netdev,
1707 uint32_t kbits_rate, uint32_t kbits_burst)
1709 struct netdev_dev_linux *netdev_dev =
1710 netdev_dev_linux_cast(netdev_get_dev(netdev));
1711 const char *netdev_name = netdev_get_name(netdev);
1715 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1716 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1717 : kbits_burst); /* Stick with user-specified value. */
1719 if (netdev_dev->cache_valid & VALID_POLICING) {
1720 if (netdev_dev->netdev_policing_error) {
1721 return netdev_dev->netdev_policing_error;
1724 if (netdev_dev->kbits_rate == kbits_rate &&
1725 netdev_dev->kbits_burst == kbits_burst) {
1726 /* Assume that settings haven't changed since we last set them. */
1729 netdev_dev->cache_valid &= ~VALID_POLICING;
1732 COVERAGE_INC(netdev_set_policing);
1733 /* Remove any existing ingress qdisc. */
1734 error = tc_add_del_ingress_qdisc(netdev, false);
1736 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1737 netdev_name, strerror(error));
1742 error = tc_add_del_ingress_qdisc(netdev, true);
1744 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1745 netdev_name, strerror(error));
1749 error = tc_add_policer(netdev, kbits_rate, kbits_burst);
1751 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1752 netdev_name, strerror(error));
1757 netdev_dev->kbits_rate = kbits_rate;
1758 netdev_dev->kbits_burst = kbits_burst;
1761 if (!error || error == ENODEV) {
1762 netdev_dev->netdev_policing_error = error;
1763 netdev_dev->cache_valid |= VALID_POLICING;
1769 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1772 const struct tc_ops **opsp;
1774 for (opsp = tcs; *opsp != NULL; opsp++) {
1775 const struct tc_ops *ops = *opsp;
1776 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1777 sset_add(types, ops->ovs_name);
1783 static const struct tc_ops *
1784 tc_lookup_ovs_name(const char *name)
1786 const struct tc_ops **opsp;
1788 for (opsp = tcs; *opsp != NULL; opsp++) {
1789 const struct tc_ops *ops = *opsp;
1790 if (!strcmp(name, ops->ovs_name)) {
1797 static const struct tc_ops *
1798 tc_lookup_linux_name(const char *name)
1800 const struct tc_ops **opsp;
1802 for (opsp = tcs; *opsp != NULL; opsp++) {
1803 const struct tc_ops *ops = *opsp;
1804 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1811 static struct tc_queue *
1812 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1815 struct netdev_dev_linux *netdev_dev =
1816 netdev_dev_linux_cast(netdev_get_dev(netdev));
1817 struct tc_queue *queue;
1819 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1820 if (queue->queue_id == queue_id) {
1827 static struct tc_queue *
1828 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1830 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
1834 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1836 struct netdev_qos_capabilities *caps)
1838 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1842 caps->n_queues = ops->n_queues;
1847 netdev_linux_get_qos(const struct netdev *netdev,
1848 const char **typep, struct shash *details)
1850 struct netdev_dev_linux *netdev_dev =
1851 netdev_dev_linux_cast(netdev_get_dev(netdev));
1854 error = tc_query_qdisc(netdev);
1859 *typep = netdev_dev->tc->ops->ovs_name;
1860 return (netdev_dev->tc->ops->qdisc_get
1861 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
1866 netdev_linux_set_qos(struct netdev *netdev,
1867 const char *type, const struct shash *details)
1869 struct netdev_dev_linux *netdev_dev =
1870 netdev_dev_linux_cast(netdev_get_dev(netdev));
1871 const struct tc_ops *new_ops;
1874 new_ops = tc_lookup_ovs_name(type);
1875 if (!new_ops || !new_ops->tc_install) {
1879 error = tc_query_qdisc(netdev);
1884 if (new_ops == netdev_dev->tc->ops) {
1885 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1887 /* Delete existing qdisc. */
1888 error = tc_del_qdisc(netdev);
1892 assert(netdev_dev->tc == NULL);
1894 /* Install new qdisc. */
1895 error = new_ops->tc_install(netdev, details);
1896 assert((error == 0) == (netdev_dev->tc != NULL));
1903 netdev_linux_get_queue(const struct netdev *netdev,
1904 unsigned int queue_id, struct shash *details)
1906 struct netdev_dev_linux *netdev_dev =
1907 netdev_dev_linux_cast(netdev_get_dev(netdev));
1910 error = tc_query_qdisc(netdev);
1914 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1916 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
1922 netdev_linux_set_queue(struct netdev *netdev,
1923 unsigned int queue_id, const struct shash *details)
1925 struct netdev_dev_linux *netdev_dev =
1926 netdev_dev_linux_cast(netdev_get_dev(netdev));
1929 error = tc_query_qdisc(netdev);
1932 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1933 || !netdev_dev->tc->ops->class_set) {
1937 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
1941 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1943 struct netdev_dev_linux *netdev_dev =
1944 netdev_dev_linux_cast(netdev_get_dev(netdev));
1947 error = tc_query_qdisc(netdev);
1950 } else if (!netdev_dev->tc->ops->class_delete) {
1953 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1955 ? netdev_dev->tc->ops->class_delete(netdev, queue)
1961 netdev_linux_get_queue_stats(const struct netdev *netdev,
1962 unsigned int queue_id,
1963 struct netdev_queue_stats *stats)
1965 struct netdev_dev_linux *netdev_dev =
1966 netdev_dev_linux_cast(netdev_get_dev(netdev));
1969 error = tc_query_qdisc(netdev);
1972 } else if (!netdev_dev->tc->ops->class_get_stats) {
1975 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1977 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
1983 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1985 struct ofpbuf request;
1986 struct tcmsg *tcmsg;
1988 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1992 tcmsg->tcm_parent = 0;
1993 nl_dump_start(dump, rtnl_sock, &request);
1994 ofpbuf_uninit(&request);
1999 netdev_linux_dump_queues(const struct netdev *netdev,
2000 netdev_dump_queues_cb *cb, void *aux)
2002 struct netdev_dev_linux *netdev_dev =
2003 netdev_dev_linux_cast(netdev_get_dev(netdev));
2004 struct tc_queue *queue, *next_queue;
2005 struct shash details;
2009 error = tc_query_qdisc(netdev);
2012 } else if (!netdev_dev->tc->ops->class_get) {
2017 shash_init(&details);
2018 HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node,
2019 &netdev_dev->tc->queues) {
2020 shash_clear(&details);
2022 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
2024 (*cb)(queue->queue_id, &details, aux);
2029 shash_destroy(&details);
2035 netdev_linux_dump_queue_stats(const struct netdev *netdev,
2036 netdev_dump_queue_stats_cb *cb, void *aux)
2038 struct netdev_dev_linux *netdev_dev =
2039 netdev_dev_linux_cast(netdev_get_dev(netdev));
2040 struct nl_dump dump;
2045 error = tc_query_qdisc(netdev);
2048 } else if (!netdev_dev->tc->ops->class_dump_stats) {
2053 if (!start_queue_dump(netdev, &dump)) {
2056 while (nl_dump_next(&dump, &msg)) {
2057 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
2063 error = nl_dump_done(&dump);
2064 return error ? error : last_error;
2068 netdev_linux_get_in4(const struct netdev *netdev_,
2069 struct in_addr *address, struct in_addr *netmask)
2071 struct netdev_dev_linux *netdev_dev =
2072 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2074 if (!(netdev_dev->cache_valid & VALID_IN4)) {
2077 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
2078 SIOCGIFADDR, "SIOCGIFADDR");
2083 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
2084 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2089 netdev_dev->cache_valid |= VALID_IN4;
2091 *address = netdev_dev->address;
2092 *netmask = netdev_dev->netmask;
2093 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
2097 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2098 struct in_addr netmask)
2100 struct netdev_dev_linux *netdev_dev =
2101 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2104 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2106 netdev_dev->cache_valid |= VALID_IN4;
2107 netdev_dev->address = address;
2108 netdev_dev->netmask = netmask;
2109 if (address.s_addr != INADDR_ANY) {
2110 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2111 "SIOCSIFNETMASK", netmask);
2118 parse_if_inet6_line(const char *line,
2119 struct in6_addr *in6, char ifname[16 + 1])
2121 uint8_t *s6 = in6->s6_addr;
2122 #define X8 "%2"SCNx8
2124 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2125 "%*x %*x %*x %*x %16s\n",
2126 &s6[0], &s6[1], &s6[2], &s6[3],
2127 &s6[4], &s6[5], &s6[6], &s6[7],
2128 &s6[8], &s6[9], &s6[10], &s6[11],
2129 &s6[12], &s6[13], &s6[14], &s6[15],
2133 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2134 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2136 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2138 struct netdev_dev_linux *netdev_dev =
2139 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2140 if (!(netdev_dev->cache_valid & VALID_IN6)) {
2144 netdev_dev->in6 = in6addr_any;
2146 file = fopen("/proc/net/if_inet6", "r");
2148 const char *name = netdev_get_name(netdev_);
2149 while (fgets(line, sizeof line, file)) {
2150 struct in6_addr in6_tmp;
2151 char ifname[16 + 1];
2152 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2153 && !strcmp(name, ifname))
2155 netdev_dev->in6 = in6_tmp;
2161 netdev_dev->cache_valid |= VALID_IN6;
2163 *in6 = netdev_dev->in6;
2168 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2170 struct sockaddr_in sin;
2171 memset(&sin, 0, sizeof sin);
2172 sin.sin_family = AF_INET;
2173 sin.sin_addr = addr;
2176 memset(sa, 0, sizeof *sa);
2177 memcpy(sa, &sin, sizeof sin);
2181 do_set_addr(struct netdev *netdev,
2182 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2185 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2186 make_in4_sockaddr(&ifr.ifr_addr, addr);
2188 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2192 /* Adds 'router' as a default IP gateway. */
2194 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2196 struct in_addr any = { INADDR_ANY };
2200 memset(&rt, 0, sizeof rt);
2201 make_in4_sockaddr(&rt.rt_dst, any);
2202 make_in4_sockaddr(&rt.rt_gateway, router);
2203 make_in4_sockaddr(&rt.rt_genmask, any);
2204 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2205 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2207 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
2213 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2216 static const char fn[] = "/proc/net/route";
2221 *netdev_name = NULL;
2222 stream = fopen(fn, "r");
2223 if (stream == NULL) {
2224 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2229 while (fgets(line, sizeof line, stream)) {
2232 ovs_be32 dest, gateway, mask;
2233 int refcnt, metric, mtu;
2234 unsigned int flags, use, window, irtt;
2237 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2239 iface, &dest, &gateway, &flags, &refcnt,
2240 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2242 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2246 if (!(flags & RTF_UP)) {
2247 /* Skip routes that aren't up. */
2251 /* The output of 'dest', 'mask', and 'gateway' were given in
1252 * network byte order, so we don't need any endian
2253 * conversions here. */
2254 if ((dest & mask) == (host->s_addr & mask)) {
2256 /* The host is directly reachable. */
2257 next_hop->s_addr = 0;
2259 /* To reach the host, we must go through a gateway. */
2260 next_hop->s_addr = gateway;
2262 *netdev_name = xstrdup(iface);
2274 netdev_linux_get_drv_info(const struct netdev *netdev, struct shash *sh)
2277 struct netdev_dev_linux *netdev_dev =
2278 netdev_dev_linux_cast(netdev_get_dev(netdev));
2280 error = netdev_linux_get_drvinfo(netdev_dev);
2282 shash_add(sh, "driver_name", xstrdup(netdev_dev->drvinfo.driver));
2283 shash_add(sh, "driver_version", xstrdup(netdev_dev->drvinfo.version));
2284 shash_add(sh, "firmware_version", xstrdup(netdev_dev->drvinfo.fw_version));
2290 netdev_internal_get_drv_info(const struct netdev *netdev OVS_UNUSED, struct shash *sh)
2292 shash_add(sh, "driver_name", xstrdup("openvswitch"));
2296 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2297 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2298 * returns 0. Otherwise, it returns a positive errno value; in particular,
1299 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
2301 netdev_linux_arp_lookup(const struct netdev *netdev,
2302 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2305 struct sockaddr_in sin;
2308 memset(&r, 0, sizeof r);
2309 memset(&sin, 0, sizeof sin);
2310 sin.sin_family = AF_INET;
2311 sin.sin_addr.s_addr = ip;
2313 memcpy(&r.arp_pa, &sin, sizeof sin);
2314 r.arp_ha.sa_family = ARPHRD_ETHER;
2316 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2317 COVERAGE_INC(netdev_arp_lookup);
2318 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2320 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2321 } else if (retval != ENXIO) {
2322 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2323 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
2329 nd_to_iff_flags(enum netdev_flags nd)
2332 if (nd & NETDEV_UP) {
2335 if (nd & NETDEV_PROMISC) {
2342 iff_to_nd_flags(int iff)
2344 enum netdev_flags nd = 0;
2348 if (iff & IFF_PROMISC) {
2349 nd |= NETDEV_PROMISC;
2355 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2356 enum netdev_flags on, enum netdev_flags *old_flagsp)
2358 struct netdev_dev_linux *netdev_dev;
2359 int old_flags, new_flags;
2362 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2363 old_flags = netdev_dev->ifi_flags;
2364 *old_flagsp = iff_to_nd_flags(old_flags);
2365 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2366 if (new_flags != old_flags) {
2367 error = set_flags(netdev, new_flags);
2368 get_flags(&netdev_dev->netdev_dev, &netdev_dev->ifi_flags);
2374 netdev_linux_change_seq(const struct netdev *netdev)
2376 return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
2379 #define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS, \
2380 GET_FEATURES, GET_STATUS) \
2384 netdev_linux_init, \
2386 netdev_linux_wait, \
2389 netdev_linux_destroy, \
2390 NULL, /* get_config */ \
2391 NULL, /* set_config */ \
2393 netdev_linux_open, \
2394 netdev_linux_close, \
2396 netdev_linux_listen, \
2397 netdev_linux_recv, \
2398 netdev_linux_recv_wait, \
2399 netdev_linux_drain, \
2401 netdev_linux_send, \
2402 netdev_linux_send_wait, \
2404 netdev_linux_set_etheraddr, \
2405 netdev_linux_get_etheraddr, \
2406 netdev_linux_get_mtu, \
2407 netdev_linux_set_mtu, \
2408 netdev_linux_get_ifindex, \
2409 netdev_linux_get_carrier, \
2410 netdev_linux_get_carrier_resets, \
2411 netdev_linux_set_miimon_interval, \
2416 netdev_linux_set_advertisements, \
2418 netdev_linux_set_policing, \
2419 netdev_linux_get_qos_types, \
2420 netdev_linux_get_qos_capabilities, \
2421 netdev_linux_get_qos, \
2422 netdev_linux_set_qos, \
2423 netdev_linux_get_queue, \
2424 netdev_linux_set_queue, \
2425 netdev_linux_delete_queue, \
2426 netdev_linux_get_queue_stats, \
2427 netdev_linux_dump_queues, \
2428 netdev_linux_dump_queue_stats, \
2430 netdev_linux_get_in4, \
2431 netdev_linux_set_in4, \
2432 netdev_linux_get_in6, \
2433 netdev_linux_add_router, \
2434 netdev_linux_get_next_hop, \
2436 netdev_linux_arp_lookup, \
2438 netdev_linux_update_flags, \
2440 netdev_linux_change_seq \
2443 const struct netdev_class netdev_linux_class =
2446 netdev_linux_create,
2447 netdev_linux_get_stats,
2448 NULL, /* set_stats */
2449 netdev_linux_get_features,
2450 netdev_linux_get_drv_info);
2452 const struct netdev_class netdev_tap_class =
2455 netdev_linux_create_tap,
2456 netdev_tap_get_stats,
2457 NULL, /* set_stats */
2458 netdev_linux_get_features,
2459 netdev_linux_get_drv_info);
2461 const struct netdev_class netdev_internal_class =
2464 netdev_linux_create,
2465 netdev_internal_get_stats,
2466 netdev_vport_set_stats,
2467 NULL, /* get_features */
2468 netdev_internal_get_drv_info);
2470 /* HTB traffic control class. */
2472 #define HTB_N_QUEUES 0xf000
2476 unsigned int max_rate; /* In bytes/s. */
2480 struct tc_queue tc_queue;
2481 unsigned int min_rate; /* In bytes/s. */
2482 unsigned int max_rate; /* In bytes/s. */
2483 unsigned int burst; /* In bytes. */
2484 unsigned int priority; /* Lower values are higher priorities. */
2488 htb_get__(const struct netdev *netdev)
2490 struct netdev_dev_linux *netdev_dev =
2491 netdev_dev_linux_cast(netdev_get_dev(netdev));
2492 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
2496 htb_install__(struct netdev *netdev, uint64_t max_rate)
2498 struct netdev_dev_linux *netdev_dev =
2499 netdev_dev_linux_cast(netdev_get_dev(netdev));
2502 htb = xmalloc(sizeof *htb);
2503 tc_init(&htb->tc, &tc_ops_htb);
2504 htb->max_rate = max_rate;
2506 netdev_dev->tc = &htb->tc;
2509 /* Create an HTB qdisc.
2511 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
/* Any existing qdisc is deleted first with tc_del_qdisc().  The request is an
 * RTM_NEWQDISC with NLM_F_EXCL | NLM_F_CREATE for handle 1:0 under
 * TC_H_ROOT, of kind "htb", carrying a zeroed tc_htb_glob with rate2quantum
 * set to 10 as a nested TCA_HTB_INIT option.  Returns what tc_transact()
 * returns. */
2513 htb_setup_qdisc__(struct netdev *netdev)
2516 struct tc_htb_glob opt;
2517 struct ofpbuf request;
2518 struct tcmsg *tcmsg;
2520 tc_del_qdisc(netdev);
2522 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2523 NLM_F_EXCL | NLM_F_CREATE, &request);
2527 tcmsg->tcm_handle = tc_make_handle(1, 0);
2528 tcmsg->tcm_parent = TC_H_ROOT;
2530 nl_msg_put_string(&request, TCA_KIND, "htb");
2532 memset(&opt, 0, sizeof opt);
2533 opt.rate2quantum = 10;
2537 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2538 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2539 nl_msg_end_nested(&request, opt_offset);
2541 return tc_transact(&request, NULL);
2544 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2545 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
/* Looks up 'netdev''s MTU (logging a warning if that fails) and fills a
 * tc_htb_opt from 'class': rate from min_rate, ceil from max_rate, buffer and
 * cbuffer from tc_calc_buffer() using class->burst, prio from
 * class->priority.  The options, plus rate tables for rate and ceil, are sent
 * in an RTM_NEWTCLASS request (NLM_F_CREATE) for 'handle' under 'parent'.  A
 * failing transaction is logged together with the class parameters. */
2547 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2548 unsigned int parent, struct htb_class *class)
2551 struct tc_htb_opt opt;
2552 struct ofpbuf request;
2553 struct tcmsg *tcmsg;
2557 error = netdev_get_mtu(netdev, &mtu);
2559 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2560 netdev_get_name(netdev));
2564 memset(&opt, 0, sizeof opt);
2565 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2566 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2567 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2568 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2569 opt.prio = class->priority;
2571 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2575 tcmsg->tcm_handle = handle;
2576 tcmsg->tcm_parent = parent;
2578 nl_msg_put_string(&request, TCA_KIND, "htb");
2579 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2580 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2581 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2582 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2583 nl_msg_end_nested(&request, opt_offset);
2585 error = tc_transact(&request, NULL);
2587 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2588 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2589 netdev_get_name(netdev),
2590 tc_get_major(handle), tc_get_minor(handle),
2591 tc_get_major(parent), tc_get_minor(parent),
2592 class->min_rate, class->max_rate,
2593 class->burst, class->priority, strerror(error));
2598 /* Parses Netlink attributes in 'nl_options' for HTB parameters and stores a
2599 * description of them into 'class'. The description complies with the
2600 * specification given in the vswitch database documentation for linux-htb. */
/* Reads the TCA_HTB_PARMS attribute nested in 'nl_options' (required by the
 * policy, at least sizeof(struct tc_htb_opt) bytes) and copies it into
 * 'class': min_rate from rate.rate, max_rate from ceil.rate, burst from
 * tc_ticks_to_bytes(rate.rate, buffer) and priority from prio.  A warning is
 * logged if the nested attributes do not parse. */
2603 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2605 static const struct nl_policy tca_htb_policy[] = {
2606 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2607 .min_len = sizeof(struct tc_htb_opt) },
2610 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2611 const struct tc_htb_opt *htb;
2613 if (!nl_parse_nested(nl_options, tca_htb_policy,
2614 attrs, ARRAY_SIZE(tca_htb_policy))) {
2615 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2619 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2620 class->min_rate = htb->rate.rate;
2621 class->max_rate = htb->ceil.rate;
2622 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2623 class->priority = htb->prio;
/* Parses class message 'tcmsg' with tc_parse_class(), passing 'stats'
 * through.  If that succeeds and 'queue_id' is nonnull, a handle with major 1
 * and minor in the range 1...HTB_N_QUEUES yields '*queue_id' = minor - 1
 * (what happens for other handles is on lines not visible in this extract).
 * If 'options' is nonnull, the class's TCA_OPTIONS are decoded into it with
 * htb_parse_tca_options__(). */
2628 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2629 struct htb_class *options,
2630 struct netdev_queue_stats *stats)
2632 struct nlattr *nl_options;
2633 unsigned int handle;
2636 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2637 if (!error && queue_id) {
2638 unsigned int major = tc_get_major(handle);
2639 unsigned int minor = tc_get_minor(handle);
2640 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2641 *queue_id = minor - 1;
2646 if (!error && options) {
2647 error = htb_parse_tca_options__(nl_options, options);
/* Fills 'hc' with the qdisc-wide HTB rates described by 'details'.
 *
 * "max-rate" is read from 'details' as a decimal number and divided by 8.  If
 * it is absent or works out to zero, the rate is instead derived from
 * 'netdev''s current features with netdev_features_to_bps(), again divided by
 * 8.  min_rate is then set equal to max_rate. */
2653 htb_parse_qdisc_details__(struct netdev *netdev,
2654 const struct shash *details, struct htb_class *hc)
2656 const char *max_rate_s;
2658 max_rate_s = shash_find_data(details, "max-rate");
2659 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2660 if (!hc->max_rate) {
2661 enum netdev_features current;
/* 'current' is an output argument, so its address is passed. */
2663 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2664 hc->max_rate = netdev_features_to_bps(current) / 8;
2666 hc->min_rate = hc->max_rate;
/* Fills 'hc' from the per-queue settings in 'details' ("min-rate",
 * "max-rate", "burst", "priority"), each parsed as a decimal number.  The two
 * rates and the burst are divided by 8 after parsing.
 *
 * Resulting constraints, as applied on the visible lines:
 *   - min_rate is at least the device MTU and at most the qdisc's max_rate;
 *   - max_rate is at least min_rate and at most the qdisc's max_rate;
 *   - burst is at least mtu + 64;
 *   - priority is 0 when not given.
 *
 * A warning is logged when the MTU lookup fails.  NOTE(review): the value
 * used for max_rate when "max-rate" is absent is on a line that is not
 * visible in this extract. */
2672 htb_parse_class_details__(struct netdev *netdev,
2673 const struct shash *details, struct htb_class *hc)
2675 const struct htb *htb = htb_get__(netdev);
2676 const char *min_rate_s = shash_find_data(details, "min-rate");
2677 const char *max_rate_s = shash_find_data(details, "max-rate");
2678 const char *burst_s = shash_find_data(details, "burst");
2679 const char *priority_s = shash_find_data(details, "priority");
2682 error = netdev_get_mtu(netdev, &mtu);
2684 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2685 netdev_get_name(netdev));
2689 /* HTB requires at least an mtu sized min-rate to send any traffic even
2690 * on uncongested links. */
2691 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2692 hc->min_rate = MAX(hc->min_rate, mtu);
2693 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2696 hc->max_rate = (max_rate_s
2697 ? strtoull(max_rate_s, NULL, 10) / 8
2699 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2700 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2704 * According to hints in the documentation that I've read, it is important
2705 * that 'burst' be at least as big as the largest frame that might be
2706 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2707 * but having it a bit too small is a problem. Since netdev_get_mtu()
2708 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2709 * the MTU. We actually add 64, instead of 14, as a guard against
2710 * additional headers get tacked on somewhere that we're not aware of. */
2711 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2712 hc->burst = MAX(hc->burst, mtu + 64);
2715 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Asks the kernel about class 'handle' under 'parent' with tc_query_class()
 * and decodes the reply with htb_parse_tcmsg__() into 'options' and 'stats'.
 * No queue ID is requested from the parser.  The reply buffer is deleted
 * after it has been parsed. */
2721 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2722 unsigned int parent, struct htb_class *options,
2723 struct netdev_queue_stats *stats)
2725 struct ofpbuf *reply;
2728 error = tc_query_class(netdev, handle, parent, &reply);
2730 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2731 ofpbuf_delete(reply);
/* Installs HTB on 'netdev': sets up the root qdisc, creates class 1:fffe
 * under 1:0 with the rates parsed from 'details', and records the result
 * with htb_install__() using the parsed max_rate.
 *
 * NOTE(review): the error checks that gate these steps are on lines not
 * visible in this extract. */
2737 htb_tc_install(struct netdev *netdev, const struct shash *details)
2741 error = htb_setup_qdisc__(netdev);
2743 struct htb_class hc;
2745 htb_parse_qdisc_details__(netdev, details, &hc);
2746 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2747 tc_make_handle(1, 0), &hc);
2749 htb_install__(netdev, hc.max_rate);
/* Converts 'queue' to the struct htb_class that embeds it as its 'tc_queue'
 * member.  This is a plain CONTAINER_OF with no type check. */
2755 static struct htb_class *
2756 htb_class_cast__(const struct tc_queue *queue)
2758 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Records the parameters in 'hc' as the cached state of queue 'queue_id' on
 * 'netdev'.  The queue is looked up with tc_find_queue__() under
 * hash_int(queue_id, 0).  There is also a path that allocates a new
 * htb_class and inserts it into htb->tc.queues; presumably it runs when the
 * lookup finds nothing, but the branch condition is not visible in this
 * extract.  min_rate, max_rate, burst and priority are then copied. */
2762 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2763 const struct htb_class *hc)
2765 struct htb *htb = htb_get__(netdev);
2766 size_t hash = hash_int(queue_id, 0);
2767 struct tc_queue *queue;
2768 struct htb_class *hcp;
2770 queue = tc_find_queue__(netdev, queue_id, hash);
2772 hcp = htb_class_cast__(queue);
2774 hcp = xmalloc(sizeof *hcp);
2775 queue = &hcp->tc_queue;
2776 queue->queue_id = queue_id;
2777 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2780 hcp->min_rate = hc->min_rate;
2781 hcp->max_rate = hc->max_rate;
2782 hcp->burst = hc->burst;
2783 hcp->priority = hc->priority;
/* Rebuilds the cached HTB state of 'netdev' from the kernel.  Class 1:fffe is
 * queried for the qdisc-wide max_rate and a struct htb is installed with it;
 * then every class in the queue dump that htb_parse_tcmsg__() accepts is
 * recorded with htb_update_queue__().  'nlmsg' is unused.
 *
 * NOTE(review): the result of htb_query_class__() is not assigned on the
 * visible line, so hc.max_rate must be initialized beforehand for the failure
 * case -- confirm on the lines missing from this extract. */
2787 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2790 struct nl_dump dump;
2791 struct htb_class hc;
2793 /* Get qdisc options. */
2795 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2796 htb_install__(netdev, hc.max_rate);
2799 if (!start_queue_dump(netdev, &dump)) {
2802 while (nl_dump_next(&dump, &msg)) {
2803 unsigned int queue_id;
2805 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2806 htb_update_queue__(netdev, queue_id, &hc);
2809 nl_dump_done(&dump);
/* Tears down the struct htb that embeds 'tc'.  The visible lines remove every
 * htb_class from htb->tc.queues using the _SAFE iterator, which permits
 * removal during iteration.  NOTE(review): any freeing of the classes and of
 * the htb itself is on lines not visible in this extract. */
2815 htb_tc_destroy(struct tc *tc)
2817 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2818 struct htb_class *hc, *next;
2820 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2821 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Reports the qdisc-wide configuration of 'netdev' by adding "max-rate" to
 * 'details'.  The value is the cached htb->max_rate multiplied by 8,
 * formatted as a decimal string allocated by xasprintf(). */
2829 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2831 const struct htb *htb = htb_get__(netdev);
2832 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
/* Reconfigures the qdisc-wide rate of 'netdev' from 'details': parses the
 * details, re-issues class 1:fffe under 1:0 with them, and stores the new
 * max_rate in the cached struct htb.  NOTE(review): the check that guards the
 * cache update is on a line not visible in this extract. */
2837 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2839 struct htb_class hc;
2842 htb_parse_qdisc_details__(netdev, details, &hc);
2843 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2844 tc_make_handle(1, 0), &hc);
2846 htb_get__(netdev)->max_rate = hc.max_rate;
/* Reports the cached settings of 'queue' into 'details'.  "min-rate" and
 * "burst" are the stored values multiplied by 8; "max-rate" is reported the
 * same way but only when it differs from min-rate.  "priority" is reported
 * as stored.  NOTE(review): "burst" and "priority" may sit inside conditions
 * whose lines are not visible in this extract.  'netdev' is unused. */
2852 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2853 const struct tc_queue *queue, struct shash *details)
2855 const struct htb_class *hc = htb_class_cast__(queue);
2857 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2858 if (hc->min_rate != hc->max_rate) {
2859 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2861 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2863 shash_add(details, "priority", xasprintf("%u", hc->priority));
/* Creates or reconfigures queue 'queue_id' on 'netdev' from 'details'.  The
 * details are parsed with htb_parse_class_details__(), the kernel class
 * 1:<queue_id + 1> is set up under parent 1:fffe, and the cached queue state
 * is updated with htb_update_queue__(). */
2869 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2870 const struct shash *details)
2872 struct htb_class hc;
2875 error = htb_parse_class_details__(netdev, details, &hc);
2880 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2881 tc_make_handle(1, 0xfffe), &hc);
2886 htb_update_queue__(netdev, queue_id, &hc);
/* Deletes 'queue' from 'netdev': asks the kernel to delete class
 * 1:<queue_id + 1> and removes the queue from htb->tc.queues.
 * NOTE(review): the condition on 'error' and any free of the htb_class are on
 * lines not visible in this extract. */
2891 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2893 struct htb_class *hc = htb_class_cast__(queue);
2894 struct htb *htb = htb_get__(netdev);
2897 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2899 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches statistics for 'queue' into 'stats' by querying kernel class
 * 1:<queue_id + 1> under parent 1:fffe.  Class options are not requested.
 * Returns what htb_query_class__() returns. */
2906 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2907 struct netdev_queue_stats *stats)
2909 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2910 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one class message 'nlmsg' from a queue dump.  The handle and
 * statistics are extracted with tc_parse_class(); if the handle is 1:<minor>
 * with 0 < minor <= HTB_N_QUEUES, 'cb' is invoked with queue ID minor - 1,
 * the statistics and 'aux'.  'netdev' is unused. */
2914 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2915 const struct ofpbuf *nlmsg,
2916 netdev_dump_queue_stats_cb *cb, void *aux)
2918 struct netdev_queue_stats stats;
2919 unsigned int handle, major, minor;
2922 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2927 major = tc_get_major(handle);
2928 minor = tc_get_minor(handle);
2929 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2930 (*cb)(minor - 1, &stats, aux);
/* Operations table for HTB: Linux qdisc name "htb", OVS QoS type name
 * "linux-htb", HTB_N_QUEUES queues.  NOTE(review): the entries between
 * n_queues and htb_class_get_stats are not visible in this extract. */
2935 static const struct tc_ops tc_ops_htb = {
2936 "htb", /* linux_name */
2937 "linux-htb", /* ovs_name */
2938 HTB_N_QUEUES, /* n_queues */
2947 htb_class_get_stats,
2948 htb_class_dump_stats
2951 /* "linux-hfsc" traffic control class. */
/* Upper bound on the class minor numbers that this file maps to HFSC queues:
 * a class 1:<minor> with 0 < minor <= HFSC_N_QUEUES is reported as queue
 * 'minor - 1'.  Also used as the n_queues member of tc_ops_hfsc. */
2953 #define HFSC_N_QUEUES 0xf000
/* NOTE(review): the enclosing struct declarations are not visible in this
 * extract.  This member is presumably part of struct hfsc_class, which
 * hfsc_class_cast__() recovers from a 'tc_queue' pointer -- confirm. */
2961 struct tc_queue tc_queue;
/* Returns the struct hfsc that contains the 'tc' object currently attached to
 * 'netdev''s device.  The conversion is a plain CONTAINER_OF on
 * netdev_dev->tc with no check that the tc is embedded in a struct hfsc. */
2966 static struct hfsc *
2967 hfsc_get__(const struct netdev *netdev)
2969 struct netdev_dev_linux *netdev_dev;
2970 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2971 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
/* Converts 'queue' to the struct hfsc_class that embeds it as its 'tc_queue'
 * member.  This is a plain CONTAINER_OF with no type check. */
2974 static struct hfsc_class *
2975 hfsc_class_cast__(const struct tc_queue *queue)
2977 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates a new struct hfsc, initializes its embedded tc with tc_ops_hfsc,
 * records 'max_rate' in it and makes it the tc of 'netdev''s device. */
2981 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2983 struct netdev_dev_linux * netdev_dev;
2986 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2987 hfsc = xmalloc(sizeof *hfsc);
2988 tc_init(&hfsc->tc, &tc_ops_hfsc);
2989 hfsc->max_rate = max_rate;
2990 netdev_dev->tc = &hfsc->tc;
/* Records the rates in 'hc' as the cached state of queue 'queue_id' on
 * 'netdev'.  The queue is looked up with tc_find_queue__() under
 * hash_int(queue_id, 0).  There is also a path that allocates a new
 * hfsc_class and inserts it into hfsc->tc.queues; presumably it runs when
 * the lookup finds nothing, but the branch condition is not visible in this
 * extract.  min_rate and max_rate are then copied. */
2994 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2995 const struct hfsc_class *hc)
2999 struct hfsc_class *hcp;
3000 struct tc_queue *queue;
3002 hfsc = hfsc_get__(netdev);
3003 hash = hash_int(queue_id, 0);
3005 queue = tc_find_queue__(netdev, queue_id, hash);
3007 hcp = hfsc_class_cast__(queue);
3009 hcp = xmalloc(sizeof *hcp);
3010 queue = &hcp->tc_queue;
3011 queue->queue_id = queue_id;
3012 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3015 hcp->min_rate = hc->min_rate;
3016 hcp->max_rate = hc->max_rate;
/* Decodes the HFSC service curves nested in 'nl_options' into 'class'.
 *
 * The TCA_HFSC_RSC, TCA_HFSC_FSC and TCA_HFSC_USC attributes are read as
 * struct tc_service_curve.  Only the configuration this file itself creates
 * is accepted; a warning is logged when:
 *   - any curve has a nonzero m1 or d (non-linear curve);
 *   - the RSC and FSC slopes (m2) differ;
 *   - the RSC slope exceeds the USC slope.
 *
 * Otherwise min_rate is taken from fsc->m2 and max_rate from usc->m2. */
3020 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3022 const struct tc_service_curve *rsc, *fsc, *usc;
3023 static const struct nl_policy tca_hfsc_policy[] = {
3025 .type = NL_A_UNSPEC,
3027 .min_len = sizeof(struct tc_service_curve),
3030 .type = NL_A_UNSPEC,
3032 .min_len = sizeof(struct tc_service_curve),
3035 .type = NL_A_UNSPEC,
3037 .min_len = sizeof(struct tc_service_curve),
3040 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3042 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3043 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3044 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3048 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3049 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3050 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3052 if (rsc->m1 != 0 || rsc->d != 0 ||
3053 fsc->m1 != 0 || fsc->d != 0 ||
3054 usc->m1 != 0 || usc->d != 0) {
3055 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3056 "Non-linear service curves are not supported.");
3060 if (rsc->m2 != fsc->m2) {
3061 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3062 "Real-time service curves are not supported ");
3066 if (rsc->m2 > usc->m2) {
3067 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3068 "Min-rate service curve is greater than "
3069 "the max-rate service curve.");
3073 class->min_rate = fsc->m2;
3074 class->max_rate = usc->m2;
/* Parses class message 'tcmsg' with tc_parse_class(), passing 'stats'
 * through.  A handle 1:<minor> with 0 < minor <= HFSC_N_QUEUES yields
 * '*queue_id' = minor - 1, and the class's TCA_OPTIONS are decoded into
 * 'options' with hfsc_parse_tca_options__().  NOTE(review): the conditions
 * guarding those two steps are on lines not visible in this extract; the
 * caller in this file that passes a null 'queue_id' suggests null checks. */
3079 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3080 struct hfsc_class *options,
3081 struct netdev_queue_stats *stats)
3084 unsigned int handle;
3085 struct nlattr *nl_options;
3087 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3093 unsigned int major, minor;
3095 major = tc_get_major(handle);
3096 minor = tc_get_minor(handle);
3097 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3098 *queue_id = minor - 1;
3105 error = hfsc_parse_tca_options__(nl_options, options);
/* Asks the kernel about class 'handle' under 'parent' with tc_query_class()
 * and decodes the reply with hfsc_parse_tcmsg__() into 'options' and
 * 'stats'.  No queue ID is requested from the parser.  The reply buffer is
 * deleted after it has been parsed. */
3112 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3113 unsigned int parent, struct hfsc_class *options,
3114 struct netdev_queue_stats *stats)
3117 struct ofpbuf *reply;
3119 error = tc_query_class(netdev, handle, parent, &reply);
3124 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3125 ofpbuf_delete(reply);
/* Fills 'class' with the qdisc-wide HFSC rates described by 'details'.
 *
 * "max-rate" is read from 'details' as a decimal number and divided by 8.
 * There is also a fallback that derives the rate from 'netdev''s current
 * features with netdev_features_to_bps(), again divided by 8 (the condition
 * selecting it is on a line not visible in this extract).  Both min_rate and
 * max_rate of 'class' are set to the resulting value. */
3130 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
3131 struct hfsc_class *class)
3134 const char *max_rate_s;
3136 max_rate_s = shash_find_data(details, "max-rate");
3137 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3140 enum netdev_features current;
/* 'current' is an output argument, so its address is passed. */
3142 netdev_get_features(netdev, &current, NULL, NULL, NULL);
3143 max_rate = netdev_features_to_bps(current) / 8;
3146 class->min_rate = max_rate;
3147 class->max_rate = max_rate;
/* Fills 'class' from the per-queue settings in 'details' ("min-rate" and
 * "max-rate"), each parsed as a decimal number and divided by 8.
 *
 * min_rate is forced into the range 1...qdisc max_rate.  max_rate is forced
 * to be at least min_rate and at most the qdisc max_rate.  NOTE(review): the
 * value used for max_rate when "max-rate" is absent is on a line not visible
 * in this extract. */
3151 hfsc_parse_class_details__(struct netdev *netdev,
3152 const struct shash *details,
3153 struct hfsc_class * class)
3155 const struct hfsc *hfsc;
3156 uint32_t min_rate, max_rate;
3157 const char *min_rate_s, *max_rate_s;
3159 hfsc = hfsc_get__(netdev);
3160 min_rate_s = shash_find_data(details, "min-rate");
3161 max_rate_s = shash_find_data(details, "max-rate");
3163 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3164 min_rate = MAX(min_rate, 1);
3165 min_rate = MIN(min_rate, hfsc->max_rate);
3167 max_rate = (max_rate_s
3168 ? strtoull(max_rate_s, NULL, 10) / 8
3170 max_rate = MAX(max_rate, min_rate);
3171 max_rate = MIN(max_rate, hfsc->max_rate);
3173 class->min_rate = min_rate;
3174 class->max_rate = max_rate;
3179 /* Create an HFSC qdisc.
3181 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
/* Any existing qdisc is deleted first with tc_del_qdisc().  The request is an
 * RTM_NEWQDISC with NLM_F_EXCL | NLM_F_CREATE for handle 1:0 under
 * TC_H_ROOT, of kind "hfsc".  Unlike the HTB setup, the tc_hfsc_qopt is put
 * directly as the TCA_OPTIONS payload rather than as a nested attribute.
 * Returns what tc_transact() returns. */
3183 hfsc_setup_qdisc__(struct netdev * netdev)
3185 struct tcmsg *tcmsg;
3186 struct ofpbuf request;
3187 struct tc_hfsc_qopt opt;
3189 tc_del_qdisc(netdev);
3191 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3192 NLM_F_EXCL | NLM_F_CREATE, &request);
3198 tcmsg->tcm_handle = tc_make_handle(1, 0);
3199 tcmsg->tcm_parent = TC_H_ROOT;
3201 memset(&opt, 0, sizeof opt);
3204 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3205 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3207 return tc_transact(&request, NULL);
3210 /* Create an HFSC class.
3212 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3213 * sc rate <min_rate> ul rate <max_rate>" */
/* Sends an RTM_NEWTCLASS request (NLM_F_CREATE) for 'handle' under 'parent'.
 * The same curve 'min', whose slope m2 is class->min_rate, is sent as both
 * TCA_HFSC_RSC and TCA_HFSC_FSC; 'max', whose m2 is class->max_rate, is sent
 * as TCA_HFSC_USC.  A failing transaction is logged with both rates. */
3215 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3216 unsigned int parent, struct hfsc_class *class)
3220 struct tcmsg *tcmsg;
3221 struct ofpbuf request;
3222 struct tc_service_curve min, max;
3224 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3230 tcmsg->tcm_handle = handle;
3231 tcmsg->tcm_parent = parent;
3235 min.m2 = class->min_rate;
3239 max.m2 = class->max_rate;
3241 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3242 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3243 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3244 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3245 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3246 nl_msg_end_nested(&request, opt_offset);
3248 error = tc_transact(&request, NULL);
3250 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3251 "min-rate %ubps, max-rate %ubps (%s)",
3252 netdev_get_name(netdev),
3253 tc_get_major(handle), tc_get_minor(handle),
3254 tc_get_major(parent), tc_get_minor(parent),
3255 class->min_rate, class->max_rate, strerror(error));
/* Installs HFSC on 'netdev': sets up the root qdisc, creates class 1:fffe
 * under 1:0 with the rates parsed from 'details', and records the result
 * with hfsc_install__() using the parsed max_rate.
 *
 * NOTE(review): the error checks between these steps are on lines not
 * visible in this extract. */
3262 hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3265 struct hfsc_class class;
3267 error = hfsc_setup_qdisc__(netdev);
3273 hfsc_parse_qdisc_details__(netdev, details, &class);
3274 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3275 tc_make_handle(1, 0), &class);
3281 hfsc_install__(netdev, class.max_rate);
/* Rebuilds the cached HFSC state of 'netdev' from the kernel.  Class 1:fffe
 * is queried for the qdisc-wide max_rate and a struct hfsc is installed with
 * it; then every class in the queue dump that hfsc_parse_tcmsg__() accepts
 * is recorded with hfsc_update_queue__().  'nlmsg' is unused.
 *
 * NOTE(review): the result of hfsc_query_class__() is not assigned on the
 * visible line, so hc.max_rate must be initialized beforehand for the failure
 * case -- confirm on the lines missing from this extract. */
3286 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3289 struct nl_dump dump;
3290 struct hfsc_class hc;
3293 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3294 hfsc_install__(netdev, hc.max_rate);
3296 if (!start_queue_dump(netdev, &dump)) {
3300 while (nl_dump_next(&dump, &msg)) {
3301 unsigned int queue_id;
3303 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3304 hfsc_update_queue__(netdev, queue_id, &hc);
3308 nl_dump_done(&dump);
/* Tears down the struct hfsc that embeds 'tc'.  The visible lines remove
 * every hfsc_class from hfsc->tc.queues using the _SAFE iterator, which
 * permits removal during iteration.  NOTE(review): any freeing of the classes
 * and of the hfsc itself is on lines not visible in this extract. */
3313 hfsc_tc_destroy(struct tc *tc)
3316 struct hfsc_class *hc, *next;
3318 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3320 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3321 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Reports the qdisc-wide configuration of 'netdev' by adding "max-rate" to
 * 'details'.  The value is the cached hfsc->max_rate multiplied by 8,
 * formatted as a decimal string allocated by xasprintf(). */
3330 hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3332 const struct hfsc *hfsc;
3333 hfsc = hfsc_get__(netdev);
3334 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
/* Reconfigures the qdisc-wide rate of 'netdev' from 'details': parses the
 * details, re-issues class 1:fffe under 1:0 with them, and stores the new
 * max_rate in the cached struct hfsc.  NOTE(review): the check that guards
 * the cache update is on a line not visible in this extract. */
3339 hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3342 struct hfsc_class class;
3344 hfsc_parse_qdisc_details__(netdev, details, &class);
3345 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3346 tc_make_handle(1, 0), &class);
3349 hfsc_get__(netdev)->max_rate = class.max_rate;
/* Reports the cached settings of 'queue' into 'details'.  "min-rate" is the
 * stored value multiplied by 8; "max-rate" is reported the same way, but only
 * when it differs from min-rate.  'netdev' is unused. */
3356 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3357 const struct tc_queue *queue, struct shash *details)
3359 const struct hfsc_class *hc;
3361 hc = hfsc_class_cast__(queue);
3362 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3363 if (hc->min_rate != hc->max_rate) {
3364 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
/* Creates or reconfigures queue 'queue_id' on 'netdev' from 'details'.  The
 * details are parsed with hfsc_parse_class_details__(), the kernel class
 * 1:<queue_id + 1> is set up under parent 1:fffe, and the cached queue state
 * is updated with hfsc_update_queue__(). */
3370 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3371 const struct shash *details)
3374 struct hfsc_class class;
3376 error = hfsc_parse_class_details__(netdev, details, &class);
3381 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3382 tc_make_handle(1, 0xfffe), &class);
3387 hfsc_update_queue__(netdev, queue_id, &class);
/* Deletes 'queue' from 'netdev': asks the kernel to delete class
 * 1:<queue_id + 1> and removes the queue from hfsc->tc.queues.
 * NOTE(review): the condition on 'error' and any free of the hfsc_class are
 * on lines not visible in this extract. */
3392 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3396 struct hfsc_class *hc;
3398 hc = hfsc_class_cast__(queue);
3399 hfsc = hfsc_get__(netdev);
3401 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3403 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches statistics for 'queue' into 'stats' by querying kernel class
 * 1:<queue_id + 1> under parent 1:fffe.  Class options are not requested.
 * Returns what hfsc_query_class__() returns. */
3410 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3411 struct netdev_queue_stats *stats)
3413 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3414 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one class message 'nlmsg' from a queue dump.  The handle and
 * statistics are extracted with tc_parse_class(); if the handle is 1:<minor>
 * with 0 < minor <= HFSC_N_QUEUES, 'cb' is invoked with queue ID minor - 1,
 * the statistics and 'aux'.  'netdev' is unused. */
3418 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3419 const struct ofpbuf *nlmsg,
3420 netdev_dump_queue_stats_cb *cb, void *aux)
3422 struct netdev_queue_stats stats;
3423 unsigned int handle, major, minor;
3426 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3431 major = tc_get_major(handle);
3432 minor = tc_get_minor(handle);
3433 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3434 (*cb)(minor - 1, &stats, aux);
/* Operations table for HFSC: Linux qdisc name "hfsc", OVS QoS type name
 * "linux-hfsc", HFSC_N_QUEUES queues, with every operation implemented by
 * the hfsc_* function named after its slot. */
3439 static const struct tc_ops tc_ops_hfsc = {
3440 "hfsc", /* linux_name */
3441 "linux-hfsc", /* ovs_name */
3442 HFSC_N_QUEUES, /* n_queues */
3443 hfsc_tc_install, /* tc_install */
3444 hfsc_tc_load, /* tc_load */
3445 hfsc_tc_destroy, /* tc_destroy */
3446 hfsc_qdisc_get, /* qdisc_get */
3447 hfsc_qdisc_set, /* qdisc_set */
3448 hfsc_class_get, /* class_get */
3449 hfsc_class_set, /* class_set */
3450 hfsc_class_delete, /* class_delete */
3451 hfsc_class_get_stats, /* class_get_stats */
3452 hfsc_class_dump_stats /* class_dump_stats */
3455 /* "linux-default" traffic control class.
3457 * This class represents the default, unnamed Linux qdisc. It corresponds to
3458 * the "" (empty string) QoS type in the OVS database. */
/* Makes 'netdev''s device use a tc object initialized with tc_ops_default.
 * 'tc' has static storage duration, so the object allocated here outlives
 * the call.  NOTE(review): whether it is allocated only once depends on a
 * guard that is not visible in this extract -- confirm. */
3461 default_install__(struct netdev *netdev)
3463 struct netdev_dev_linux *netdev_dev =
3464 netdev_dev_linux_cast(netdev_get_dev(netdev));
3465 static struct tc *tc;
3468 tc = xmalloc(sizeof *tc);
3469 tc_init(tc, &tc_ops_default);
3471 netdev_dev->tc = tc;
/* tc_install hook for the default class: attaches the default tc to 'netdev'
 * with default_install__().  'details' is ignored. */
3475 default_tc_install(struct netdev *netdev,
3476 const struct shash *details OVS_UNUSED)
3478 default_install__(netdev);
/* tc_load hook for the default class: attaches the default tc to 'netdev'
 * with default_install__().  'nlmsg' is ignored. */
3483 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3485 default_install__(netdev);
/* Operations table for the default class.  It has no Linux qdisc name, and
 * every operation from tc_destroy onward is absent (NULL).  NOTE(review): the
 * ovs_name, n_queues, tc_install and tc_load entries are on lines not visible
 * in this extract. */
3489 static const struct tc_ops tc_ops_default = {
3490 NULL, /* linux_name */
3495 NULL, /* tc_destroy */
3496 NULL, /* qdisc_get */
3497 NULL, /* qdisc_set */
3498 NULL, /* class_get */
3499 NULL, /* class_set */
3500 NULL, /* class_delete */
3501 NULL, /* class_get_stats */
3502 NULL /* class_dump_stats */
3505 /* "linux-other" traffic control class. */
/* tc_load hook for the "linux-other" class: makes 'netdev''s device use a tc
 * object initialized with tc_ops_other.  'tc' has static storage duration,
 * so the object allocated here outlives the call.  NOTE(review): whether it
 * is allocated only once depends on a guard that is not visible in this
 * extract.  'nlmsg' is ignored. */
3510 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3512 struct netdev_dev_linux *netdev_dev =
3513 netdev_dev_linux_cast(netdev_get_dev(netdev));
3514 static struct tc *tc;
3517 tc = xmalloc(sizeof *tc);
3518 tc_init(tc, &tc_ops_other);
3520 netdev_dev->tc = tc;
/* Operations table for the "linux-other" class.  It has no Linux qdisc name
 * and cannot be installed (tc_install is NULL); every operation from
 * tc_destroy onward is absent as well.  NOTE(review): the n_queues and
 * tc_load entries are on lines not visible in this extract. */
3524 static const struct tc_ops tc_ops_other = {
3525 NULL, /* linux_name */
3526 "linux-other", /* ovs_name */
3528 NULL, /* tc_install */
3530 NULL, /* tc_destroy */
3531 NULL, /* qdisc_get */
3532 NULL, /* qdisc_set */
3533 NULL, /* class_get */
3534 NULL, /* class_set */
3535 NULL, /* class_delete */
3536 NULL, /* class_get_stats */
3537 NULL /* class_dump_stats */
3540 /* Traffic control. */
3542 /* Number of kernel "tc" ticks per second. */
/* Computed as a * c / b from the first three values read out of
 * /proc/net/psched, and used to convert between tc ticks and bytes. */
3543 static double ticks_per_s;
3545 /* Number of kernel "jiffies" per second. This is used for the purpose of
3546 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3547 * one jiffy's worth of data.
3549 * There are two possibilities here:
3551 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3552 * approximate range of 100 to 1024. That means that we really need to
3553 * make sure that the qdisc can buffer that much data.
3555 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3556 * has finely granular timers and there's no need to fudge additional room
3557 * for buffers. (There's no extra effort needed to implement that: the
3558 * large 'buffer_hz' is used as a divisor, so practically any number will
3559 * come out as 0 in the division. Small integer results in the case of
3560 * really high dividends won't have any real effect anyhow.)
3561 */
/* Divisor applied to a byte rate by tc_buffer_per_jiffy(). */
3562 static unsigned int buffer_hz;
3564 /* Returns tc handle 'major':'minor'. */
/* 'major' is shifted into the upper 16 bits before TC_H_MAKE combines it with
 * 'minor'. */
3566 tc_make_handle(unsigned int major, unsigned int minor)
3568 return TC_H_MAKE(major << 16, minor);
3571 /* Returns the major number from 'handle'. */
/* TC_H_MAJ's result is shifted right by 16 bits, undoing the shift applied by
 * tc_make_handle(). */
3573 tc_get_major(unsigned int handle)
3575 return TC_H_MAJ(handle) >> 16;
3578 /* Returns the minor number from 'handle'. */
/* The minor number needs no shift; TC_H_MIN's result is returned as is. */
3580 tc_get_minor(unsigned int handle)
3582 return TC_H_MIN(handle);
/* Starts a traffic-control Netlink request for 'netdev' in 'request'.
 *
 * Looks up the device's ifindex, initializes 'request' as a 512-byte ofpbuf,
 * and appends a Netlink header of the given 'type' with NLM_F_REQUEST |
 * 'flags' followed by a zeroed struct tcmsg whose family is AF_UNSPEC and
 * whose ifindex is the device's.  Returns a pointer to that tcmsg so the
 * caller can fill in the handle and parent.  NOTE(review): the handling of a
 * failed ifindex lookup is on lines not visible in this extract. */
3585 static struct tcmsg *
3586 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3587 struct ofpbuf *request)
3589 struct tcmsg *tcmsg;
3593 error = get_ifindex(netdev, &ifindex);
3598 ofpbuf_init(request, 512);
3599 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3600 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3601 tcmsg->tcm_family = AF_UNSPEC;
3602 tcmsg->tcm_ifindex = ifindex;
3603 /* Caller should fill in tcmsg->tcm_handle. */
3604 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on rtnl_sock with nl_sock_transact(), passing 'replyp'
 * through for the reply.  'request' is uninitialized afterwards on the
 * visible path, whatever the transaction's result, so callers must not reuse
 * it. */
3610 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3612 int error = nl_sock_transact(rtnl_sock, request, replyp);
3613 ofpbuf_uninit(request);
3617 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3618 * policing configuration.
3620 * This function is equivalent to running the following when 'add' is true:
3621 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3623 * This function is equivalent to running the following when 'add' is false:
3624 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3626 * The configuration and stats may be seen with the following command:
3627 * /sbin/tc -s qdisc show dev <devname>
3629 * Returns 0 if successful, otherwise a positive errno value. */
/* Builds an RTM_NEWQDISC request (with NLM_F_EXCL | NLM_F_CREATE) when 'add'
 * is true, or an RTM_DELQDISC request with no extra flags otherwise, for
 * handle ffff:0 under TC_H_INGRESS, of kind "ingress" with empty
 * TCA_OPTIONS.  When deleting, ENOENT and EINVAL from the transaction are
 * singled out by the check below; the comment there indicates that they are
 * tolerated. */
3632 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3634 struct ofpbuf request;
3635 struct tcmsg *tcmsg;
3637 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3638 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3640 tcmsg = tc_make_request(netdev, type, flags, &request);
3644 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3645 tcmsg->tcm_parent = TC_H_INGRESS;
3646 nl_msg_put_string(&request, TCA_KIND, "ingress");
3647 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3649 error = tc_transact(&request, NULL);
3651 /* If we're deleting the qdisc, don't worry about some of the
3652 * error conditions. */
3653 if (!add && (error == ENOENT || error == EINVAL)) {
3662 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
 * of 'kbits_burst'.
3665 * This function is equivalent to running:
3666 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3667 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3670 * The configuration and stats may be seen with the following command:
3671 * /sbin/tc -s filter show dev <devname> parent ffff:
3673 * Returns 0 if successful, otherwise a positive errno value. */
/* Installs a "basic" filter with a police action on 'netdev''s ingress qdisc
 * (parent ffff:0, priority 49, protocol ETH_P_ALL).
 *
 * The policer drops excess traffic (TC_POLICE_SHOT).  Its rate is
 * 'kbits_rate' converted to bytes per second, and its burst is
 * 'kbits_burst' * 1024 bytes expressed in ticks at that rate.  The request is
 * an RTM_NEWTFILTER with NLM_F_EXCL | NLM_F_CREATE carrying the tc_police
 * parameters and the matching rate table. */
3676 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3678 struct tc_police tc_police;
3679 struct ofpbuf request;
3680 struct tcmsg *tcmsg;
3681 size_t basic_offset;
3682 size_t police_offset;
3686 memset(&tc_police, 0, sizeof tc_police);
3687 tc_police.action = TC_POLICE_SHOT;
3688 tc_police.mtu = mtu;
/* Scale to bits/s before dividing by 8: dividing first would round the rate
 * down to a multiple of 8 kbit/s and turn any rate below 8 kbit/s into 0.
 * The 64-bit cast keeps the multiplication from overflowing 'int'. */
3689 tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000) / 8, mtu);
3690 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3691 kbits_burst * 1024);
3693 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3694 NLM_F_EXCL | NLM_F_CREATE, &request);
3698 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3699 tcmsg->tcm_info = tc_make_handle(49,
3700 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3702 nl_msg_put_string(&request, TCA_KIND, "basic");
3703 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3704 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3705 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3706 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3707 nl_msg_end_nested(&request, police_offset);
3708 nl_msg_end_nested(&request, basic_offset);
3710 error = tc_transact(&request, NULL);
3721 /* The values in psched are not individually very meaningful, but they are
3722 * important. The tables below show some values seen in the wild.
3726 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3727 * (Before that, there are hints that it was 1000000000.)
3729 * - "d" can be unrealistically large; see the comment on 'buffer_hz'.
 *
 * Raw values, in hexadecimal:
 *
 * a b c d
3733 * -----------------------------------
3734 * [1] 000c8000 000f4240 000f4240 00000064
3735 * [2] 000003e8 00000400 000f4240 3b9aca00
3736 * [3] 000003e8 00000400 000f4240 3b9aca00
3737 * [4] 000003e8 00000400 000f4240 00000064
3738 * [5] 000003e8 00000040 000f4240 3b9aca00
3739 * [6] 000003e8 00000040 000f4240 000000f9
3741 * a b c d ticks_per_s buffer_hz
3742 * ------- --------- ---------- ------------- ----------- -------------
3743 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3744 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3745 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3746 * [4] 1,000 1,024 1,000,000 100 976,562 100
3747 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3748 * [6] 1,000 64 1,000,000 249 15,625,000 249
3750 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3751 * [2] 2.6.26-1-686-bigmem from Debian lenny
3752 * [3] 2.6.26-2-sparc64 from Debian lenny
3753 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3754 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3755 * [6] 2.6.34 from kernel.org on KVM
 */
/* Reads the four hexadecimal scheduler parameters a, b, c and d from
 * /proc/net/psched.  Failures to open or read the file, and parameters that
 * are rejected as invalid, are logged as warnings.  ticks_per_s is computed
 * as a * c / b in floating point.  NOTE(review): the validity tests and the
 * assignment to buffer_hz are on lines not visible in this extract, as is
 * the function header. */
3757 static const char fn[] = "/proc/net/psched";
3758 unsigned int a, b, c, d;
3764 stream = fopen(fn, "r");
3766 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3770 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3771 VLOG_WARN("%s: read failed", fn);
3775 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3779 VLOG_WARN("%s: invalid scheduler parameters", fn);
3783 ticks_per_s = (double) a * c / b;
3787 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3790 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3793 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3794 * rate of 'rate' bytes per second. */
3796 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
/* Widen before multiplying: 'rate' * 'ticks' easily exceeds the range of
 * unsigned int and would otherwise wrap before the division. */
3801 return ((unsigned long long int) rate * ticks) / ticks_per_s;
3804 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3805 * rate of 'rate' bytes per second. */
/* A 'rate' of 0 yields 0 rather than dividing by zero.  ticks_per_s is
 * truncated to an unsigned long long before it is multiplied by 'size'. */
3807 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3812 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3815 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3816 * a transmission rate of 'rate' bytes per second. */
/* This is an integer division of 'rate' by buffer_hz, so the result is 0
 * whenever buffer_hz exceeds 'rate'.  NOTE(review): buffer_hz must be nonzero
 * here; any initialization that guarantees it is on lines not visible in this
 * extract. */
3818 tc_buffer_per_jiffy(unsigned int rate)
3823 return rate / buffer_hz;
3826 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3827 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3828 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3829 * stores NULL into it if it is absent.
3831 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
 * 'msg'.
3834 * Returns 0 if successful, otherwise a positive errno value. */
/* The attributes are parsed starting after the Netlink header and the struct
 * tcmsg.  TCA_KIND is required by the policy and TCA_OPTIONS is optional; a
 * message that does not satisfy the policy is logged as a warning. */
3836 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3837 struct nlattr **options)
3839 static const struct nl_policy tca_policy[] = {
3840 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3841 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3843 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3845 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3846 tca_policy, ta, ARRAY_SIZE(ta))) {
3847 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3852 *kind = nl_attr_get_string(ta[TCA_KIND]);
3856 *options = ta[TCA_OPTIONS];
3871 /* Given Netlink 'msg' that describes a class, extracts the class handle
3872 * (tcm_handle; callers derive the queue ID from its minor number) into
 * '*handlep', its TCA_OPTIONS attribute
3873 * into '*options', and its queue statistics into '*stats'. Any of the output
3874 * arguments may be null.
3876 * Returns 0 if successful, otherwise a positive errno value. */
/* Both TCA_OPTIONS and TCA_STATS2 are required by the policy.  The
 * statistics come from the TCA_STATS_BASIC and TCA_STATS_QUEUE attributes
 * nested in TCA_STATS2: tx_bytes and tx_packets from the basic counters,
 * tx_errors from the queue's drop count.  Parse failures are logged as
 * warnings; there is also a path that zeroes '*stats', whose condition is on
 * a line not visible in this extract. */
3878 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3879 struct nlattr **options, struct netdev_queue_stats *stats)
3881 static const struct nl_policy tca_policy[] = {
3882 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3883 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3885 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3887 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3888 tca_policy, ta, ARRAY_SIZE(ta))) {
3889 VLOG_WARN_RL(&rl, "failed to parse class message");
3894 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3895 *handlep = tc->tcm_handle;
3899 *options = ta[TCA_OPTIONS];
3903 const struct gnet_stats_queue *gsq;
3904 struct gnet_stats_basic gsb;
3906 static const struct nl_policy stats_policy[] = {
3907 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3908 .min_len = sizeof gsb },
3909 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3910 .min_len = sizeof *gsq },
3912 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3914 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3915 sa, ARRAY_SIZE(sa))) {
3916 VLOG_WARN_RL(&rl, "failed to parse class stats");
3920 /* Alignment issues screw up the length of struct gnet_stats_basic on
3921 * some arch/bitsize combinations. Newer versions of Linux have a
3922 * struct gnet_stats_basic_packed, but we can't depend on that. The
3923 * easiest thing to do is just to make a copy. */
3924 memset(&gsb, 0, sizeof gsb);
3925 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3926 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3927 stats->tx_bytes = gsb.bytes;
3928 stats->tx_packets = gsb.packets;
3930 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3931 stats->tx_errors = gsq->drops;
3941 memset(stats, 0, sizeof *stats);
/* Builds an RTM_GETTCLASS request (flag NLM_F_ECHO) for 'netdev', fills in
 * 'handle' and 'parent' as the tcmsg handle and parent, and sends it with
 * tc_transact(), passing 'replyp' through to receive the reply.  The warning
 * at the end names the device and prints both handles as major:minor. */
3946 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3949 tc_query_class(const struct netdev *netdev,
3950 unsigned int handle, unsigned int parent,
3951 struct ofpbuf **replyp)
3953 struct ofpbuf request;
3954 struct tcmsg *tcmsg;
3957 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3961 tcmsg->tcm_handle = handle;
3962 tcmsg->tcm_parent = parent;
3964 error = tc_transact(&request, replyp);
3966 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3967 netdev_get_name(netdev),
3968 tc_get_major(handle), tc_get_minor(handle),
3969 tc_get_major(parent), tc_get_minor(parent),
3975 /* Equivalent to "tc class del dev <name> handle <handle>". */
3977 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3979 struct ofpbuf request;
3980 struct tcmsg *tcmsg;
/* Build an RTM_DELTCLASS request with no extra Netlink flags. */
3983 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
/* The class is identified by its handle alone; the parent is left as 0. */
3987 tcmsg->tcm_handle = handle;
3988 tcmsg->tcm_parent = 0;
/* No reply buffer is requested (NULL).  The warning below names the device
 * and prints the handle as major:minor. */
3990 error = tc_transact(&request, NULL);
3992 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3993 netdev_get_name(netdev),
3994 tc_get_major(handle), tc_get_minor(handle),
4000 /* Equivalent to "tc qdisc del dev <name> root". */
4002 tc_del_qdisc(struct netdev *netdev)
4004 struct netdev_dev_linux *netdev_dev =
4005 netdev_dev_linux_cast(netdev_get_dev(netdev));
4006 struct ofpbuf request;
4007 struct tcmsg *tcmsg;
/* Build an RTM_DELQDISC request addressing handle 1:0 at the root. */
4010 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
4014 tcmsg->tcm_handle = tc_make_handle(1, 0);
4015 tcmsg->tcm_parent = TC_H_ROOT;
/* No reply buffer is requested (NULL). */
4017 error = tc_transact(&request, NULL);
4018 if (error == EINVAL) {
4019 /* EINVAL probably means that the default qdisc was in use, in which
4020 * case we've accomplished our purpose. */
/* On success, drop the cached traffic-control state: call the tc_destroy
 * callback when the ops table provides one, then clear 'netdev_dev->tc'. */
4023 if (!error && netdev_dev->tc) {
4024 if (netdev_dev->tc->ops->tc_destroy) {
4025 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
4027 netdev_dev->tc = NULL;
4032 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4033 * kernel to determine what they are. Returns 0 if successful, otherwise a
4034 * positive errno value. */
4036 tc_query_qdisc(const struct netdev *netdev)
4038 struct netdev_dev_linux *netdev_dev =
4039 netdev_dev_linux_cast(netdev_get_dev(netdev));
4040 struct ofpbuf request, *qdisc;
4041 const struct tc_ops *ops;
4042 struct tcmsg *tcmsg;
/* A non-null 'netdev_dev->tc' means the qdisc is already known.
 * NOTE(review): presumably this branch returns early; its body is not
 * visible here. */
4046 if (netdev_dev->tc) {
4050 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4051 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4052 * 2.6.35 without that fix backported to it.
4054 * To avoid the OOPS, we must not make a request that would attempt to dump
4055 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4056 * few others. There are a few ways that I can see to do this, but most of
4057 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4058 * technique chosen here is to assume that any non-default qdisc that we
4059 * create will have a class with handle 1:0. The built-in qdiscs only have
4060 * a class with handle 0:0.
4062 * We could check for Linux 2.6.35+ and use a more straightforward method
4064 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
4068 tcmsg->tcm_handle = tc_make_handle(1, 0);
4069 tcmsg->tcm_parent = 0;
4071 /* Figure out what tc class to instantiate. */
4072 error = tc_transact(&request, &qdisc);
/* Only the kind string is needed from the reply, so no options pointer is
 * passed (NULL).  A kind that tc_lookup_linux_name() does not recognize is
 * logged and handled through 'tc_ops_other'. */
4076 error = tc_parse_qdisc(qdisc, &kind, NULL);
4078 ops = &tc_ops_other;
4080 ops = tc_lookup_linux_name(kind);
4082 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4083 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4085 ops = &tc_ops_other;
4088 } else if (error == ENOENT) {
4089 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4090 * other entity that doesn't have a handle 1:0. We will assume
4091 * that it's the system default qdisc. */
4092 ops = &tc_ops_default;
4095 /* Who knows? Maybe the device got deleted. */
4096 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4097 netdev_get_name(netdev), strerror(error));
4098 ops = &tc_ops_other;
4101 /* Instantiate it. */
/* The const qualifier on 'netdev' is cast away for the tc_load call
 * (presumably because tc_load takes a non-const netdev).  The assertion
 * checks that tc_load returns 0 exactly when it leaves 'netdev_dev->tc'
 * set. */
4102 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
4103 assert((load_error == 0) == (netdev_dev->tc != NULL));
4104 ofpbuf_delete(qdisc);
/* An error from the query takes precedence over one from tc_load. */
4106 return error ? error : load_error;
4109 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4110 approximate the time to transmit packets of various lengths. For an MTU of
4111 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4112 represents two possible packet lengths; for a MTU of 513 through 1024, four
4113 possible lengths; and so on.
4115 Returns, for the specified 'mtu', the number of bits that packet lengths
4116 need to be shifted right to fit within such a 256-entry table. */
4118 tc_calc_cell_log(unsigned int mtu)
/* NOTE(review): presumably a fallback used when the caller supplied no usable
 * MTU; the guarding condition is not visible here. */
4123 mtu = ETH_PAYLOAD_MAX;
/* Add the Ethernet and VLAN header lengths on top of the payload MTU. */
4125 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
/* 'cell_log' counts loop iterations until 'mtu' drops below 256.
 * NOTE(review): the loop body is not visible; presumably it halves 'mtu' on
 * each pass. */
4127 for (cell_log = 0; mtu >= 256; cell_log++) {
/* Zeroes '*rate', then sets its 'cell_log' from tc_calc_cell_log(mtu) and its
 * 'mpu' to ETH_TOTAL_MIN.
 * NOTE(review): no assignment of 'rate->rate' from 'Bps' is visible in this
 * block; confirm that it is set. */
4134 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4137 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4139 memset(rate, 0, sizeof *rate);
4140 rate->cell_log = tc_calc_cell_log(mtu);
4141 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4142 /* rate->cell_align = 0; */ /* distro headers. */
/* tc_put_rtab() treats 'mpu' as a lower bound on the packet size used for
 * each table entry. */
4143 rate->mpu = ETH_TOTAL_MIN;
4147 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4148 * attribute of the specified "type".
4150 * See tc_calc_cell_log() above for a description of "rtab"s. */
4152 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
/* Reserve TC_RTAB_SIZE bytes of attribute payload; the loop below fills it. */
4157 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
/* Entry 'i' holds the time, in ticks at 'rate->rate', to transmit a packet of
 * (i + 1) << cell_log bytes, with the size raised to 'rate->mpu' when it
 * would otherwise be smaller. */
4158 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4159 unsigned packet_size = (i + 1) << rate->cell_log;
4160 if (packet_size < rate->mpu) {
4161 packet_size = rate->mpu;
4163 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* Converts a burst size into ticks at rate 'Bps'.  The burst used is the
 * larger of 'burst_bytes' and a minimum of tc_buffer_per_jiffy(Bps) plus
 * 'mtu'. */
4167 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
4168 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
4169 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
4172 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
4174 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
4175 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
4178 /* Linux-only functions declared in netdev-linux.h */
4180 /* Returns a fd for an AF_INET socket or a negative errno value. */
4182 netdev_linux_get_af_inet_sock(void)
/* netdev_linux_init() is called first.  A nonzero result from it is negated
 * and returned; otherwise the shared 'af_inet_sock' is returned. */
4184 int error = netdev_linux_init();
4185 return error ? -error : af_inet_sock;
4188 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4189 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4191 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4192 const char *flag_name, bool enable)
4194 const char *netdev_name = netdev_get_name(netdev);
4195 struct ethtool_value evalue;
/* Step 1: read the current flags with ETHTOOL_GFLAGS.  The ethtool_value is
 * passed through a cast to the struct ethtool_cmd pointer that
 * netdev_linux_do_ethtool() takes. */
4199 memset(&evalue, 0, sizeof evalue);
4200 error = netdev_linux_do_ethtool(netdev_name,
4201 (struct ethtool_cmd *)&evalue,
4202 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* Step 2: clear 'flag', set it again only if 'enable', and write the result
 * back with ETHTOOL_SFLAGS.  'new_flags' remembers what was requested. */
4207 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4208 error = netdev_linux_do_ethtool(netdev_name,
4209 (struct ethtool_cmd *)&evalue,
4210 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
/* Step 3: read the flags back to check that the change took effect. */
4215 memset(&evalue, 0, sizeof evalue);
4216 error = netdev_linux_do_ethtool(netdev_name,
4217 (struct ethtool_cmd *)&evalue,
4218 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* A mismatch between the requested and the re-read flags is logged. */
4223 if (new_flags != evalue.data) {
4224 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4225 "device %s failed", enable ? "enable" : "disable",
4226 flag_name, netdev_name);
4233 /* Utility functions. */
4235 /* Copies 'src' into 'dst', performing format conversion in the process. */
4237 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4238 const struct rtnl_link_stats *src)
/* Plain field-by-field copy between identically named members.  Any
 * conversion happens implicitly in the assignments; the field types are
 * declared outside this file section. */
4240 dst->rx_packets = src->rx_packets;
4241 dst->tx_packets = src->tx_packets;
4242 dst->rx_bytes = src->rx_bytes;
4243 dst->tx_bytes = src->tx_bytes;
4244 dst->rx_errors = src->rx_errors;
4245 dst->tx_errors = src->tx_errors;
4246 dst->rx_dropped = src->rx_dropped;
4247 dst->tx_dropped = src->tx_dropped;
4248 dst->multicast = src->multicast;
4249 dst->collisions = src->collisions;
4250 dst->rx_length_errors = src->rx_length_errors;
4251 dst->rx_over_errors = src->rx_over_errors;
4252 dst->rx_crc_errors = src->rx_crc_errors;
4253 dst->rx_frame_errors = src->rx_frame_errors;
4254 dst->rx_fifo_errors = src->rx_fifo_errors;
4255 dst->rx_missed_errors = src->rx_missed_errors;
4256 dst->tx_aborted_errors = src->tx_aborted_errors;
4257 dst->tx_carrier_errors = src->tx_carrier_errors;
4258 dst->tx_fifo_errors = src->tx_fifo_errors;
4259 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4260 dst->tx_window_errors = src->tx_window_errors;
/* Fetches link statistics for the interface with index 'ifindex' through an
 * RTM_GETLINK request on 'rtnl_sock' and converts the reply's IFLA_STATS
 * attribute into '*stats'. */
4264 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4266 /* Policy for RTNLGRP_LINK messages.
4268 * There are *many* more fields in these messages, but currently we only
4269 * care about these fields. */
/* NOTE(review): IFLA_IFNAME is marked as required by this policy although
 * the visible code never reads it. */
4270 static const struct nl_policy rtnlgrp_link_policy[] = {
4271 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4272 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4273 .min_len = sizeof(struct rtnl_link_stats) },
4276 struct ofpbuf request;
4277 struct ofpbuf *reply;
4278 struct ifinfomsg *ifi;
4279 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
/* Build the request: a Netlink header followed by a zeroed ifinfomsg that
 * selects the interface by index.  The request buffer is released right
 * after the transaction. */
4282 ofpbuf_init(&request, 0);
4283 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4284 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4285 ifi->ifi_family = PF_UNSPEC;
4286 ifi->ifi_index = ifindex;
4287 error = nl_sock_transact(rtnl_sock, &request, &reply);
4288 ofpbuf_uninit(&request);
/* Attributes in the reply start after the Netlink header and the ifinfomsg.
 * The reply is deleted on each of the visible exit paths below. */
4293 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4294 rtnlgrp_link_policy,
4295 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4296 ofpbuf_delete(reply);
/* IFLA_STATS is optional in the policy, so its absence is checked here. */
4300 if (!attrs[IFLA_STATS]) {
4301 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4302 ofpbuf_delete(reply);
4306 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4308 ofpbuf_delete(reply);
/* Reads the file named by 'fn' line by line, scanning counters from each
 * line into '*stats', and looks for the line whose device name equals
 * 'netdev_name'.  Open failures, lines that do not yield 15 converted fields,
 * and a missing device are each reported with a rate-limited warning. */
4314 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4316 static const char fn[] = "/proc/net/dev";
4321 stream = fopen(fn, "r");
4323 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4328 while (fgets(line, sizeof line, stream)) {
4331 #define X64 "%"SCNu64
4334 X64 X64 X64 X64 X64 X64 X64 "%*u"
4335 X64 X64 X64 X64 X64 X64 X64 "%*u",
4341 &stats->rx_fifo_errors,
4342 &stats->rx_frame_errors,
4348 &stats->tx_fifo_errors,
4350 &stats->tx_carrier_errors) != 15) {
4351 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4352 } else if (!strcmp(devname, netdev_name)) {
/* These fields are set to UINT64_MAX for the matching device rather than
 * being scanned from the line (presumably marking them as unavailable from
 * this source). */
4353 stats->rx_length_errors = UINT64_MAX;
4354 stats->rx_over_errors = UINT64_MAX;
4355 stats->rx_crc_errors = UINT64_MAX;
4356 stats->rx_missed_errors = UINT64_MAX;
4357 stats->tx_aborted_errors = UINT64_MAX;
4358 stats->tx_heartbeat_errors = UINT64_MAX;
4359 stats->tx_window_errors = UINT64_MAX;
/* NOTE(review): presumably reached only when no line matched 'netdev_name';
 * the surrounding control flow is not visible here. */
4365 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Queries the interface flags of 'dev' with the SIOCGIFFLAGS ioctl, issued by
 * device name through netdev_linux_do_ioctl(), and stores the resulting
 * 'ifr_flags' into '*flags'.
 * NOTE(review): presumably '*flags' is written only when the ioctl succeeded;
 * the condition is not visible here. */
4371 get_flags(const struct netdev_dev *dev, unsigned int *flags)
4377 error = netdev_linux_do_ioctl(dev->name, &ifr, SIOCGIFFLAGS,
4380 *flags = ifr.ifr_flags;
/* Sets the interface flags of 'netdev' to 'flags' with the SIOCSIFFLAGS
 * ioctl and returns the result of netdev_linux_do_ioctl(). */
4386 set_flags(struct netdev *netdev, unsigned int flags)
4390 ifr.ifr_flags = flags;
4391 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
/* Looks up the interface index of the device named 'netdev_name' with the
 * SIOCGIFINDEX ioctl on the shared 'af_inet_sock' and returns it.  A failing
 * ioctl is logged with a rate-limited warning.
 * NOTE(review): the value returned on failure is not visible here;
 * get_ifindex() negates a failed result to obtain an errno, which suggests a
 * negative errno value. */
4396 do_get_ifindex(const char *netdev_name)
4400 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4401 COVERAGE_INC(netdev_get_ifindex);
4402 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4403 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4404 netdev_name, strerror(errno));
4407 return ifr.ifr_ifindex;
/* Stores the interface index of 'netdev_' into '*ifindexp' and returns the
 * error recorded for that lookup.  The lookup result is cached in the
 * netdev_dev, guarded by the VALID_IFINDEX bit of 'cache_valid'. */
4411 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4413 struct netdev_dev_linux *netdev_dev =
4414 netdev_dev_linux_cast(netdev_get_dev(netdev_));
/* Query the kernel only when nothing is cached yet. */
4416 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4417 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
/* Two outcomes are recorded: an error with index 0, or no error with the
 * index that was found.
 * NOTE(review): the condition choosing between them is not visible here; the
 * negation implies that a failed lookup is reported as a negative value. */
4420 netdev_dev->get_ifindex_error = -ifindex;
4421 netdev_dev->ifindex = 0;
4423 netdev_dev->get_ifindex_error = 0;
4424 netdev_dev->ifindex = ifindex;
/* Both outcomes are cached, so a failed lookup is not retried while the
 * VALID_IFINDEX bit stays set. */
4426 netdev_dev->cache_valid |= VALID_IFINDEX;
4429 *ifindexp = netdev_dev->ifindex;
4430 return netdev_dev->get_ifindex_error;
/* Reads the hardware address of the device named 'netdev_name' with the
 * SIOCGIFHWADDR ioctl on the shared 'af_inet_sock' and copies its first
 * ETH_ADDR_LEN bytes into 'ea'. */
4434 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4439 memset(&ifr, 0, sizeof ifr);
4440 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4441 COVERAGE_INC(netdev_get_hwaddr);
4442 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4443 /* ENODEV probably means that a vif disappeared asynchronously and
4444 * hasn't been removed from the database yet, so reduce the log level
4445 * to INFO for that case. */
4446 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4447 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4448 netdev_name, strerror(errno));
/* Any address family other than AF_UNSPEC or ARPHRD_ETHER is reported.
 * NOTE(review): as far as the visible lines show, this only produces a
 * warning and the address is still copied out below. */
4451 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4452 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4453 VLOG_WARN("%s device has unknown hardware address family %d",
4454 netdev_name, hwaddr_family);
4456 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware address of the device named 'netdev_name' to 'mac' with
 * the SIOCSIFHWADDR ioctl on the shared 'af_inet_sock'.  A failing ioctl is
 * logged as an error. */
4461 set_etheraddr(const char *netdev_name,
4462 const uint8_t mac[ETH_ADDR_LEN])
/* The request starts fully zeroed, then carries the device name, the
 * ARPHRD_ETHER address family and the ETH_ADDR_LEN address bytes. */
4466 memset(&ifr, 0, sizeof ifr);
4467 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4468 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4469 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4470 COVERAGE_INC(netdev_set_hwaddr);
4471 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4472 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4473 netdev_name, strerror(errno));
/* Issues a SIOCETHTOOL ioctl for the device named 'name' on the shared
 * 'af_inet_sock', handing 'ecmd' to the kernel through 'ifr_data'.
 * 'cmd_name' is used only in the log message.  Failures other than
 * EOPNOTSUPP are logged with a rate-limited warning. */
4480 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4481 int cmd, const char *cmd_name)
4485 memset(&ifr, 0, sizeof ifr);
4486 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4487 ifr.ifr_data = (caddr_t) ecmd;
4490 COVERAGE_INC(netdev_ethtool);
4491 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4494 if (errno != EOPNOTSUPP) {
4495 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4496 "failed: %s", cmd_name, name, strerror(errno));
4498 /* The device doesn't support this operation. That's pretty
4499 * common, so there's no point in logging anything. */
/* Copies 'name' into the name field of the caller's 'ifr' and issues ioctl
 * 'cmd' on the shared 'af_inet_sock'.  'cmd_name' is used only in the
 * rate-limited debug message logged when the ioctl returns -1. */
4506 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4507 const char *cmd_name)
4509 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4510 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4511 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
/* Runs ioctl 'cmd' for 'netdev' through netdev_linux_do_ioctl() with the
 * request's address family set to AF_INET, then reads the returned address as
 * a struct sockaddr_in and stores its 'sin_addr' into '*ip'.
 * NOTE(review): presumably '*ip' is written only when the ioctl succeeded;
 * the condition is not visible here. */
4519 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4520 int cmd, const char *cmd_name)
4525 ifr.ifr_addr.sa_family = AF_INET;
4526 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4528 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4529 *ip = sin->sin_addr;
4534 /* Returns an AF_PACKET raw socket or a negative errno value. */
4536 af_packet_sock(void)
4538 static int sock = INT_MIN;
4540 if (sock == INT_MIN) {
4541 sock = socket(AF_PACKET, SOCK_RAW, 0);
4543 set_nonblocking(sock);
4546 VLOG_ERR("failed to create packet socket: %s", strerror(errno));