2 * Copyright (c) 2009, 2010, 2011, 2012 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
24 #include <arpa/inet.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
30 #include <linux/types.h>
31 #include <linux/ethtool.h>
32 #include <linux/mii.h>
33 #include <linux/pkt_cls.h>
34 #include <linux/pkt_sched.h>
35 #include <linux/rtnetlink.h>
36 #include <linux/sockios.h>
37 #include <linux/version.h>
38 #include <sys/types.h>
39 #include <sys/ioctl.h>
40 #include <sys/socket.h>
41 #include <netpacket/packet.h>
43 #include <net/if_arp.h>
44 #include <net/if_packet.h>
45 #include <net/route.h>
46 #include <netinet/in.h>
53 #include "dpif-linux.h"
54 #include "dynamic-string.h"
55 #include "fatal-signal.h"
58 #include "netdev-provider.h"
59 #include "netdev-vport.h"
61 #include "netlink-notifier.h"
62 #include "netlink-socket.h"
64 #include "openflow/openflow.h"
66 #include "poll-loop.h"
67 #include "rtnetlink-link.h"
68 #include "socket-util.h"
74 VLOG_DEFINE_THIS_MODULE(netdev_linux);
76 COVERAGE_DEFINE(netdev_set_policing);
77 COVERAGE_DEFINE(netdev_arp_lookup);
78 COVERAGE_DEFINE(netdev_get_ifindex);
79 COVERAGE_DEFINE(netdev_get_hwaddr);
80 COVERAGE_DEFINE(netdev_set_hwaddr);
81 COVERAGE_DEFINE(netdev_ethtool);
84 /* These were introduced in Linux 2.6.14, so they might be missing if we have
86 #ifndef ADVERTISED_Pause
87 #define ADVERTISED_Pause (1 << 13)
89 #ifndef ADVERTISED_Asym_Pause
90 #define ADVERTISED_Asym_Pause (1 << 14)
93 /* These were introduced in Linux 2.6.24, so they might be missing if we
94 * have old headers. */
95 #ifndef ETHTOOL_GFLAGS
96 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
98 #ifndef ETHTOOL_SFLAGS
99 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
102 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
105 #define TC_RTAB_SIZE 1024
108 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
109 static int cache_notifier_refcount;
112 VALID_IFINDEX = 1 << 0,
113 VALID_ETHERADDR = 1 << 1,
117 VALID_POLICING = 1 << 5,
118 VALID_VPORT_STAT_ERROR = 1 << 6,
119 VALID_DRVINFO = 1 << 7,
127 /* Traffic control. */
129 /* An instance of a traffic control class. Always associated with a particular
132 * Each TC implementation subclasses this with whatever additional data it
135 const struct tc_ops *ops;
136 struct hmap queues; /* Contains "struct tc_queue"s.
137 * Read by generic TC layer.
138 * Written only by TC implementation. */
141 /* One traffic control queue.
143 * Each TC implementation subclasses this with whatever additional data it
146 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
147 unsigned int queue_id; /* OpenFlow queue ID. */
150 /* A particular kind of traffic control. Each implementation generally maps to
151 * one particular Linux qdisc class.
153 * The functions below return 0 if successful or a positive errno value on
154 * failure, except where otherwise noted. All of them must be provided, except
155 * where otherwise noted. */
157 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
158 * This is null for tc_ops_default and tc_ops_other, for which there are no
159 * appropriate values. */
160 const char *linux_name;
162 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
163 const char *ovs_name;
165 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
166 * queues. The queues are numbered 0 through n_queues - 1. */
167 unsigned int n_queues;
169 /* Called to install this TC class on 'netdev'. The implementation should
170 * make the Netlink calls required to set up 'netdev' with the right qdisc
171 * and configure it according to 'details'. The implementation may assume
172 * that the current qdisc is the default; that is, there is no need for it
173 * to delete the current qdisc before installing itself.
175 * The contents of 'details' should be documented as valid for 'ovs_name'
176 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
177 * (which is built as ovs-vswitchd.conf.db(8)).
179 * This function must return 0 if and only if it sets 'netdev->tc' to an
180 * initialized 'struct tc'.
182 * (This function is null for tc_ops_other, which cannot be installed. For
183 * other TC classes it should always be nonnull.) */
184 int (*tc_install)(struct netdev *netdev, const struct shash *details);
186 /* Called when the netdev code determines (through a Netlink query) that
187 * this TC class's qdisc is installed on 'netdev', but we didn't install
188 * it ourselves and so don't know any of the details.
190 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
191 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
192 * implementation should parse the other attributes of 'nlmsg' as
193 * necessary to determine its configuration. If necessary it should also
194 * use Netlink queries to determine the configuration of queues on
197 * This function must return 0 if and only if it sets 'netdev->tc' to an
198 * initialized 'struct tc'. */
199 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
201 /* Destroys the data structures allocated by the implementation as part of
202 * 'tc'. (This includes destroying 'tc->queues' by calling
205 * The implementation should not need to perform any Netlink calls. If
206 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
207 * (But it may not be desirable.)
209 * This function may be null if 'tc' is trivial. */
210 void (*tc_destroy)(struct tc *tc);
212 /* Retrieves details of 'netdev->tc' configuration into 'details'.
214 * The implementation should not need to perform any Netlink calls, because
215 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
216 * cached the configuration.
218 * The contents of 'details' should be documented as valid for 'ovs_name'
219 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
220 * (which is built as ovs-vswitchd.conf.db(8)).
222 * This function may be null if 'tc' is not configurable.
224 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
226 /* Reconfigures 'netdev->tc' according to 'details', performing any
227 * required Netlink calls to complete the reconfiguration.
229 * The contents of 'details' should be documented as valid for 'ovs_name'
230 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
231 * (which is built as ovs-vswitchd.conf.db(8)).
233 * This function may be null if 'tc' is not configurable.
235 int (*qdisc_set)(struct netdev *, const struct shash *details);
237 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
238 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
240 * The contents of 'details' should be documented as valid for 'ovs_name'
241 * in the "other_config" column in the "Queue" table in
242 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
244 * The implementation should not need to perform any Netlink calls, because
245 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
246 * cached the queue configuration.
248 * This function may be null if 'tc' does not have queues ('n_queues' is
250 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
251 struct shash *details);
253 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
254 * 'details', performing any required Netlink calls to complete the
255 * reconfiguration. The caller ensures that 'queue_id' is less than
258 * The contents of 'details' should be documented as valid for 'ovs_name'
259 * in the "other_config" column in the "Queue" table in
260 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
262 * This function may be null if 'tc' does not have queues or its queues are
263 * not configurable. */
264 int (*class_set)(struct netdev *, unsigned int queue_id,
265 const struct shash *details);
267 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
268 * tc_queue's within 'netdev->tc->queues'.
270 * This function may be null if 'tc' does not have queues or its queues
271 * cannot be deleted. */
272 int (*class_delete)(struct netdev *, struct tc_queue *queue);
274 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
275 * 'struct tc_queue's within 'netdev->tc->queues'.
277 * On success, initializes '*stats'.
279 * This function may be null if 'tc' does not have queues or if it cannot
280 * report queue statistics. */
281 int (*class_get_stats)(const struct netdev *netdev,
282 const struct tc_queue *queue,
283 struct netdev_queue_stats *stats);
285 /* Extracts queue stats from 'nlmsg', which is a response to a
286 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
288 * This function may be null if 'tc' does not have queues or if it cannot
289 * report queue statistics. */
290 int (*class_dump_stats)(const struct netdev *netdev,
291 const struct ofpbuf *nlmsg,
292 netdev_dump_queue_stats_cb *cb, void *aux);
/* Prepares 'tc' for use by initializing its 'queues' hmap.
 * NOTE(review): 'ops' is presumably recorded in 'tc' as well; the statement
 * is not visible in this view -- confirm. */
296 tc_init(struct tc *tc, const struct tc_ops *ops)
299 hmap_init(&tc->queues);
/* Releases the 'queues' hmap embedded in 'tc' with hmap_destroy(). */
303 tc_destroy(struct tc *tc)
305 hmap_destroy(&tc->queues);
308 static const struct tc_ops tc_ops_htb;
309 static const struct tc_ops tc_ops_hfsc;
310 static const struct tc_ops tc_ops_default;
311 static const struct tc_ops tc_ops_other;
313 static const struct tc_ops *tcs[] = {
314 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
315 &tc_ops_hfsc, /* Hierarchical fair service curve. */
316 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
317 &tc_ops_other, /* Some other qdisc. */
321 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
322 static unsigned int tc_get_major(unsigned int handle);
323 static unsigned int tc_get_minor(unsigned int handle);
325 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
326 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
327 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
329 static struct tcmsg *tc_make_request(const struct netdev *, int type,
330 unsigned int flags, struct ofpbuf *);
331 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
332 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
333 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
336 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
337 struct nlattr **options);
338 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
339 struct nlattr **options,
340 struct netdev_queue_stats *);
341 static int tc_query_class(const struct netdev *,
342 unsigned int handle, unsigned int parent,
343 struct ofpbuf **replyp);
344 static int tc_delete_class(const struct netdev *, unsigned int handle);
346 static int tc_del_qdisc(struct netdev *netdev);
347 static int tc_query_qdisc(const struct netdev *netdev);
349 static int tc_calc_cell_log(unsigned int mtu);
350 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
351 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
352 const struct tc_ratespec *rate);
353 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
/* Linux-specific state for one network device.  Embeds the generic
 * 'struct netdev_dev' so that netdev_dev_linux_cast() can recover this
 * structure with CONTAINER_OF(). */
355 struct netdev_dev_linux {
356 struct netdev_dev netdev_dev;
358 struct shash_node *shash_node;
/* Bitmask of VALID_* flags telling which cached fields below are usable. */
359 unsigned int cache_valid;
360 unsigned int change_seq;
362 bool miimon; /* Link status of last poll. */
363 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
364 struct timer miimon_timer;
366 /* The following are figured out "on demand" only. They are only valid
367 * when the corresponding VALID_* bit in 'cache_valid' is set. */
369 uint8_t etheraddr[ETH_ADDR_LEN];
370 struct in_addr address, netmask;
/* Interface flags most recently stored by netdev_dev_linux_changed(). */
373 unsigned int ifi_flags;
/* Incremented by netdev_dev_linux_changed() whenever IFF_RUNNING flips. */
374 long long int carrier_resets;
375 uint32_t kbits_rate; /* Policing data. */
376 uint32_t kbits_burst;
377 int vport_stats_error; /* Cached error code from vport_get_stats().
378 0 or an errno value. */
379 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
380 int ether_addr_error; /* Cached error code from set/get etheraddr. */
382 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
/* Tap-only state: the shared tap descriptor 'fd' and the 'opened' flag. */
386 struct tap_state tap;
/* Linux-specific state for one open handle on a device.  Embeds the generic
 * 'struct netdev' so that netdev_linux_cast() can recover this structure
 * with CONTAINER_OF(). */
390 struct netdev_linux {
391 struct netdev netdev;
395 /* Sockets used for ioctl operations. */
396 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
398 /* A Netlink routing socket that is not subscribed to any multicast groups. */
399 static struct nl_sock *rtnl_sock;
401 /* This is set pretty low because we probably won't learn anything from the
402 * additional log messages. */
403 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
405 static int netdev_linux_init(void);
407 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
408 int cmd, const char *cmd_name);
409 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
410 const char *cmd_name);
411 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
412 int cmd, const char *cmd_name);
413 static int get_flags(const struct netdev_dev *, unsigned int *flags);
414 static int set_flags(struct netdev *, unsigned int flags);
415 static int do_get_ifindex(const char *netdev_name);
416 static int get_ifindex(const struct netdev *, int *ifindexp);
417 static int do_set_addr(struct netdev *netdev,
418 int ioctl_nr, const char *ioctl_name,
419 struct in_addr addr);
420 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
421 static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
422 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
423 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
424 static int af_packet_sock(void);
425 static void netdev_linux_miimon_run(void);
426 static void netdev_linux_miimon_wait(void);
/* Identifies a Linux netdev class by checking that its 'init' callback is
 * netdev_linux_init(). */
429 is_netdev_linux_class(const struct netdev_class *netdev_class)
431 return netdev_class->init == netdev_linux_init;
/* Converts the generic 'netdev_dev' into the struct netdev_dev_linux that
 * contains it.  Asserts first that the device class is a Linux netdev
 * class, since the conversion is only meaningful for those. */
434 static struct netdev_dev_linux *
435 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
437 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
438 assert(is_netdev_linux_class(netdev_class));
440 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
/* Converts the generic 'netdev' handle into the struct netdev_linux that
 * contains it.  Asserts first that the class of the underlying device is a
 * Linux netdev class. */
443 static struct netdev_linux *
444 netdev_linux_cast(const struct netdev *netdev)
446 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
447 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
448 assert(is_netdev_linux_class(netdev_class));
450 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
/* Sets up the file-scope sockets of this module: the AF_INET datagram socket
 * 'af_inet_sock' and the rtnetlink socket 'rtnl_sock'.  The outcome of each
 * step is kept in the function-local static 'status' (initially -1), as 0 or
 * an errno value, and failures are logged. */
454 netdev_linux_init(void)
456 static int status = -1;
458 /* Create AF_INET socket. */
459 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
460 status = af_inet_sock >= 0 ? 0 : errno;
462 VLOG_ERR("failed to create inet socket: %s", strerror(status));
465 /* Create rtnetlink socket. */
467 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
469 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
/* Periodic work for this module: calls rtnetlink_link_run() and then
 * netdev_linux_miimon_run(). */
478 netdev_linux_run(void)
480 rtnetlink_link_run();
481 netdev_linux_miimon_run();
/* Wait-side counterpart of netdev_linux_run(): calls rtnetlink_link_wait()
 * and then netdev_linux_miimon_wait(). */
485 netdev_linux_wait(void)
487 rtnetlink_link_wait();
488 netdev_linux_miimon_wait();
/* Fills 'netdev_dev->drvinfo' through an ethtool request unless the
 * VALID_DRVINFO bit shows that a cached copy already exists.  The buffer is
 * zeroed before the request.
 * NOTE(review): VALID_DRVINFO is presumably set only when the request
 * succeeds; the guarding condition is not visible here -- confirm. */
492 netdev_linux_get_drvinfo(struct netdev_dev_linux *netdev_dev)
497 if (netdev_dev->cache_valid & VALID_DRVINFO) {
501 memset(&netdev_dev->drvinfo, 0, sizeof netdev_dev->drvinfo);
502 error = netdev_linux_do_ethtool(netdev_dev->netdev_dev.name,
503 (struct ethtool_cmd *)&netdev_dev->drvinfo,
507 netdev_dev->cache_valid |= VALID_DRVINFO;
/* Records a change to 'dev'.  A difference in IFF_RUNNING between the stored
 * flags and the new 'ifi_flags' is counted in 'carrier_resets'.  The new
 * flags are then stored, and 'cache_valid' is reduced to the bits present in
 * 'mask', which invalidates every other cached field. */
513 netdev_dev_linux_changed(struct netdev_dev_linux *dev,
514 unsigned int ifi_flags,
518 if (!dev->change_seq) {
522 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
523 dev->carrier_resets++;
525 dev->ifi_flags = ifi_flags;
527 dev->cache_valid &= mask;
/* Applies the rtnetlink link 'change' to 'dev'.  For RTM_NEWLINK, cached
 * driver info survives (VALID_DRVINFO is passed as the mask), and the MTU
 * and a nonzero Ethernet address taken from the message refresh the cache
 * directly, clearing their cached error codes.
 * NOTE(review): the last call, with a mask of 0, invalidates every cached
 * field; it is presumably the branch for other message types -- confirm. */
531 netdev_dev_linux_update(struct netdev_dev_linux *dev,
532 const struct rtnetlink_link_change *change)
534 if (change->nlmsg_type == RTM_NEWLINK) {
536 netdev_dev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
539 dev->mtu = change->mtu;
540 dev->cache_valid |= VALID_MTU;
541 dev->netdev_mtu_error = 0;
544 if (!eth_addr_is_zero(change->addr)) {
545 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
546 dev->cache_valid |= VALID_ETHERADDR;
547 dev->ether_addr_error = 0;
551 netdev_dev_linux_changed(dev, change->ifi_flags, 0);
/* Callback registered with rtnetlink_link_notifier_create().  Looks up the
 * device named by 'change->ifname' and, when it belongs to a Linux netdev
 * class, updates its cache from the message.
 *
 * The second half walks every device of netdev_linux_class, re-reads its
 * flags and invalidates its whole cache (mask 0).
 * NOTE(review): that half is presumably the path taken when 'change' is
 * null; the branch itself is not visible here -- confirm. */
556 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
557 void *aux OVS_UNUSED)
559 struct netdev_dev_linux *dev;
561 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
563 const struct netdev_class *netdev_class =
564 netdev_dev_get_class(base_dev);
566 if (is_netdev_linux_class(netdev_class)) {
567 dev = netdev_dev_linux_cast(base_dev);
568 netdev_dev_linux_update(dev, change);
572 struct shash device_shash;
573 struct shash_node *node;
575 shash_init(&device_shash);
576 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
577 SHASH_FOR_EACH (node, &device_shash) {
582 get_flags(&dev->netdev_dev, &flags);
583 netdev_dev_linux_changed(dev, flags, 0);
585 shash_destroy(&device_shash);
/* Takes a reference on the shared rtnetlink link notifier.  When the
 * reference count is zero the notifier is created first, with
 * netdev_linux_cache_cb() as its callback; a null result from the creation
 * call is checked before the count is incremented. */
590 cache_notifier_ref(void)
592 if (!cache_notifier_refcount) {
593 assert(!netdev_linux_cache_notifier);
595 netdev_linux_cache_notifier =
596 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
598 if (!netdev_linux_cache_notifier) {
602 cache_notifier_refcount++;
/* Drops a reference on the shared rtnetlink link notifier.  When the count
 * reaches zero the notifier is destroyed and the pointer is reset to NULL so
 * that cache_notifier_ref() can create it again. */
608 cache_notifier_unref(void)
610 assert(cache_notifier_refcount > 0);
611 if (!--cache_notifier_refcount) {
612 assert(netdev_linux_cache_notifier);
613 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
614 netdev_linux_cache_notifier = NULL;
618 /* Creates system and internal devices. */
/* Takes a notifier reference, allocates a zeroed netdev_dev_linux whose
 * 'change_seq' starts at 1, initializes the generic part with 'name' and
 * 'class', reads the initial interface flags into 'ifi_flags' with
 * get_flags() (its result is not checked), and hands the new device back
 * through '*netdev_devp'. */
620 netdev_linux_create(const struct netdev_class *class, const char *name,
621 struct netdev_dev **netdev_devp)
623 struct netdev_dev_linux *netdev_dev;
626 error = cache_notifier_ref();
631 netdev_dev = xzalloc(sizeof *netdev_dev);
632 netdev_dev->change_seq = 1;
633 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
634 get_flags(&netdev_dev->netdev_dev, &netdev_dev->ifi_flags);
636 *netdev_devp = &netdev_dev->netdev_dev;
640 /* For most types of netdevs we open the device for each call of
641 * netdev_open(). However, this is not the case with tap devices,
642 * since it is only possible to open the device once. In this
643 * situation we share a single file descriptor, and consequently
644 * buffers, across all readers. Therefore once data is read it will
645 * be unavailable to other reads for tap devices. */
/* Creates tap device 'name': opens the 'tap_dev' node, requests an IFF_TAP |
 * IFF_NO_PI interface with TUNSETIFF, switches the descriptor to
 * non-blocking mode, and initializes the device under netdev_tap_class.
 * The 'class' argument is unused.  Failures after the notifier reference
 * has been taken jump to 'error_unref_notifier', which drops it again.
 *
 * NOTE(review): none of the visible failure paths closes 'state->fd' once
 * it has been opened -- confirm whether the descriptor leaks when TUNSETIFF
 * or set_nonblocking() fails. */
647 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
648 const char *name, struct netdev_dev **netdev_devp)
650 struct netdev_dev_linux *netdev_dev;
651 struct tap_state *state;
652 static const char tap_dev[] = "/dev/net/tun";
656 netdev_dev = xzalloc(sizeof *netdev_dev);
657 state = &netdev_dev->state.tap;
659 error = cache_notifier_ref();
664 /* Open tap device. */
665 state->fd = open(tap_dev, O_RDWR);
668 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
669 goto error_unref_notifier;
672 /* Create tap device. */
673 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
674 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
675 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
676 VLOG_WARN("%s: creating tap device failed: %s", name,
679 goto error_unref_notifier;
682 /* Make non-blocking. */
683 error = set_nonblocking(state->fd);
685 goto error_unref_notifier;
688 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
689 *netdev_devp = &netdev_dev->netdev_dev;
692 error_unref_notifier:
693 cache_notifier_unref();
/* Tears down the tap state of 'netdev_dev'.  The visible part acts only
 * when the tap descriptor is valid (>= 0). */
700 destroy_tap(struct netdev_dev_linux *netdev_dev)
702 struct tap_state *state = &netdev_dev->state.tap;
704 if (state->fd >= 0) {
709 /* Destroys the netdev device 'netdev_dev_'. */
/* Runs the tc_destroy hook of the traffic-control implementation when both a
 * 'tc' and the hook are present, tears down tap state for devices of
 * netdev_tap_class, and drops the notifier reference held by the device. */
711 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
713 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
714 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
716 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
717 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
720 if (class == &netdev_tap_class) {
721 destroy_tap(netdev_dev);
725 cache_notifier_unref();
/* Opens a handle on 'netdev_dev_' and stores it in '*netdevp'.  Except for
 * devices of netdev_internal_class, the existence of the device is checked
 * by reading its flags, with ENODEV singled out.  The first opener of a tap
 * device receives the shared tap descriptor.  The closing netdev_uninit()
 * call is the cleanup for a handle that could not be opened. */
729 netdev_linux_open(struct netdev_dev *netdev_dev_, struct netdev **netdevp)
731 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
732 struct netdev_linux *netdev;
733 enum netdev_flags flags;
736 /* Allocate network device. */
737 netdev = xzalloc(sizeof *netdev);
739 netdev_init(&netdev->netdev, netdev_dev_);
741 /* Verify that the device really exists, by attempting to read its flags.
742 * (The flags might be cached, in which case this won't actually do an
745 * Don't do this for "internal" netdevs, though, because those have to be
746 * created as netdev objects before they exist in the kernel, because
747 * creating them in the kernel happens by passing a netdev object to
748 * dpif_port_add(). */
749 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
750 error = netdev_get_flags(&netdev->netdev, &flags);
751 if (error == ENODEV) {
756 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
757 !netdev_dev->state.tap.opened) {
759 /* We assume that the first user of the tap device is the primary user
760 * and give them the tap FD. Subsequent users probably just expect
761 * this to be a system device so open it normally to avoid send/receive
762 * directions appearing to be reversed. */
763 netdev->fd = netdev_dev->state.tap.fd;
764 netdev_dev->state.tap.opened = true;
767 *netdevp = &netdev->netdev;
771 netdev_uninit(&netdev->netdev, true);
775 /* Closes and destroys 'netdev'. */
/* The condition selects non-tap handles only (strcmp() is nonzero for them);
 * a tap handle carries the descriptor shared through the device, which is
 * not owned by the handle.  Descriptor 0 is a valid descriptor, so the test
 * is 'fd >= 0', in line with the 'fd >= 0' and 'fd < 0' checks used by the
 * listen, receive and send paths of this file. */
777 netdev_linux_close(struct netdev *netdev_)
779 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
781 if (netdev->fd >= 0 && strcmp(netdev_get_type(netdev_), "tap")) {
/* Prepares 'netdev_' for packet reception: creates a PF_PACKET raw socket,
 * makes it non-blocking, and binds it to the ifindex of the device for all
 * protocols (ETH_P_ALL).  Socket and bind failures are logged.  The
 * 'fd >= 0' check at the top covers a handle that already has a
 * descriptor. */
788 netdev_linux_listen(struct netdev *netdev_)
790 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
791 struct sockaddr_ll sll;
796 if (netdev->fd >= 0) {
800 /* Create file descriptor. */
801 fd = socket(PF_PACKET, SOCK_RAW, 0);
804 VLOG_ERR("failed to create raw socket (%s)", strerror(error));
808 /* Set non-blocking mode. */
809 error = set_nonblocking(fd);
814 /* Get ethernet device index. */
815 error = get_ifindex(&netdev->netdev, &ifindex);
820 /* Bind to specific ethernet device. */
821 memset(&sll, 0, sizeof sll);
822 sll.sll_family = AF_PACKET;
823 sll.sll_ifindex = ifindex;
824 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
825 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
827 VLOG_ERR("%s: failed to bind raw socket (%s)",
828 netdev_get_name(netdev_), strerror(error));
/* Receives one packet from 'netdev_' into 'data', which has room for 'size'
 * bytes.  Devices of netdev_tap_class are read with read(); all others use
 * recv() with MSG_TRUNC, so a result larger than 'size' marks a truncated
 * packet and is reported as -EMSGSIZE.  EINTR is excluded from the error
 * branch, and errors other than EAGAIN are logged (rate-limited). */
843 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
845 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
847 if (netdev->fd < 0) {
848 /* Device is not listening. */
855 retval = (netdev_->netdev_dev->netdev_class == &netdev_tap_class
856 ? read(netdev->fd, data, size)
857 : recv(netdev->fd, data, size, MSG_TRUNC));
859 return retval <= size ? retval : -EMSGSIZE;
860 } else if (errno != EINTR) {
861 if (errno != EAGAIN) {
/* Argument order follows the format string: device name first, then the
 * error text. */
862 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
863 netdev_get_name(netdev_), strerror(errno));
870 /* Registers with the poll loop to wake up from the next call to poll_block()
871 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
873 netdev_linux_recv_wait(struct netdev *netdev_)
875 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* A handle without a descriptor (fd < 0) has nothing to wait on. */
876 if (netdev->fd >= 0) {
877 poll_fd_wait(netdev->fd, POLLIN);
881 /* Discards all packets waiting to be received from 'netdev'. */
/* Tap handles query the transmit queue length of the interface with
 * SIOCGIFTXQLEN and pass it to drain_fd() as the amount to drain; other
 * handles with a descriptor go through drain_rcvbuf(). */
883 netdev_linux_drain(struct netdev *netdev_)
885 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
886 if (netdev->fd < 0) {
888 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
890 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
891 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
895 drain_fd(netdev->fd, ifr.ifr_qlen);
898 return drain_rcvbuf(netdev->fd);
902 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
903 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
904 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
905 * the packet is too big or too small to transmit on the device.
907 * The caller retains ownership of 'buffer' in all cases.
909 * The kernel maintains a packet transmission queue, so the caller is not
910 * expected to do additional queuing of packets. */
/* In this implementation the packet called 'buffer' above is the 'size'
 * bytes starting at 'data'.  Two transmit paths follow: sendmsg() on the
 * socket returned by af_packet_sock() when the handle has no descriptor of
 * its own, and write() on 'netdev->fd' otherwise. */
912 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
914 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
918 if (netdev->fd < 0) {
919 /* Use our AF_PACKET socket to send to this device. */
920 struct sockaddr_ll sll;
927 sock = af_packet_sock();
932 error = get_ifindex(netdev_, &ifindex);
937 /* We don't bother setting most fields in sockaddr_ll because the
938 * kernel ignores them for SOCK_RAW. */
939 memset(&sll, 0, sizeof sll);
940 sll.sll_family = AF_PACKET;
941 sll.sll_ifindex = ifindex;
943 iov.iov_base = (void *) data;
947 msg.msg_namelen = sizeof sll;
950 msg.msg_control = NULL;
951 msg.msg_controllen = 0;
954 retval = sendmsg(sock, &msg, 0);
956 /* Use the netdev's own fd to send to this device. This is
957 * essential for tap devices, because packets sent to a tap device
958 * with an AF_PACKET socket will loop back to be *received* again
959 * on the tap device. */
960 retval = write(netdev->fd, data, size);
964 /* The Linux AF_PACKET implementation never blocks waiting for room
965 * for packets, instead returning ENOBUFS. Translate this into
966 * EAGAIN for the caller. */
967 if (errno == ENOBUFS) {
969 } else if (errno == EINTR) {
971 } else if (errno != EAGAIN) {
972 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
973 netdev_get_name(netdev_), strerror(errno));
/* A byte count different from 'size' means a partial send; it is logged
 * with both counts. */
976 } else if (retval != size) {
977 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
978 "%zu) on %s", retval, size, netdev_get_name(netdev_));
986 /* Registers with the poll loop to wake up from the next call to poll_block()
987 * when the packet transmission queue has sufficient room to transmit a packet
988 * with netdev_send().
990 * The kernel maintains a packet transmission queue, so the client is not
991 * expected to do additional queuing of packets. Thus, this function is
992 * unlikely to ever be used. It is included for completeness. */
994 netdev_linux_send_wait(struct netdev *netdev_)
996 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
997 if (netdev->fd < 0) {
/* strcmp() is nonzero for every type other than "tap": those handles poll
 * their descriptor for writability. */
999 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
1000 poll_fd_wait(netdev->fd, POLLOUT);
1002 /* TAP device always accepts packets.*/
1003 poll_immediate_wake();
1007 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1008 * otherwise a positive errno value. */
/* Cache handling: with VALID_ETHERADDR set, a cached error is returned as
 * is and a request for the address already cached is detected with
 * eth_addr_equals(); otherwise the cache bit is cleared before the kernel
 * is asked. */
1010 netdev_linux_set_etheraddr(struct netdev *netdev_,
1011 const uint8_t mac[ETH_ADDR_LEN])
1013 struct netdev_dev_linux *netdev_dev =
1014 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1017 if (netdev_dev->cache_valid & VALID_ETHERADDR) {
1018 if (netdev_dev->ether_addr_error) {
1019 return netdev_dev->ether_addr_error;
1021 if (eth_addr_equals(netdev_dev->etheraddr, mac)) {
1024 netdev_dev->cache_valid &= ~VALID_ETHERADDR;
1027 error = set_etheraddr(netdev_get_name(netdev_), mac);
/* Only success and ENODEV are cached; any other error leaves the cache
 * invalid so that the next call asks the kernel again. */
1028 if (!error || error == ENODEV) {
1029 netdev_dev->ether_addr_error = error;
1030 netdev_dev->cache_valid |= VALID_ETHERADDR;
1032 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
1039 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
/* On a cache miss the address is fetched with get_etheraddr() and the
 * outcome, including any error code, is cached.  'mac' is written only when
 * no error is cached; the cached error code (0 on success) is returned. */
1041 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1042 uint8_t mac[ETH_ADDR_LEN])
1044 struct netdev_dev_linux *netdev_dev =
1045 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1047 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
1048 int error = get_etheraddr(netdev_get_name(netdev_),
1049 netdev_dev->etheraddr);
1051 netdev_dev->ether_addr_error = error;
1052 netdev_dev->cache_valid |= VALID_ETHERADDR;
1055 if (!netdev_dev->ether_addr_error) {
1056 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
1059 return netdev_dev->ether_addr_error;
1062 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1063 * in bytes, not including the hardware header; thus, this is typically 1500
1064 * bytes for Ethernet devices. */
/* The value is delivered through '*mtup'; the function result is the cached
 * error code (0 on success).  On a cache miss the MTU is read with
 * SIOCGIFMTU and the outcome, including any error, is cached.  '*mtup' is
 * written only when no error is cached. */
1066 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1068 struct netdev_dev_linux *netdev_dev =
1069 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1070 if (!(netdev_dev->cache_valid & VALID_MTU)) {
1074 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1075 SIOCGIFMTU, "SIOCGIFMTU");
1077 netdev_dev->netdev_mtu_error = error;
1078 netdev_dev->mtu = ifr.ifr_mtu;
1079 netdev_dev->cache_valid |= VALID_MTU;
1082 if (!netdev_dev->netdev_mtu_error) {
1083 *mtup = netdev_dev->mtu;
1085 return netdev_dev->netdev_mtu_error;
1088 /* Sets the maximum size of transmitted (MTU) for given device using linux
1089 * networking ioctl interface.
/* Sets the MTU of 'netdev_' to 'mtu' with the SIOCSIFMTU ioctl.  With
 * VALID_MTU set, a cached error is returned as is and a request for the MTU
 * already cached is detected before any ioctl; otherwise the cache bit is
 * cleared first.  Only success and ENODEV are cached afterwards. */
1092 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1094 struct netdev_dev_linux *netdev_dev =
1095 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1099 if (netdev_dev->cache_valid & VALID_MTU) {
1100 if (netdev_dev->netdev_mtu_error) {
1101 return netdev_dev->netdev_mtu_error;
1103 if (netdev_dev->mtu == mtu) {
1106 netdev_dev->cache_valid &= ~VALID_MTU;
1109 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1110 SIOCSIFMTU, "SIOCSIFMTU");
1111 if (!error || error == ENODEV) {
1112 netdev_dev->netdev_mtu_error = error;
1113 netdev_dev->mtu = ifr.ifr_mtu;
1114 netdev_dev->cache_valid |= VALID_MTU;
1119 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1120 * On failure, returns a negative errno value. */
1122 netdev_linux_get_ifindex(const struct netdev *netdev)
1126 error = get_ifindex(netdev, &ifindex);
/* The error from get_ifindex() is negated so that failures come back
 * negative, as documented above. */
1127 return error ? -error : ifindex;
/* Reports the link state of 'netdev_' in '*carrier'.  With MII monitoring
 * enabled ('miimon_interval' > 0) the result of the last MII poll is used;
 * the other assignment derives the state from the IFF_RUNNING bit of the
 * cached interface flags. */
1131 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1133 struct netdev_dev_linux *netdev_dev =
1134 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1136 if (netdev_dev->miimon_interval > 0) {
1137 *carrier = netdev_dev->miimon;
1139 *carrier = (netdev_dev->ifi_flags & IFF_RUNNING) != 0;
/* Returns the 'carrier_resets' counter of the device behind 'netdev', which
 * netdev_dev_linux_changed() increments whenever IFF_RUNNING flips. */
1145 static long long int
1146 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1148 return netdev_dev_linux_cast(netdev_get_dev(netdev))->carrier_resets;
/* Issues the MII ioctl 'cmd' (named 'cmd_name' for logging by the callee) on
 * device 'name'.  '*data' is copied into the 'ifr_data' area of a zeroed
 * ifreq before the call and copied back out of it afterwards, so 'data'
 * serves as both input and output. */
1152 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1153 struct mii_ioctl_data *data)
1158 memset(&ifr, 0, sizeof ifr);
1159 memcpy(&ifr.ifr_data, data, sizeof *data);
1160 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1161 memcpy(data, &ifr.ifr_data, sizeof *data);
/* Determines the link status of device 'name' and stores it in '*miimon'.
 * The MII path issues SIOCGMIIPHY and then reads register MII_BMSR with
 * SIOCGMIIREG, testing the BMSR_LSTATUS bit.  The later part falls back to
 * ethtool ETHTOOL_GLINK, as its debug message states, and reinterprets the
 * start of the ethtool_cmd buffer as a struct ethtool_value through
 * memcpy(). */
1167 netdev_linux_get_miimon(const char *name, bool *miimon)
1169 struct mii_ioctl_data data;
1174 memset(&data, 0, sizeof data);
1175 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1177 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1178 data.reg_num = MII_BMSR;
1179 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1183 *miimon = !!(data.val_out & BMSR_LSTATUS);
1185 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1188 struct ethtool_cmd ecmd;
1190 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1193 memset(&ecmd, 0, sizeof ecmd);
1194 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1197 struct ethtool_value eval;
1199 memcpy(&eval, &ecmd, sizeof eval);
1200 *miimon = !!eval.data;
1202 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1210 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1211 long long int interval)
1213 struct netdev_dev_linux *netdev_dev;
1215 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
1217 interval = interval > 0 ? MAX(interval, 100) : 0;
1218 if (netdev_dev->miimon_interval != interval) {
1219 netdev_dev->miimon_interval = interval;
1220 timer_set_expired(&netdev_dev->miimon_timer);
1227 netdev_linux_miimon_run(void)
1229 struct shash device_shash;
1230 struct shash_node *node;
1232 shash_init(&device_shash);
1233 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1234 SHASH_FOR_EACH (node, &device_shash) {
1235 struct netdev_dev_linux *dev = node->data;
1238 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1242 netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
1243 if (miimon != dev->miimon) {
1244 dev->miimon = miimon;
1245 netdev_dev_linux_changed(dev, dev->ifi_flags, 0);
1248 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1251 shash_destroy(&device_shash);
1255 netdev_linux_miimon_wait(void)
1257 struct shash device_shash;
1258 struct shash_node *node;
1260 shash_init(&device_shash);
1261 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1262 SHASH_FOR_EACH (node, &device_shash) {
1263 struct netdev_dev_linux *dev = node->data;
1265 if (dev->miimon_interval > 0) {
1266 timer_wait(&dev->miimon_timer);
1269 shash_destroy(&device_shash);
1272 /* Check whether we can we use RTM_GETLINK to get network device statistics.
1273 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1276 check_for_working_netlink_stats(void)
1278 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1279 * preferable, so if that works, we'll use it. */
1280 int ifindex = do_get_ifindex("lo");
1282 VLOG_WARN("failed to get ifindex for lo, "
1283 "obtaining netdev stats from proc");
1286 struct netdev_stats stats;
1287 int error = get_stats_via_netlink(ifindex, &stats);
1289 VLOG_DBG("obtaining netdev stats via rtnetlink");
1292 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1293 "via proc (you are probably running a pre-2.6.19 "
1294 "kernel)", strerror(error));
/* Exchanges the values stored in '*a' and '*b'.  Passing the same pointer
 * for both is harmless. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t saved_a;

    saved_a = *a;
    *a = *b;
    *b = saved_a;
}
1309 get_stats_via_vport(const struct netdev *netdev_,
1310 struct netdev_stats *stats)
1312 struct netdev_dev_linux *netdev_dev =
1313 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1315 if (!netdev_dev->vport_stats_error ||
1316 !(netdev_dev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1319 error = netdev_vport_get_stats(netdev_, stats);
1321 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1322 "(%s)", netdev_get_name(netdev_), strerror(error));
1324 netdev_dev->vport_stats_error = error;
1325 netdev_dev->cache_valid |= VALID_VPORT_STAT_ERROR;
1330 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1331 struct netdev_stats *stats)
1333 static int use_netlink_stats = -1;
1336 if (use_netlink_stats < 0) {
1337 use_netlink_stats = check_for_working_netlink_stats();
1340 if (use_netlink_stats) {
1343 error = get_ifindex(netdev_, &ifindex);
1345 error = get_stats_via_netlink(ifindex, stats);
1348 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1352 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1353 netdev_get_name(netdev_), error);
1359 /* Retrieves current device stats for 'netdev-linux'. */
1361 netdev_linux_get_stats(const struct netdev *netdev_,
1362 struct netdev_stats *stats)
1364 struct netdev_dev_linux *netdev_dev =
1365 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1366 struct netdev_stats dev_stats;
1369 get_stats_via_vport(netdev_, stats);
1371 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1374 if (netdev_dev->vport_stats_error) {
1381 if (netdev_dev->vport_stats_error) {
1382 /* stats not available from OVS then use ioctl stats. */
1385 stats->rx_errors += dev_stats.rx_errors;
1386 stats->tx_errors += dev_stats.tx_errors;
1387 stats->rx_dropped += dev_stats.rx_dropped;
1388 stats->tx_dropped += dev_stats.tx_dropped;
1389 stats->multicast += dev_stats.multicast;
1390 stats->collisions += dev_stats.collisions;
1391 stats->rx_length_errors += dev_stats.rx_length_errors;
1392 stats->rx_over_errors += dev_stats.rx_over_errors;
1393 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1394 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1395 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1396 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1397 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1398 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1399 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1400 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1401 stats->tx_window_errors += dev_stats.tx_window_errors;
1406 /* Retrieves current device stats for 'netdev-tap' netdev or
1407 * netdev-internal. */
1409 netdev_tap_get_stats(const struct netdev *netdev_,
1410 struct netdev_stats *stats)
1412 struct netdev_dev_linux *netdev_dev =
1413 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1414 struct netdev_stats dev_stats;
1417 get_stats_via_vport(netdev_, stats);
1419 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1421 if (netdev_dev->vport_stats_error) {
1428 /* If this port is an internal port then the transmit and receive stats
1429 * will appear to be swapped relative to the other ports since we are the
1430 * one sending the data, not a remote computer. For consistency, we swap
1431 * them back here. This does not apply if we are getting stats from the
1432 * vport layer because it always tracks stats from the perspective of the
1434 if (netdev_dev->vport_stats_error) {
1436 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1437 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1438 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1439 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1440 stats->rx_length_errors = 0;
1441 stats->rx_over_errors = 0;
1442 stats->rx_crc_errors = 0;
1443 stats->rx_frame_errors = 0;
1444 stats->rx_fifo_errors = 0;
1445 stats->rx_missed_errors = 0;
1446 stats->tx_aborted_errors = 0;
1447 stats->tx_carrier_errors = 0;
1448 stats->tx_fifo_errors = 0;
1449 stats->tx_heartbeat_errors = 0;
1450 stats->tx_window_errors = 0;
1452 stats->rx_dropped += dev_stats.tx_dropped;
1453 stats->tx_dropped += dev_stats.rx_dropped;
1455 stats->rx_errors += dev_stats.tx_errors;
1456 stats->tx_errors += dev_stats.rx_errors;
1458 stats->multicast += dev_stats.multicast;
1459 stats->collisions += dev_stats.collisions;
1465 netdev_internal_get_stats(const struct netdev *netdev_,
1466 struct netdev_stats *stats)
1468 struct netdev_dev_linux *netdev_dev =
1469 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1471 get_stats_via_vport(netdev_, stats);
1472 return netdev_dev->vport_stats_error;
1475 /* Stores the features supported by 'netdev' into each of '*current',
1476 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1477 * bitmap of NETDEV_* bits. Returns 0 if successful, otherwise a positive
1480 netdev_linux_get_features(const struct netdev *netdev,
1481 enum netdev_features *current,
1482 enum netdev_features *advertised,
1483 enum netdev_features *supported,
1484 enum netdev_features *peer)
1486 struct ethtool_cmd ecmd;
1490 memset(&ecmd, 0, sizeof ecmd);
1491 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1492 ETHTOOL_GSET, "ETHTOOL_GSET");
1497 /* Supported features. */
1499 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1500 *supported |= NETDEV_F_10MB_HD;
1502 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1503 *supported |= NETDEV_F_10MB_FD;
1505 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1506 *supported |= NETDEV_F_100MB_HD;
1508 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1509 *supported |= NETDEV_F_100MB_FD;
1511 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1512 *supported |= NETDEV_F_1GB_HD;
1514 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1515 *supported |= NETDEV_F_1GB_FD;
1517 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1518 *supported |= NETDEV_F_10GB_FD;
1520 if (ecmd.supported & SUPPORTED_TP) {
1521 *supported |= NETDEV_F_COPPER;
1523 if (ecmd.supported & SUPPORTED_FIBRE) {
1524 *supported |= NETDEV_F_FIBER;
1526 if (ecmd.supported & SUPPORTED_Autoneg) {
1527 *supported |= NETDEV_F_AUTONEG;
1529 if (ecmd.supported & SUPPORTED_Pause) {
1530 *supported |= NETDEV_F_PAUSE;
1532 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1533 *supported |= NETDEV_F_PAUSE_ASYM;
1536 /* Advertised features. */
1538 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1539 *advertised |= NETDEV_F_10MB_HD;
1541 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1542 *advertised |= NETDEV_F_10MB_FD;
1544 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1545 *advertised |= NETDEV_F_100MB_HD;
1547 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1548 *advertised |= NETDEV_F_100MB_FD;
1550 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1551 *advertised |= NETDEV_F_1GB_HD;
1553 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1554 *advertised |= NETDEV_F_1GB_FD;
1556 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1557 *advertised |= NETDEV_F_10GB_FD;
1559 if (ecmd.advertising & ADVERTISED_TP) {
1560 *advertised |= NETDEV_F_COPPER;
1562 if (ecmd.advertising & ADVERTISED_FIBRE) {
1563 *advertised |= NETDEV_F_FIBER;
1565 if (ecmd.advertising & ADVERTISED_Autoneg) {
1566 *advertised |= NETDEV_F_AUTONEG;
1568 if (ecmd.advertising & ADVERTISED_Pause) {
1569 *advertised |= NETDEV_F_PAUSE;
1571 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1572 *advertised |= NETDEV_F_PAUSE_ASYM;
1575 /* Current settings. */
1577 if (speed == SPEED_10) {
1578 *current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1579 } else if (speed == SPEED_100) {
1580 *current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1581 } else if (speed == SPEED_1000) {
1582 *current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1583 } else if (speed == SPEED_10000) {
1584 *current = NETDEV_F_10GB_FD;
1585 } else if (speed == 40000) {
1586 *current = NETDEV_F_40GB_FD;
1587 } else if (speed == 100000) {
1588 *current = NETDEV_F_100GB_FD;
1589 } else if (speed == 1000000) {
1590 *current = NETDEV_F_1TB_FD;
1595 if (ecmd.port == PORT_TP) {
1596 *current |= NETDEV_F_COPPER;
1597 } else if (ecmd.port == PORT_FIBRE) {
1598 *current |= NETDEV_F_FIBER;
1602 *current |= NETDEV_F_AUTONEG;
1605 /* Peer advertisements. */
1606 *peer = 0; /* XXX */
1611 /* Set the features advertised by 'netdev' to 'advertise'. */
1613 netdev_linux_set_advertisements(struct netdev *netdev,
1614 enum netdev_features advertise)
1616 struct ethtool_cmd ecmd;
1619 memset(&ecmd, 0, sizeof ecmd);
1620 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1621 ETHTOOL_GSET, "ETHTOOL_GSET");
1626 ecmd.advertising = 0;
1627 if (advertise & NETDEV_F_10MB_HD) {
1628 ecmd.advertising |= ADVERTISED_10baseT_Half;
1630 if (advertise & NETDEV_F_10MB_FD) {
1631 ecmd.advertising |= ADVERTISED_10baseT_Full;
1633 if (advertise & NETDEV_F_100MB_HD) {
1634 ecmd.advertising |= ADVERTISED_100baseT_Half;
1636 if (advertise & NETDEV_F_100MB_FD) {
1637 ecmd.advertising |= ADVERTISED_100baseT_Full;
1639 if (advertise & NETDEV_F_1GB_HD) {
1640 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1642 if (advertise & NETDEV_F_1GB_FD) {
1643 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1645 if (advertise & NETDEV_F_10GB_FD) {
1646 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1648 if (advertise & NETDEV_F_COPPER) {
1649 ecmd.advertising |= ADVERTISED_TP;
1651 if (advertise & NETDEV_F_FIBER) {
1652 ecmd.advertising |= ADVERTISED_FIBRE;
1654 if (advertise & NETDEV_F_AUTONEG) {
1655 ecmd.advertising |= ADVERTISED_Autoneg;
1657 if (advertise & NETDEV_F_PAUSE) {
1658 ecmd.advertising |= ADVERTISED_Pause;
1660 if (advertise & NETDEV_F_PAUSE_ASYM) {
1661 ecmd.advertising |= ADVERTISED_Asym_Pause;
1663 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1664 ETHTOOL_SSET, "ETHTOOL_SSET");
1667 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1668 * successful, otherwise a positive errno value. */
1670 netdev_linux_set_policing(struct netdev *netdev,
1671 uint32_t kbits_rate, uint32_t kbits_burst)
1673 struct netdev_dev_linux *netdev_dev =
1674 netdev_dev_linux_cast(netdev_get_dev(netdev));
1675 const char *netdev_name = netdev_get_name(netdev);
1679 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1680 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1681 : kbits_burst); /* Stick with user-specified value. */
1683 if (netdev_dev->cache_valid & VALID_POLICING
1684 && netdev_dev->kbits_rate == kbits_rate
1685 && netdev_dev->kbits_burst == kbits_burst) {
1686 /* Assume that settings haven't changed since we last set them. */
1690 COVERAGE_INC(netdev_set_policing);
1691 /* Remove any existing ingress qdisc. */
1692 error = tc_add_del_ingress_qdisc(netdev, false);
1694 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1695 netdev_name, strerror(error));
1700 error = tc_add_del_ingress_qdisc(netdev, true);
1702 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1703 netdev_name, strerror(error));
1707 error = tc_add_policer(netdev, kbits_rate, kbits_burst);
1709 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1710 netdev_name, strerror(error));
1715 netdev_dev->kbits_rate = kbits_rate;
1716 netdev_dev->kbits_burst = kbits_burst;
1717 netdev_dev->cache_valid |= VALID_POLICING;
1723 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1726 const struct tc_ops **opsp;
1728 for (opsp = tcs; *opsp != NULL; opsp++) {
1729 const struct tc_ops *ops = *opsp;
1730 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1731 sset_add(types, ops->ovs_name);
1737 static const struct tc_ops *
1738 tc_lookup_ovs_name(const char *name)
1740 const struct tc_ops **opsp;
1742 for (opsp = tcs; *opsp != NULL; opsp++) {
1743 const struct tc_ops *ops = *opsp;
1744 if (!strcmp(name, ops->ovs_name)) {
1751 static const struct tc_ops *
1752 tc_lookup_linux_name(const char *name)
1754 const struct tc_ops **opsp;
1756 for (opsp = tcs; *opsp != NULL; opsp++) {
1757 const struct tc_ops *ops = *opsp;
1758 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1765 static struct tc_queue *
1766 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1769 struct netdev_dev_linux *netdev_dev =
1770 netdev_dev_linux_cast(netdev_get_dev(netdev));
1771 struct tc_queue *queue;
1773 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1774 if (queue->queue_id == queue_id) {
/* Returns the queue numbered 'queue_id' on 'netdev', or whatever
 * tc_find_queue__() returns when there is no such queue.  The bucket is
 * chosen by hashing 'queue_id' with hash_int(). */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    size_t hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, hash);
}
1788 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1790 struct netdev_qos_capabilities *caps)
1792 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1796 caps->n_queues = ops->n_queues;
1801 netdev_linux_get_qos(const struct netdev *netdev,
1802 const char **typep, struct shash *details)
1804 struct netdev_dev_linux *netdev_dev =
1805 netdev_dev_linux_cast(netdev_get_dev(netdev));
1808 error = tc_query_qdisc(netdev);
1813 *typep = netdev_dev->tc->ops->ovs_name;
1814 return (netdev_dev->tc->ops->qdisc_get
1815 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
1820 netdev_linux_set_qos(struct netdev *netdev,
1821 const char *type, const struct shash *details)
1823 struct netdev_dev_linux *netdev_dev =
1824 netdev_dev_linux_cast(netdev_get_dev(netdev));
1825 const struct tc_ops *new_ops;
1828 new_ops = tc_lookup_ovs_name(type);
1829 if (!new_ops || !new_ops->tc_install) {
1833 error = tc_query_qdisc(netdev);
1838 if (new_ops == netdev_dev->tc->ops) {
1839 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1841 /* Delete existing qdisc. */
1842 error = tc_del_qdisc(netdev);
1846 assert(netdev_dev->tc == NULL);
1848 /* Install new qdisc. */
1849 error = new_ops->tc_install(netdev, details);
1850 assert((error == 0) == (netdev_dev->tc != NULL));
1857 netdev_linux_get_queue(const struct netdev *netdev,
1858 unsigned int queue_id, struct shash *details)
1860 struct netdev_dev_linux *netdev_dev =
1861 netdev_dev_linux_cast(netdev_get_dev(netdev));
1864 error = tc_query_qdisc(netdev);
1868 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1870 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
1876 netdev_linux_set_queue(struct netdev *netdev,
1877 unsigned int queue_id, const struct shash *details)
1879 struct netdev_dev_linux *netdev_dev =
1880 netdev_dev_linux_cast(netdev_get_dev(netdev));
1883 error = tc_query_qdisc(netdev);
1886 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1887 || !netdev_dev->tc->ops->class_set) {
1891 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
1895 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1897 struct netdev_dev_linux *netdev_dev =
1898 netdev_dev_linux_cast(netdev_get_dev(netdev));
1901 error = tc_query_qdisc(netdev);
1904 } else if (!netdev_dev->tc->ops->class_delete) {
1907 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1909 ? netdev_dev->tc->ops->class_delete(netdev, queue)
1915 netdev_linux_get_queue_stats(const struct netdev *netdev,
1916 unsigned int queue_id,
1917 struct netdev_queue_stats *stats)
1919 struct netdev_dev_linux *netdev_dev =
1920 netdev_dev_linux_cast(netdev_get_dev(netdev));
1923 error = tc_query_qdisc(netdev);
1926 } else if (!netdev_dev->tc->ops->class_get_stats) {
1929 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1931 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
1937 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1939 struct ofpbuf request;
1940 struct tcmsg *tcmsg;
1942 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1946 tcmsg->tcm_parent = 0;
1947 nl_dump_start(dump, rtnl_sock, &request);
1948 ofpbuf_uninit(&request);
1953 netdev_linux_dump_queues(const struct netdev *netdev,
1954 netdev_dump_queues_cb *cb, void *aux)
1956 struct netdev_dev_linux *netdev_dev =
1957 netdev_dev_linux_cast(netdev_get_dev(netdev));
1958 struct tc_queue *queue;
1959 struct shash details;
1963 error = tc_query_qdisc(netdev);
1966 } else if (!netdev_dev->tc->ops->class_get) {
1971 shash_init(&details);
1972 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
1973 shash_clear(&details);
1975 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
1977 (*cb)(queue->queue_id, &details, aux);
1982 shash_destroy(&details);
1988 netdev_linux_dump_queue_stats(const struct netdev *netdev,
1989 netdev_dump_queue_stats_cb *cb, void *aux)
1991 struct netdev_dev_linux *netdev_dev =
1992 netdev_dev_linux_cast(netdev_get_dev(netdev));
1993 struct nl_dump dump;
1998 error = tc_query_qdisc(netdev);
2001 } else if (!netdev_dev->tc->ops->class_dump_stats) {
2006 if (!start_queue_dump(netdev, &dump)) {
2009 while (nl_dump_next(&dump, &msg)) {
2010 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
2016 error = nl_dump_done(&dump);
2017 return error ? error : last_error;
2021 netdev_linux_get_in4(const struct netdev *netdev_,
2022 struct in_addr *address, struct in_addr *netmask)
2024 struct netdev_dev_linux *netdev_dev =
2025 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2027 if (!(netdev_dev->cache_valid & VALID_IN4)) {
2030 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
2031 SIOCGIFADDR, "SIOCGIFADDR");
2036 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
2037 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2042 netdev_dev->cache_valid |= VALID_IN4;
2044 *address = netdev_dev->address;
2045 *netmask = netdev_dev->netmask;
2046 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
2050 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2051 struct in_addr netmask)
2053 struct netdev_dev_linux *netdev_dev =
2054 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2057 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2059 netdev_dev->cache_valid |= VALID_IN4;
2060 netdev_dev->address = address;
2061 netdev_dev->netmask = netmask;
2062 if (address.s_addr != INADDR_ANY) {
2063 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2064 "SIOCSIFNETMASK", netmask);
/* Parses 'line', one line in the format of /proc/net/if_inet6.
 *
 * The line must begin with 32 hex digits, which are stored as the 16 bytes
 * of '*in6'.  These are followed by four hex fields, which are skipped, and
 * an interface name of at most 16 characters, which is stored in 'ifname'.
 *
 * Returns true only if all 17 stored fields were converted.  On false,
 * '*in6' and 'ifname' may have been partially written. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *s6 = in6->s6_addr;
    int n_fields;

#define X8 "%2"SCNx8
    n_fields = sscanf(line,
                      " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                      "%*x %*x %*x %*x %16s\n",
                      &s6[0], &s6[1], &s6[2], &s6[3],
                      &s6[4], &s6[5], &s6[6], &s6[7],
                      &s6[8], &s6[9], &s6[10], &s6[11],
                      &s6[12], &s6[13], &s6[14], &s6[15],
                      ifname);
    return n_fields == 17;
}
2086 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2087 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2089 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2091 struct netdev_dev_linux *netdev_dev =
2092 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2093 if (!(netdev_dev->cache_valid & VALID_IN6)) {
2097 netdev_dev->in6 = in6addr_any;
2099 file = fopen("/proc/net/if_inet6", "r");
2101 const char *name = netdev_get_name(netdev_);
2102 while (fgets(line, sizeof line, file)) {
2103 struct in6_addr in6_tmp;
2104 char ifname[16 + 1];
2105 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2106 && !strcmp(name, ifname))
2108 netdev_dev->in6 = in6_tmp;
2114 netdev_dev->cache_valid |= VALID_IN6;
2116 *in6 = netdev_dev->in6;
/* Initializes '*sa' as an AF_INET socket address that holds 'addr' and
 * port 0.  All other bytes of '*sa' are zeroed. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    memset(&in4, 0, sizeof in4);
    in4.sin_family = AF_INET;
    in4.sin_port = 0;
    in4.sin_addr = addr;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &in4, sizeof in4);
}
2134 do_set_addr(struct netdev *netdev,
2135 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2138 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2139 make_in4_sockaddr(&ifr.ifr_addr, addr);
2141 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2145 /* Adds 'router' as a default IP gateway. */
2147 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2149 struct in_addr any = { INADDR_ANY };
2153 memset(&rt, 0, sizeof rt);
2154 make_in4_sockaddr(&rt.rt_dst, any);
2155 make_in4_sockaddr(&rt.rt_gateway, router);
2156 make_in4_sockaddr(&rt.rt_genmask, any);
2157 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2158 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2160 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
2166 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2169 static const char fn[] = "/proc/net/route";
2174 *netdev_name = NULL;
2175 stream = fopen(fn, "r");
2176 if (stream == NULL) {
2177 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2182 while (fgets(line, sizeof line, stream)) {
2185 ovs_be32 dest, gateway, mask;
2186 int refcnt, metric, mtu;
2187 unsigned int flags, use, window, irtt;
2190 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2192 iface, &dest, &gateway, &flags, &refcnt,
2193 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2195 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2199 if (!(flags & RTF_UP)) {
2200 /* Skip routes that aren't up. */
2204 /* The output of 'dest', 'mask', and 'gateway' were given in
2205 * network byte order, so we don't need need any endian
2206 * conversions here. */
2207 if ((dest & mask) == (host->s_addr & mask)) {
2209 /* The host is directly reachable. */
2210 next_hop->s_addr = 0;
2212 /* To reach the host, we must go through a gateway. */
2213 next_hop->s_addr = gateway;
2215 *netdev_name = xstrdup(iface);
2227 netdev_linux_get_status(const struct netdev *netdev, struct shash *sh)
2230 struct netdev_dev_linux *netdev_dev =
2231 netdev_dev_linux_cast(netdev_get_dev(netdev));
2233 error = netdev_linux_get_drvinfo(netdev_dev);
2235 shash_add(sh, "driver_name", xstrdup(netdev_dev->drvinfo.driver));
2236 shash_add(sh, "driver_version", xstrdup(netdev_dev->drvinfo.version));
2237 shash_add(sh, "firmware_version", xstrdup(netdev_dev->drvinfo.fw_version));
2243 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED, struct shash *sh)
2245 shash_add(sh, "driver_name", xstrdup("openvswitch"));
2249 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2250 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2251 * returns 0. Otherwise, it returns a positive errno value; in particular,
2252 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2254 netdev_linux_arp_lookup(const struct netdev *netdev,
2255 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2258 struct sockaddr_in sin;
2261 memset(&r, 0, sizeof r);
2262 memset(&sin, 0, sizeof sin);
2263 sin.sin_family = AF_INET;
2264 sin.sin_addr.s_addr = ip;
2266 memcpy(&r.arp_pa, &sin, sizeof sin);
2267 r.arp_ha.sa_family = ARPHRD_ETHER;
2269 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2270 COVERAGE_INC(netdev_arp_lookup);
2271 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2273 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2274 } else if (retval != ENXIO) {
2275 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2276 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
2282 nd_to_iff_flags(enum netdev_flags nd)
2285 if (nd & NETDEV_UP) {
2288 if (nd & NETDEV_PROMISC) {
2295 iff_to_nd_flags(int iff)
2297 enum netdev_flags nd = 0;
2301 if (iff & IFF_PROMISC) {
2302 nd |= NETDEV_PROMISC;
2308 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2309 enum netdev_flags on, enum netdev_flags *old_flagsp)
2311 struct netdev_dev_linux *netdev_dev;
2312 int old_flags, new_flags;
2315 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2316 old_flags = netdev_dev->ifi_flags;
2317 *old_flagsp = iff_to_nd_flags(old_flags);
2318 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2319 if (new_flags != old_flags) {
2320 error = set_flags(netdev, new_flags);
2321 get_flags(&netdev_dev->netdev_dev, &netdev_dev->ifi_flags);
2327 netdev_linux_change_seq(const struct netdev *netdev)
2329 return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
/* Expands to an initializer for a netdev class.  Most members are the
 * netdev_linux_*() functions listed below.  The members that vary between
 * classes come from the macro parameters: NAME, CREATE, GET_STATS and
 * SET_STATS appear on the #define line, and any further parameters are on a
 * continuation line that is not shown here.
 *
 * NOTE(review): the order of the entries must match the member order of
 * struct netdev_class, which is declared elsewhere -- confirm when adding or
 * removing entries. */
2332 #define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS, \
2337 netdev_linux_init, \
2339 netdev_linux_wait, \
2342 netdev_linux_destroy, \
2343 NULL, /* get_config */ \
2344 NULL, /* set_config */ \
2346 netdev_linux_open, \
2347 netdev_linux_close, \
2349 netdev_linux_listen, \
2350 netdev_linux_recv, \
2351 netdev_linux_recv_wait, \
2352 netdev_linux_drain, \
2354 netdev_linux_send, \
2355 netdev_linux_send_wait, \
2357 netdev_linux_set_etheraddr, \
2358 netdev_linux_get_etheraddr, \
2359 netdev_linux_get_mtu, \
2360 netdev_linux_set_mtu, \
2361 netdev_linux_get_ifindex, \
2362 netdev_linux_get_carrier, \
2363 netdev_linux_get_carrier_resets, \
2364 netdev_linux_set_miimon_interval, \
2368 netdev_linux_get_features, \
2369 netdev_linux_set_advertisements, \
2371 netdev_linux_set_policing, \
2372 netdev_linux_get_qos_types, \
2373 netdev_linux_get_qos_capabilities, \
2374 netdev_linux_get_qos, \
2375 netdev_linux_set_qos, \
2376 netdev_linux_get_queue, \
2377 netdev_linux_set_queue, \
2378 netdev_linux_delete_queue, \
2379 netdev_linux_get_queue_stats, \
2380 netdev_linux_dump_queues, \
2381 netdev_linux_dump_queue_stats, \
2383 netdev_linux_get_in4, \
2384 netdev_linux_set_in4, \
2385 netdev_linux_get_in6, \
2386 netdev_linux_add_router, \
2387 netdev_linux_get_next_hop, \
2389 netdev_linux_arp_lookup, \
2391 netdev_linux_update_flags, \
2393 netdev_linux_change_seq \
/* Class for ordinary Linux network devices: created by
 * netdev_linux_create(), with stats from netdev_linux_get_stats() and no
 * set_stats support. */
2396 const struct netdev_class netdev_linux_class =
2399 netdev_linux_create,
2400 netdev_linux_get_stats,
2401 NULL, /* set_stats */
2402 netdev_linux_get_status);
/* Class for tap devices: created by netdev_linux_create_tap(), with stats
 * from netdev_tap_get_stats() and no set_stats support. */
2404 const struct netdev_class netdev_tap_class =
2407 netdev_linux_create_tap,
2408 netdev_tap_get_stats,
2409 NULL, /* set_stats */
2410 netdev_linux_get_status);
/* Class for internal devices: created by netdev_linux_create(), with stats
 * read via netdev_internal_get_stats(), written via netdev_vport_set_stats(),
 * and status from netdev_internal_get_status(). */
2412 const struct netdev_class netdev_internal_class =
2415 netdev_linux_create,
2416 netdev_internal_get_stats,
2417 netdev_vport_set_stats,
2418 netdev_internal_get_status);
2420 /* HTB traffic control class. */
2422 #define HTB_N_QUEUES 0xf000
2426 unsigned int max_rate; /* In bytes/s. */
2430 struct tc_queue tc_queue;
2431 unsigned int min_rate; /* In bytes/s. */
2432 unsigned int max_rate; /* In bytes/s. */
2433 unsigned int burst; /* In bytes. */
2434 unsigned int priority; /* Lower values are higher priorities. */
2438 htb_get__(const struct netdev *netdev)
2440 struct netdev_dev_linux *netdev_dev =
2441 netdev_dev_linux_cast(netdev_get_dev(netdev));
2442 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
2446 htb_install__(struct netdev *netdev, uint64_t max_rate)
2448 struct netdev_dev_linux *netdev_dev =
2449 netdev_dev_linux_cast(netdev_get_dev(netdev));
2452 htb = xmalloc(sizeof *htb);
2453 tc_init(&htb->tc, &tc_ops_htb);
2454 htb->max_rate = max_rate;
2456 netdev_dev->tc = &htb->tc;
2459 /* Create an HTB qdisc.
2461 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2463 htb_setup_qdisc__(struct netdev *netdev)
2466 struct tc_htb_glob opt;
2467 struct ofpbuf request;
2468 struct tcmsg *tcmsg;
2470 tc_del_qdisc(netdev);
2472 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2473 NLM_F_EXCL | NLM_F_CREATE, &request);
2477 tcmsg->tcm_handle = tc_make_handle(1, 0);
2478 tcmsg->tcm_parent = TC_H_ROOT;
2480 nl_msg_put_string(&request, TCA_KIND, "htb");
2482 memset(&opt, 0, sizeof opt);
2483 opt.rate2quantum = 10;
2487 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2488 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2489 nl_msg_end_nested(&request, opt_offset);
2491 return tc_transact(&request, NULL);
2494 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2495 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2497 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2498 unsigned int parent, struct htb_class *class)
2501 struct tc_htb_opt opt;
2502 struct ofpbuf request;
2503 struct tcmsg *tcmsg;
2507 error = netdev_get_mtu(netdev, &mtu);
2509 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2510 netdev_get_name(netdev));
2514 memset(&opt, 0, sizeof opt);
2515 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2516 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2517 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2518 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2519 opt.prio = class->priority;
2521 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2525 tcmsg->tcm_handle = handle;
2526 tcmsg->tcm_parent = parent;
2528 nl_msg_put_string(&request, TCA_KIND, "htb");
2529 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2530 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2531 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2532 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2533 nl_msg_end_nested(&request, opt_offset);
2535 error = tc_transact(&request, NULL);
2537 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2538 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2539 netdev_get_name(netdev),
2540 tc_get_major(handle), tc_get_minor(handle),
2541 tc_get_major(parent), tc_get_minor(parent),
2542 class->min_rate, class->max_rate,
2543 class->burst, class->priority, strerror(error));
2548 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2549 * description of them into 'details'. The description complies with the
2550 * specification given in the vswitch database documentation for linux-htb
2553 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2555 static const struct nl_policy tca_htb_policy[] = {
2556 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2557 .min_len = sizeof(struct tc_htb_opt) },
2560 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2561 const struct tc_htb_opt *htb;
2563 if (!nl_parse_nested(nl_options, tca_htb_policy,
2564 attrs, ARRAY_SIZE(tca_htb_policy))) {
2565 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2569 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2570 class->min_rate = htb->rate.rate;
2571 class->max_rate = htb->ceil.rate;
2572 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2573 class->priority = htb->prio;
2578 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2579 struct htb_class *options,
2580 struct netdev_queue_stats *stats)
2582 struct nlattr *nl_options;
2583 unsigned int handle;
2586 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2587 if (!error && queue_id) {
2588 unsigned int major = tc_get_major(handle);
2589 unsigned int minor = tc_get_minor(handle);
2590 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2591 *queue_id = minor - 1;
2596 if (!error && options) {
2597 error = htb_parse_tca_options__(nl_options, options);
2603 htb_parse_qdisc_details__(struct netdev *netdev,
2604 const struct shash *details, struct htb_class *hc)
2606 const char *max_rate_s;
2608 max_rate_s = shash_find_data(details, "max-rate");
2609 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2610 if (!hc->max_rate) {
2613 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2614 hc->max_rate = netdev_features_to_bps(current) / 8;
2616 hc->min_rate = hc->max_rate;
2622 htb_parse_class_details__(struct netdev *netdev,
2623 const struct shash *details, struct htb_class *hc)
2625 const struct htb *htb = htb_get__(netdev);
2626 const char *min_rate_s = shash_find_data(details, "min-rate");
2627 const char *max_rate_s = shash_find_data(details, "max-rate");
2628 const char *burst_s = shash_find_data(details, "burst");
2629 const char *priority_s = shash_find_data(details, "priority");
2632 error = netdev_get_mtu(netdev, &mtu);
2634 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2635 netdev_get_name(netdev));
2639 /* HTB requires at least an mtu sized min-rate to send any traffic even
2640 * on uncongested links. */
2641 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2642 hc->min_rate = MAX(hc->min_rate, mtu);
2643 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2646 hc->max_rate = (max_rate_s
2647 ? strtoull(max_rate_s, NULL, 10) / 8
2649 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2650 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2654 * According to hints in the documentation that I've read, it is important
2655 * that 'burst' be at least as big as the largest frame that might be
2656 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2657 * but having it a bit too small is a problem. Since netdev_get_mtu()
2658 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2659 * the MTU. We actually add 64, instead of 14, as a guard against
2660 * additional headers get tacked on somewhere that we're not aware of. */
2661 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2662 hc->burst = MAX(hc->burst, mtu + 64);
2665 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
2671 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2672 unsigned int parent, struct htb_class *options,
2673 struct netdev_queue_stats *stats)
2675 struct ofpbuf *reply;
2678 error = tc_query_class(netdev, handle, parent, &reply);
2680 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2681 ofpbuf_delete(reply);
2687 htb_tc_install(struct netdev *netdev, const struct shash *details)
2691 error = htb_setup_qdisc__(netdev);
2693 struct htb_class hc;
2695 htb_parse_qdisc_details__(netdev, details, &hc);
2696 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2697 tc_make_handle(1, 0), &hc);
2699 htb_install__(netdev, hc.max_rate);
2705 static struct htb_class *
2706 htb_class_cast__(const struct tc_queue *queue)
2708 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2712 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2713 const struct htb_class *hc)
2715 struct htb *htb = htb_get__(netdev);
2716 size_t hash = hash_int(queue_id, 0);
2717 struct tc_queue *queue;
2718 struct htb_class *hcp;
2720 queue = tc_find_queue__(netdev, queue_id, hash);
2722 hcp = htb_class_cast__(queue);
2724 hcp = xmalloc(sizeof *hcp);
2725 queue = &hcp->tc_queue;
2726 queue->queue_id = queue_id;
2727 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2730 hcp->min_rate = hc->min_rate;
2731 hcp->max_rate = hc->max_rate;
2732 hcp->burst = hc->burst;
2733 hcp->priority = hc->priority;
2737 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2740 struct nl_dump dump;
2741 struct htb_class hc;
2743 /* Get qdisc options. */
2745 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2746 htb_install__(netdev, hc.max_rate);
2749 if (!start_queue_dump(netdev, &dump)) {
2752 while (nl_dump_next(&dump, &msg)) {
2753 unsigned int queue_id;
2755 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2756 htb_update_queue__(netdev, queue_id, &hc);
2759 nl_dump_done(&dump);
2765 htb_tc_destroy(struct tc *tc)
2767 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2768 struct htb_class *hc, *next;
2770 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2771 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2779 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2781 const struct htb *htb = htb_get__(netdev);
2782 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
2787 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2789 struct htb_class hc;
2792 htb_parse_qdisc_details__(netdev, details, &hc);
2793 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2794 tc_make_handle(1, 0), &hc);
2796 htb_get__(netdev)->max_rate = hc.max_rate;
2802 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2803 const struct tc_queue *queue, struct shash *details)
2805 const struct htb_class *hc = htb_class_cast__(queue);
2807 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2808 if (hc->min_rate != hc->max_rate) {
2809 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2811 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2813 shash_add(details, "priority", xasprintf("%u", hc->priority));
2819 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2820 const struct shash *details)
2822 struct htb_class hc;
2825 error = htb_parse_class_details__(netdev, details, &hc);
2830 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2831 tc_make_handle(1, 0xfffe), &hc);
2836 htb_update_queue__(netdev, queue_id, &hc);
2841 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2843 struct htb_class *hc = htb_class_cast__(queue);
2844 struct htb *htb = htb_get__(netdev);
2847 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2849 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2856 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2857 struct netdev_queue_stats *stats)
2859 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2860 tc_make_handle(1, 0xfffe), NULL, stats);
2864 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2865 const struct ofpbuf *nlmsg,
2866 netdev_dump_queue_stats_cb *cb, void *aux)
2868 struct netdev_queue_stats stats;
2869 unsigned int handle, major, minor;
2872 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2877 major = tc_get_major(handle);
2878 minor = tc_get_minor(handle);
2879 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2880 (*cb)(minor - 1, &stats, aux);
2885 static const struct tc_ops tc_ops_htb = {
2886 "htb", /* linux_name */
2887 "linux-htb", /* ovs_name */
2888 HTB_N_QUEUES, /* n_queues */
2897 htb_class_get_stats,
2898 htb_class_dump_stats
2901 /* "linux-hfsc" traffic control class. */
2903 #define HFSC_N_QUEUES 0xf000
2911 struct tc_queue tc_queue;
2916 static struct hfsc *
2917 hfsc_get__(const struct netdev *netdev)
2919 struct netdev_dev_linux *netdev_dev;
2920 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2921 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
2924 static struct hfsc_class *
2925 hfsc_class_cast__(const struct tc_queue *queue)
2927 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
2931 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2933 struct netdev_dev_linux * netdev_dev;
2936 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2937 hfsc = xmalloc(sizeof *hfsc);
2938 tc_init(&hfsc->tc, &tc_ops_hfsc);
2939 hfsc->max_rate = max_rate;
2940 netdev_dev->tc = &hfsc->tc;
2944 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2945 const struct hfsc_class *hc)
2949 struct hfsc_class *hcp;
2950 struct tc_queue *queue;
2952 hfsc = hfsc_get__(netdev);
2953 hash = hash_int(queue_id, 0);
2955 queue = tc_find_queue__(netdev, queue_id, hash);
2957 hcp = hfsc_class_cast__(queue);
2959 hcp = xmalloc(sizeof *hcp);
2960 queue = &hcp->tc_queue;
2961 queue->queue_id = queue_id;
2962 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
2965 hcp->min_rate = hc->min_rate;
2966 hcp->max_rate = hc->max_rate;
2970 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2972 const struct tc_service_curve *rsc, *fsc, *usc;
2973 static const struct nl_policy tca_hfsc_policy[] = {
2975 .type = NL_A_UNSPEC,
2977 .min_len = sizeof(struct tc_service_curve),
2980 .type = NL_A_UNSPEC,
2982 .min_len = sizeof(struct tc_service_curve),
2985 .type = NL_A_UNSPEC,
2987 .min_len = sizeof(struct tc_service_curve),
2990 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
2992 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
2993 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
2994 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
2998 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
2999 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3000 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3002 if (rsc->m1 != 0 || rsc->d != 0 ||
3003 fsc->m1 != 0 || fsc->d != 0 ||
3004 usc->m1 != 0 || usc->d != 0) {
3005 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3006 "Non-linear service curves are not supported.");
3010 if (rsc->m2 != fsc->m2) {
3011 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3012 "Real-time service curves are not supported ");
3016 if (rsc->m2 > usc->m2) {
3017 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3018 "Min-rate service curve is greater than "
3019 "the max-rate service curve.");
3023 class->min_rate = fsc->m2;
3024 class->max_rate = usc->m2;
3029 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3030 struct hfsc_class *options,
3031 struct netdev_queue_stats *stats)
3034 unsigned int handle;
3035 struct nlattr *nl_options;
3037 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3043 unsigned int major, minor;
3045 major = tc_get_major(handle);
3046 minor = tc_get_minor(handle);
3047 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3048 *queue_id = minor - 1;
3055 error = hfsc_parse_tca_options__(nl_options, options);
3062 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3063 unsigned int parent, struct hfsc_class *options,
3064 struct netdev_queue_stats *stats)
3067 struct ofpbuf *reply;
3069 error = tc_query_class(netdev, handle, parent, &reply);
3074 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3075 ofpbuf_delete(reply);
3080 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
3081 struct hfsc_class *class)
3084 const char *max_rate_s;
3086 max_rate_s = shash_find_data(details, "max-rate");
3087 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3092 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
3093 max_rate = netdev_features_to_bps(current) / 8;
3096 class->min_rate = max_rate;
3097 class->max_rate = max_rate;
3101 hfsc_parse_class_details__(struct netdev *netdev,
3102 const struct shash *details,
3103 struct hfsc_class * class)
3105 const struct hfsc *hfsc;
3106 uint32_t min_rate, max_rate;
3107 const char *min_rate_s, *max_rate_s;
3109 hfsc = hfsc_get__(netdev);
3110 min_rate_s = shash_find_data(details, "min-rate");
3111 max_rate_s = shash_find_data(details, "max-rate");
3113 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3114 min_rate = MAX(min_rate, 1);
3115 min_rate = MIN(min_rate, hfsc->max_rate);
3117 max_rate = (max_rate_s
3118 ? strtoull(max_rate_s, NULL, 10) / 8
3120 max_rate = MAX(max_rate, min_rate);
3121 max_rate = MIN(max_rate, hfsc->max_rate);
3123 class->min_rate = min_rate;
3124 class->max_rate = max_rate;
3129 /* Create an HFSC qdisc.
3131 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3133 hfsc_setup_qdisc__(struct netdev * netdev)
3135 struct tcmsg *tcmsg;
3136 struct ofpbuf request;
3137 struct tc_hfsc_qopt opt;
3139 tc_del_qdisc(netdev);
3141 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3142 NLM_F_EXCL | NLM_F_CREATE, &request);
3148 tcmsg->tcm_handle = tc_make_handle(1, 0);
3149 tcmsg->tcm_parent = TC_H_ROOT;
3151 memset(&opt, 0, sizeof opt);
3154 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3155 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3157 return tc_transact(&request, NULL);
3160 /* Create an HFSC class.
3162 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3163 * sc rate <min_rate> ul rate <max_rate>" */
3165 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3166 unsigned int parent, struct hfsc_class *class)
3170 struct tcmsg *tcmsg;
3171 struct ofpbuf request;
3172 struct tc_service_curve min, max;
3174 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3180 tcmsg->tcm_handle = handle;
3181 tcmsg->tcm_parent = parent;
3185 min.m2 = class->min_rate;
3189 max.m2 = class->max_rate;
3191 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3192 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3193 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3194 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3195 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3196 nl_msg_end_nested(&request, opt_offset);
3198 error = tc_transact(&request, NULL);
3200 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3201 "min-rate %ubps, max-rate %ubps (%s)",
3202 netdev_get_name(netdev),
3203 tc_get_major(handle), tc_get_minor(handle),
3204 tc_get_major(parent), tc_get_minor(parent),
3205 class->min_rate, class->max_rate, strerror(error));
3212 hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3215 struct hfsc_class class;
3217 error = hfsc_setup_qdisc__(netdev);
3223 hfsc_parse_qdisc_details__(netdev, details, &class);
3224 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3225 tc_make_handle(1, 0), &class);
3231 hfsc_install__(netdev, class.max_rate);
3236 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3239 struct nl_dump dump;
3240 struct hfsc_class hc;
3243 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3244 hfsc_install__(netdev, hc.max_rate);
3246 if (!start_queue_dump(netdev, &dump)) {
3250 while (nl_dump_next(&dump, &msg)) {
3251 unsigned int queue_id;
3253 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3254 hfsc_update_queue__(netdev, queue_id, &hc);
3258 nl_dump_done(&dump);
3263 hfsc_tc_destroy(struct tc *tc)
3266 struct hfsc_class *hc, *next;
3268 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3270 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3271 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3280 hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3282 const struct hfsc *hfsc;
3283 hfsc = hfsc_get__(netdev);
3284 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
3289 hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3292 struct hfsc_class class;
3294 hfsc_parse_qdisc_details__(netdev, details, &class);
3295 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3296 tc_make_handle(1, 0), &class);
3299 hfsc_get__(netdev)->max_rate = class.max_rate;
3306 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3307 const struct tc_queue *queue, struct shash *details)
3309 const struct hfsc_class *hc;
3311 hc = hfsc_class_cast__(queue);
3312 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3313 if (hc->min_rate != hc->max_rate) {
3314 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
3320 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3321 const struct shash *details)
3324 struct hfsc_class class;
3326 error = hfsc_parse_class_details__(netdev, details, &class);
3331 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3332 tc_make_handle(1, 0xfffe), &class);
3337 hfsc_update_queue__(netdev, queue_id, &class);
3342 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3346 struct hfsc_class *hc;
3348 hc = hfsc_class_cast__(queue);
3349 hfsc = hfsc_get__(netdev);
3351 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3353 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3360 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3361 struct netdev_queue_stats *stats)
3363 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3364 tc_make_handle(1, 0xfffe), NULL, stats);
3368 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3369 const struct ofpbuf *nlmsg,
3370 netdev_dump_queue_stats_cb *cb, void *aux)
3372 struct netdev_queue_stats stats;
3373 unsigned int handle, major, minor;
3376 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3381 major = tc_get_major(handle);
3382 minor = tc_get_minor(handle);
3383 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3384 (*cb)(minor - 1, &stats, aux);
3389 static const struct tc_ops tc_ops_hfsc = {
3390 "hfsc", /* linux_name */
3391 "linux-hfsc", /* ovs_name */
3392 HFSC_N_QUEUES, /* n_queues */
3393 hfsc_tc_install, /* tc_install */
3394 hfsc_tc_load, /* tc_load */
3395 hfsc_tc_destroy, /* tc_destroy */
3396 hfsc_qdisc_get, /* qdisc_get */
3397 hfsc_qdisc_set, /* qdisc_set */
3398 hfsc_class_get, /* class_get */
3399 hfsc_class_set, /* class_set */
3400 hfsc_class_delete, /* class_delete */
3401 hfsc_class_get_stats, /* class_get_stats */
3402 hfsc_class_dump_stats /* class_dump_stats */
3405 /* "linux-default" traffic control class.
3407 * This class represents the default, unnamed Linux qdisc. It corresponds to
3408 * the "" (empty string) QoS type in the OVS database. */
3411 default_install__(struct netdev *netdev)
3413 struct netdev_dev_linux *netdev_dev =
3414 netdev_dev_linux_cast(netdev_get_dev(netdev));
3415 static struct tc *tc;
3418 tc = xmalloc(sizeof *tc);
3419 tc_init(tc, &tc_ops_default);
3421 netdev_dev->tc = tc;
3425 default_tc_install(struct netdev *netdev,
3426 const struct shash *details OVS_UNUSED)
3428 default_install__(netdev);
3433 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3435 default_install__(netdev);
3439 static const struct tc_ops tc_ops_default = {
3440 NULL, /* linux_name */
3445 NULL, /* tc_destroy */
3446 NULL, /* qdisc_get */
3447 NULL, /* qdisc_set */
3448 NULL, /* class_get */
3449 NULL, /* class_set */
3450 NULL, /* class_delete */
3451 NULL, /* class_get_stats */
3452 NULL /* class_dump_stats */
3455 /* "linux-other" traffic control class.
3460 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3462 struct netdev_dev_linux *netdev_dev =
3463 netdev_dev_linux_cast(netdev_get_dev(netdev));
3464 static struct tc *tc;
3467 tc = xmalloc(sizeof *tc);
3468 tc_init(tc, &tc_ops_other);
3470 netdev_dev->tc = tc;
3474 static const struct tc_ops tc_ops_other = {
3475 NULL, /* linux_name */
3476 "linux-other", /* ovs_name */
3478 NULL, /* tc_install */
3480 NULL, /* tc_destroy */
3481 NULL, /* qdisc_get */
3482 NULL, /* qdisc_set */
3483 NULL, /* class_get */
3484 NULL, /* class_set */
3485 NULL, /* class_delete */
3486 NULL, /* class_get_stats */
3487 NULL /* class_dump_stats */
3490 /* Traffic control. */
3492 /* Number of kernel "tc" ticks per second. */
3493 static double ticks_per_s;
/* Number of kernel "jiffies" per second.  This is used for the purpose of
 * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
 * one jiffy's worth of data.
 *
 * There are two possibilities here:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
 *      approximate range of 100 to 1024.  That means that we really need to
 *      make sure that the qdisc can buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
 *      has finely granular timers and there's no need to fudge additional room
 *      for buffers.  (There's no extra effort needed to implement that: the
 *      large 'buffer_hz' is used as a divisor, so practically any number will
 *      come out as 0 in the division.  Small integer results in the case of
 *      really high dividends won't have any real effect anyhow.)
 */
3512 static unsigned int buffer_hz;
/* Returns tc handle 'major':'minor'.
 *
 * 'major' is placed in the upper 16 bits and 'minor' in the lower 16 bits;
 * TC_H_MAKE masks each part, so bits of 'minor' above the low 16 are
 * discarded. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    return TC_H_MAKE(major << 16, minor);
}
/* Returns the major number from 'handle', that is, its upper 16 bits shifted
 * down into the range 0...0xffff. */
static unsigned int
tc_get_major(unsigned int handle)
{
    return TC_H_MAJ(handle) >> 16;
}
/* Returns the minor number from 'handle', that is, its lower 16 bits. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    return TC_H_MIN(handle);
}
3535 static struct tcmsg *
3536 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3537 struct ofpbuf *request)
3539 struct tcmsg *tcmsg;
3543 error = get_ifindex(netdev, &ifindex);
3548 ofpbuf_init(request, 512);
3549 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3550 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3551 tcmsg->tcm_family = AF_UNSPEC;
3552 tcmsg->tcm_ifindex = ifindex;
3553 /* Caller should fill in tcmsg->tcm_handle. */
3554 /* Caller should fill in tcmsg->tcm_parent. */
3560 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3562 int error = nl_sock_transact(rtnl_sock, request, replyp);
3563 ofpbuf_uninit(request);
3567 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3568 * policing configuration.
3570 * This function is equivalent to running the following when 'add' is true:
3571 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3573 * This function is equivalent to running the following when 'add' is false:
3574 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3576 * The configuration and stats may be seen with the following command:
3577 * /sbin/tc -s qdisc show dev <devname>
3579 * Returns 0 if successful, otherwise a positive errno value.
3582 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3584 struct ofpbuf request;
3585 struct tcmsg *tcmsg;
3587 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3588 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3590 tcmsg = tc_make_request(netdev, type, flags, &request);
3594 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3595 tcmsg->tcm_parent = TC_H_INGRESS;
3596 nl_msg_put_string(&request, TCA_KIND, "ingress");
3597 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3599 error = tc_transact(&request, NULL);
3601 /* If we're deleting the qdisc, don't worry about some of the
3602 * error conditions. */
3603 if (!add && (error == ENOENT || error == EINVAL)) {
3612 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3615 * This function is equivalent to running:
3616 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3617 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3620 * The configuration and stats may be seen with the following command:
3621 * /sbin/tc -s filter show <devname> eth0 parent ffff:
3623 * Returns 0 if successful, otherwise a positive errno value.
3626 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3628 struct tc_police tc_police;
3629 struct ofpbuf request;
3630 struct tcmsg *tcmsg;
3631 size_t basic_offset;
3632 size_t police_offset;
3636 memset(&tc_police, 0, sizeof tc_police);
3637 tc_police.action = TC_POLICE_SHOT;
3638 tc_police.mtu = mtu;
3639 tc_fill_rate(&tc_police.rate, kbits_rate/8 * 1000, mtu);
3640 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3641 kbits_burst * 1024);
3643 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3644 NLM_F_EXCL | NLM_F_CREATE, &request);
3648 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3649 tcmsg->tcm_info = tc_make_handle(49,
3650 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3652 nl_msg_put_string(&request, TCA_KIND, "basic");
3653 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3654 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3655 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3656 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3657 nl_msg_end_nested(&request, police_offset);
3658 nl_msg_end_nested(&request, basic_offset);
3660 error = tc_transact(&request, NULL);
3671 /* The values in psched are not individually very meaningful, but they are
3672 * important. The tables below show some values seen in the wild.
3676 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3677 * (Before that, there are hints that it was 1000000000.)
3679 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3683 * -----------------------------------
3684 * [1] 000c8000 000f4240 000f4240 00000064
3685 * [2] 000003e8 00000400 000f4240 3b9aca00
3686 * [3] 000003e8 00000400 000f4240 3b9aca00
3687 * [4] 000003e8 00000400 000f4240 00000064
3688 * [5] 000003e8 00000040 000f4240 3b9aca00
3689 * [6] 000003e8 00000040 000f4240 000000f9
3691 * a b c d ticks_per_s buffer_hz
3692 * ------- --------- ---------- ------------- ----------- -------------
3693 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3694 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3695 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3696 * [4] 1,000 1,024 1,000,000 100 976,562 100
3697 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3698 * [6] 1,000 64 1,000,000 249 15,625,000 249
3700 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3701 * [2] 2.6.26-1-686-bigmem from Debian lenny
3702 * [3] 2.6.26-2-sparc64 from Debian lenny
3703 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3704 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3705 * [6] 2.6.34 from kernel.org on KVM
3707 static const char fn[] = "/proc/net/psched";
3708 unsigned int a, b, c, d;
3714 stream = fopen(fn, "r");
3716 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3720 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3721 VLOG_WARN("%s: read failed", fn);
3725 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3729 VLOG_WARN("%s: invalid scheduler parameters", fn);
3733 ticks_per_s = (double) a * c / b;
3737 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3740 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3743 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3744 * rate of 'rate' bytes per second. */
3746 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3751 return (rate * ticks) / ticks_per_s;
3754 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3755 * rate of 'rate' bytes per second. */
3757 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3762 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3765 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3766 * a transmission rate of 'rate' bytes per second. */
3768 tc_buffer_per_jiffy(unsigned int rate)
3773 return rate / buffer_hz;
3776 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3777 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3778 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3779 * stores NULL into it if it is absent.
3781 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3784 * Returns 0 if successful, otherwise a positive errno value. */
3786 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3787 struct nlattr **options)
3789 static const struct nl_policy tca_policy[] = {
3790 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3791 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3793 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3795 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3796 tca_policy, ta, ARRAY_SIZE(ta))) {
3797 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3802 *kind = nl_attr_get_string(ta[TCA_KIND]);
3806 *options = ta[TCA_OPTIONS];
3821 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3822 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3823 * into '*options', and its queue statistics into '*stats'. Any of the output
3824 * arguments may be null.
3826 * Returns 0 if successful, otherwise a positive errno value. */
3828 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3829 struct nlattr **options, struct netdev_queue_stats *stats)
3831 static const struct nl_policy tca_policy[] = {
3832 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3833 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3835 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3837 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3838 tca_policy, ta, ARRAY_SIZE(ta))) {
3839 VLOG_WARN_RL(&rl, "failed to parse class message");
3844 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3845 *handlep = tc->tcm_handle;
3849 *options = ta[TCA_OPTIONS];
3853 const struct gnet_stats_queue *gsq;
3854 struct gnet_stats_basic gsb;
3856 static const struct nl_policy stats_policy[] = {
3857 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3858 .min_len = sizeof gsb },
3859 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3860 .min_len = sizeof *gsq },
3862 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3864 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3865 sa, ARRAY_SIZE(sa))) {
3866 VLOG_WARN_RL(&rl, "failed to parse class stats");
3870 /* Alignment issues screw up the length of struct gnet_stats_basic on
3871 * some arch/bitsize combinations. Newer versions of Linux have a
3872 * struct gnet_stats_basic_packed, but we can't depend on that. The
3873 * easiest thing to do is just to make a copy. */
3874 memset(&gsb, 0, sizeof gsb);
3875 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3876 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3877 stats->tx_bytes = gsb.bytes;
3878 stats->tx_packets = gsb.packets;
3880 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3881 stats->tx_errors = gsq->drops;
3891 memset(stats, 0, sizeof *stats);
/* tc_query_class(): starts an RTM_GETTCLASS request (flag NLM_F_ECHO) for
 * 'netdev' with tc_make_request(), stores 'handle' and 'parent' in the
 * request's tcm_handle and tcm_parent fields, and passes the request to
 * tc_transact() with 'replyp' as the place for the reply.  The transaction
 * status is kept in 'error'.  The warning is rate-limited and prints the
 * device name plus both identifiers as major:minor pairs; its text describes
 * a failed query.
 *
 * NOTE(review): the return statement, any check of the tc_make_request()
 * result and the condition guarding the warning are not visible here --
 * confirm them before relying on exact return values. */
3896 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3899 tc_query_class(const struct netdev *netdev,
3900 unsigned int handle, unsigned int parent,
3901 struct ofpbuf **replyp)
3903 struct ofpbuf request;
3904 struct tcmsg *tcmsg;
3907 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3911 tcmsg->tcm_handle = handle;
3912 tcmsg->tcm_parent = parent;
3914 error = tc_transact(&request, replyp);
3916 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3917 netdev_get_name(netdev),
3918 tc_get_major(handle), tc_get_minor(handle),
3919 tc_get_major(parent), tc_get_minor(parent),
3925 /* Equivalent to "tc class del dev <name> handle <handle>". */
/* 'netdev' is the device named in the request and in the warning; 'handle'
 * identifies the class to remove.  The status of the Netlink transaction is
 * kept in 'error'.
 * NOTE(review): the return statement and the condition guarding the warning
 * are not visible here -- confirm the exact return values. */
3927 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3929 struct ofpbuf request;
3930 struct tcmsg *tcmsg;
/* Start an RTM_DELTCLASS request; no extra Netlink flags are passed. */
3933 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
/* Select the class by its handle; the parent field is set to 0. */
3937 tcmsg->tcm_handle = handle;
3938 tcmsg->tcm_parent = 0;
/* NULL reply pointer: no reply buffer is requested from tc_transact(). */
3940 error = tc_transact(&request, NULL);
/* Rate-limited warning naming the device and the class as major:minor. */
3942 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3943 netdev_get_name(netdev),
3944 tc_get_major(handle), tc_get_minor(handle),
3950 /* Equivalent to "tc qdisc del dev <name> root". */
/* Besides sending the delete request, this function drops the tc object
 * cached in the device's netdev_dev_linux when 'error' ends up as 0.
 * NOTE(review): the return statement and the body of the EINVAL branch are
 * not visible here. */
3952 tc_del_qdisc(struct netdev *netdev)
3954 struct netdev_dev_linux *netdev_dev =
3955 netdev_dev_linux_cast(netdev_get_dev(netdev));
3956 struct ofpbuf request;
3957 struct tcmsg *tcmsg;
/* Start an RTM_DELQDISC request with no extra Netlink flags. */
3960 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
/* Address the qdisc with handle 1:0 whose parent is the root. */
3964 tcmsg->tcm_handle = tc_make_handle(1, 0);
3965 tcmsg->tcm_parent = TC_H_ROOT;
/* NULL reply pointer: only the status in 'error' is of interest. */
3967 error = tc_transact(&request, NULL);
3968 if (error == EINVAL) {
3969 /* EINVAL probably means that the default qdisc was in use, in which
3970 * case we've accomplished our purpose. */
/* With no error and a cached tc object: call its tc_destroy hook if the ops
 * table provides one, then clear the cached pointer. */
3973 if (!error && netdev_dev->tc) {
3974 if (netdev_dev->tc->ops->tc_destroy) {
3975 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3977 netdev_dev->tc = NULL;
3982 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3983 * kernel to determine what they are. Returns 0 if successful, otherwise a
3984 * positive errno value. */
/* Outline of the visible code: a tc_ops implementation is chosen in 'ops' from
 * the outcome of an RTM_GETQDISC query, then that implementation's tc_load()
 * is called to populate netdev_dev->tc.
 * NOTE(review): several branch conditions and the declarations of 'error',
 * 'load_error' and 'kind' are not visible here. */
3986 tc_query_qdisc(const struct netdev *netdev)
3988 struct netdev_dev_linux *netdev_dev =
3989 netdev_dev_linux_cast(netdev_get_dev(netdev));
3990 struct ofpbuf request, *qdisc;
3991 const struct tc_ops *ops;
3992 struct tcmsg *tcmsg;
/* A non-null netdev_dev->tc is presumably the "already known" case from the
 * function comment; the body of this branch is not visible here. */
3996 if (netdev_dev->tc) {
4000 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4001 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4002 * 2.6.35 without that fix backported to it.
4004 * To avoid the OOPS, we must not make a request that would attempt to dump
4005 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4006 * few others. There are a few ways that I can see to do this, but most of
4007 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4008 * technique chosen here is to assume that any non-default qdisc that we
4009 * create will have a class with handle 1:0. The built-in qdiscs only have
4010 * a class with handle 0:0.
4012 * We could check for Linux 2.6.35+ and use a more straightforward method
4014 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
4018 tcmsg->tcm_handle = tc_make_handle(1, 0);
4019 tcmsg->tcm_parent = 0;
4021 /* Figure out what tc class to instantiate. */
4022 error = tc_transact(&request, &qdisc);
/* Extract the qdisc kind name from the reply and map it to an implementation
 * with tc_lookup_linux_name().  tc_ops_other is the fallback assigned next to
 * the parse step and after the "unknown qdisc" message. */
4026 error = tc_parse_qdisc(qdisc, &kind, NULL);
4028 ops = &tc_ops_other;
4030 ops = tc_lookup_linux_name(kind);
4032 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4033 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4035 ops = &tc_ops_other;
4038 } else if (error == ENOENT) {
4039 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4040 * other entity that doesn't have a handle 1:0. We will assume
4041 * that it's the system default qdisc. */
4042 ops = &tc_ops_default;
4045 /* Who knows? Maybe the device got deleted. */
4046 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4047 netdev_get_name(netdev), strerror(error));
4048 ops = &tc_ops_other;
4051 /* Instantiate it. */
/* tc_load() is handed a non-const netdev, hence the cast.  The assertion
 * states the expected contract: tc_load() returns 0 exactly when it has set
 * netdev_dev->tc.  The reply buffer is released afterwards. */
4052 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
4053 assert((load_error == 0) == (netdev_dev->tc != NULL));
4054 ofpbuf_delete(qdisc);
/* A query error takes precedence over a load error. */
4056 return error ? error : load_error;
4059 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4060 approximate the time to transmit packets of various lengths. For an MTU of
4061 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4062 represents two possible packet lengths; for a MTU of 513 through 1024, four
4063 possible lengths; and so on.
4065 Returns, for the specified 'mtu', the number of bits that packet lengths
4066 need to be shifted right to fit within such a 256-entry table. */
/* NOTE(review): the declaration of 'cell_log', the condition guarding the
 * ETH_PAYLOAD_MAX fallback, the loop body and the return statement are not
 * visible here. */
4068 tc_calc_cell_log(unsigned int mtu)
/* Fallback value for 'mtu'. */
4073 mtu = ETH_PAYLOAD_MAX;
/* Add the Ethernet and VLAN header lengths to 'mtu'. */
4075 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
/* 'cell_log' counts loop iterations until 'mtu' is below 256. */
4077 for (cell_log = 0; mtu >= 256; cell_log++) {
/* tc_fill_rate(): zeroes '*rate', then sets its cell_log member from
 * tc_calc_cell_log(mtu) and its mpu member to ETH_TOTAL_MIN.  The 'overhead'
 * and 'cell_align' members are not assigned by name (see the commented-out
 * lines), so they keep the zero written by memset().
 * NOTE(review): the statement that consumes 'Bps' is not visible here. */
4084 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4087 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4089 memset(rate, 0, sizeof *rate);
4090 rate->cell_log = tc_calc_cell_log(mtu);
4091 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4092 /* rate->cell_align = 0; */ /* distro headers. */
/* tc_put_rtab() uses 'mpu' as the lower bound for the packet size of each
 * table entry. */
4093 rate->mpu = ETH_TOTAL_MIN;
4097 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4098 * attribute of the specified "type".
4100 * See tc_calc_cell_log() above for a description of "rtab"s. */
/* NOTE(review): the declarations of 'rtab' and 'i' are not visible here. */
4102 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
/* Reserve TC_RTAB_SIZE bytes of attribute payload in 'msg'; the loop below
 * assigns every one of the TC_RTAB_SIZE / sizeof *rtab entries. */
4107 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4108 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
/* Packet size for entry 'i', raised to rate->mpu when it would be smaller. */
4109 unsigned packet_size = (i + 1) << rate->cell_log;
4110 if (packet_size < rate->mpu) {
4111 packet_size = rate->mpu;
/* Presumably the time, in ticks, needed for 'packet_size' bytes at
 * rate->rate -- confirm against tc_bytes_to_ticks(). */
4113 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* tc_calc_buffer(): returns tc_bytes_to_ticks() applied to 'Bps' and a burst
 * size.  The burst size is the larger of 'burst_bytes' and a floor,
 * 'min_burst', computed as tc_buffer_per_jiffy(Bps) + 'mtu'. */
4117 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
4118 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
4119 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
4122 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
4124 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
4125 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
4128 /* Linux-only functions declared in netdev-linux.h */
4130 /* Returns a fd for an AF_INET socket or a negative errno value. */
4132 netdev_linux_get_af_inet_sock(void)
/* netdev_linux_init() is called first; a nonzero result is negated and
 * returned, otherwise the file-level 'af_inet_sock' is returned. */
4134 int error = netdev_linux_init();
4135 return error ? -error : af_inet_sock;
4138 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4139 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
/* The update is a read-modify-write followed by a read-back comparison.
 * 'flag_name' appears only in the warning message.  netdev_linux_do_ethtool()
 * takes a struct ethtool_cmd pointer, so the struct ethtool_value is passed
 * through a cast.
 * NOTE(review): the declarations of 'error' and 'new_flags', the checks of
 * 'error' after each call and the return statements are not visible here. */
4141 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4142 const char *flag_name, bool enable)
4144 const char *netdev_name = netdev_get_name(netdev);
4145 struct ethtool_value evalue;
/* Step 1: fetch the current flags word into evalue.data. */
4149 memset(&evalue, 0, sizeof evalue);
4150 error = netdev_linux_do_ethtool(netdev_name,
4151 (struct ethtool_cmd *)&evalue,
4152 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* Step 2: write the word back with only 'flag' changed, remembering the
 * requested value in 'new_flags'. */
4157 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4158 error = netdev_linux_do_ethtool(netdev_name,
4159 (struct ethtool_cmd *)&evalue,
4160 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
/* Step 3: fetch the flags again so that they can be compared with the
 * requested value. */
4165 memset(&evalue, 0, sizeof evalue);
4166 error = netdev_linux_do_ethtool(netdev_name,
4167 (struct ethtool_cmd *)&evalue,
4168 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* The value read back differs from the value that was requested. */
4173 if (new_flags != evalue.data) {
4174 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4175 "device %s failed", enable ? "enable" : "disable",
4176 flag_name, netdev_name);
4183 /* Utility functions. */
4185 /* Copies 'src' into 'dst', performing format conversion in the process. */
/* Each member of 'dst' listed below is assigned from the member of 'src' with
 * the same name; members of 'dst' that are not listed are left untouched.
 * The "format conversion" is whatever implicit conversion the assignments
 * perform between the two structs' member types -- presumably a widening;
 * confirm against the struct definitions. */
4187 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4188 const struct rtnl_link_stats *src)
/* Packet and byte totals. */
4190 dst->rx_packets = src->rx_packets;
4191 dst->tx_packets = src->tx_packets;
4192 dst->rx_bytes = src->rx_bytes;
4193 dst->tx_bytes = src->tx_bytes;
/* Aggregate error and drop counters. */
4194 dst->rx_errors = src->rx_errors;
4195 dst->tx_errors = src->tx_errors;
4196 dst->rx_dropped = src->rx_dropped;
4197 dst->tx_dropped = src->tx_dropped;
4198 dst->multicast = src->multicast;
4199 dst->collisions = src->collisions;
/* Detailed receive error counters. */
4200 dst->rx_length_errors = src->rx_length_errors;
4201 dst->rx_over_errors = src->rx_over_errors;
4202 dst->rx_crc_errors = src->rx_crc_errors;
4203 dst->rx_frame_errors = src->rx_frame_errors;
4204 dst->rx_fifo_errors = src->rx_fifo_errors;
4205 dst->rx_missed_errors = src->rx_missed_errors;
/* Detailed transmit error counters. */
4206 dst->tx_aborted_errors = src->tx_aborted_errors;
4207 dst->tx_carrier_errors = src->tx_carrier_errors;
4208 dst->tx_fifo_errors = src->tx_fifo_errors;
4209 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4210 dst->tx_window_errors = src->tx_window_errors;
/* get_stats_via_netlink(): sends an RTM_GETLINK request on 'rtnl_sock' for the
 * link with interface index 'ifindex' and converts the IFLA_STATS attribute
 * of the reply into '*stats'.  The reply buffer is released with
 * ofpbuf_delete() after a failed parse, after the missing-stats warning and
 * after the conversion.
 * NOTE(review): the return statements and the check of 'error' after the
 * transaction are not visible here. */
4214 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4216 /* Policy for RTNLGRP_LINK messages.
4218 * There are *many* more fields in these messages, but currently we only
4219 * care about these fields. */
4220 static const struct nl_policy rtnlgrp_link_policy[] = {
4221 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4222 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4223 .min_len = sizeof(struct rtnl_link_stats) },
4226 struct ofpbuf request;
4227 struct ofpbuf *reply;
4228 struct ifinfomsg *ifi;
4229 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
/* Build the request: a Netlink header followed by a zeroed ifinfomsg that
 * carries the address family and the interface index. */
4232 ofpbuf_init(&request, 0);
4233 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4234 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4235 ifi->ifi_family = PF_UNSPEC;
4236 ifi->ifi_index = ifindex;
/* The request buffer is released right after the transaction call, whatever
 * its outcome. */
4237 error = nl_sock_transact(rtnl_sock, &request, &reply);
4238 ofpbuf_uninit(&request);
/* Parse the attributes that follow the Netlink header and the ifinfomsg. */
4243 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4244 rtnlgrp_link_policy,
4245 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4246 ofpbuf_delete(reply);
/* IFLA_STATS is optional in the policy above, so its presence is checked
 * explicitly before it is used. */
4250 if (!attrs[IFLA_STATS]) {
4251 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4252 ofpbuf_delete(reply);
/* The policy guarantees at least sizeof(struct rtnl_link_stats) bytes of
 * payload for this attribute. */
4256 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4258 ofpbuf_delete(reply);
/* get_stats_via_proc(): reads "/proc/net/dev" line by line with fgets() and
 * stores parsed counters through pointers into '*stats'.  The line whose
 * device name equals 'netdev_name' is the one of interest.
 * NOTE(review): much of this function is not visible here, including the
 * declarations, the call that consumes the format string (presumably
 * sscanf()), most of its arguments, the stream cleanup and the return
 * statements. */
4264 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4266 static const char fn[] = "/proc/net/dev";
4271 stream = fopen(fn, "r");
/* Rate-limited warning that reports errno as text. */
4273 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4278 while (fgets(line, sizeof line, stream)) {
/* X64 is the scanf conversion for one uint64_t value.  In the format below,
 * "%*u" matches an unsigned field without storing it. */
4281 #define X64 "%"SCNu64
4284 X64 X64 X64 X64 X64 X64 X64 "%*u"
4285 X64 X64 X64 X64 X64 X64 X64 "%*u",
4291 &stats->rx_fifo_errors,
4292 &stats->rx_frame_errors,
4298 &stats->tx_fifo_errors,
4300 &stats->tx_carrier_errors) != 15) {
/* Fewer or more than 15 conversions: the line is reported with its line
 * number 'ln'. */
4301 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4302 } else if (!strcmp(devname, netdev_name)) {
/* These counters are set to UINT64_MAX for the matching device --
 * presumably meaning "not available from this source"; confirm against the
 * documentation of struct netdev_stats. */
4303 stats->rx_length_errors = UINT64_MAX;
4304 stats->rx_over_errors = UINT64_MAX;
4305 stats->rx_crc_errors = UINT64_MAX;
4306 stats->rx_missed_errors = UINT64_MAX;
4307 stats->tx_aborted_errors = UINT64_MAX;
4308 stats->tx_heartbeat_errors = UINT64_MAX;
4309 stats->tx_window_errors = UINT64_MAX;
/* Per its text, this warning covers the case where no line matched
 * 'netdev_name'. */
4315 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* get_flags(): performs SIOCGIFFLAGS for the device 'dev->name' through
 * netdev_linux_do_ioctl() and copies the resulting ifr_flags into '*flags'.
 * NOTE(review): the declaration of 'ifr', the condition under which '*flags'
 * is written and the return statement are not visible here. */
4321 get_flags(const struct netdev_dev *dev, unsigned int *flags)
4327 error = netdev_linux_do_ioctl(dev->name, &ifr, SIOCGIFFLAGS,
4330 *flags = ifr.ifr_flags;
/* set_flags(): stores 'flags' in the ifr_flags member of an ifreq and applies
 * it to 'netdev' with SIOCSIFFLAGS through netdev_linux_do_ioctl(), whose
 * result is returned directly.
 * NOTE(review): the declaration of 'ifr' is not visible here. */
4336 set_flags(struct netdev *netdev, unsigned int flags)
4340 ifr.ifr_flags = flags;
4341 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
/* do_get_ifindex(): looks up the interface index of the device named
 * 'netdev_name' with the SIOCGIFINDEX ioctl on 'af_inet_sock' and returns
 * ifr.ifr_ifindex.  Every call increments the netdev_get_ifindex coverage
 * counter.
 * NOTE(review): the value returned after a failed ioctl is not visible
 * here. */
4346 do_get_ifindex(const char *netdev_name)
/* Copy the device name into the request, bounded by the size of ifr_name. */
4350 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4351 COVERAGE_INC(netdev_get_ifindex);
/* A negative ioctl() result is logged (rate-limited) with errno as text. */
4352 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4353 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4354 netdev_name, strerror(errno));
4357 return ifr.ifr_ifindex;
/* get_ifindex(): stores the interface index of 'netdev_' in '*ifindexp'.
 * netdev_dev->ifindex acts as a cache that is considered valid while the
 * VALID_IFINDEX bit is set in netdev_dev->cache_valid.
 * NOTE(review): the handling of a failed do_get_ifindex() call and the
 * return statements are not visible here. */
4361 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4363 struct netdev_dev_linux *netdev_dev =
4364 netdev_dev_linux_cast(netdev_get_dev(netdev_));
/* Cache miss: query the kernel by device name. */
4366 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4367 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
/* Mark the cache valid and remember the index for later calls. */
4371 netdev_dev->cache_valid |= VALID_IFINDEX;
4372 netdev_dev->ifindex = ifindex;
4374 *ifindexp = netdev_dev->ifindex;
/* get_etheraddr(): queries the hardware address of the device named
 * 'netdev_name' with the SIOCGIFHWADDR ioctl on 'af_inet_sock' and copies
 * ETH_ADDR_LEN bytes of it into 'ea'.  Every call increments the
 * netdev_get_hwaddr coverage counter.
 * NOTE(review): the declarations of 'ifr' and 'hwaddr_family' and the return
 * statements are not visible here. */
4379 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
/* Zero the request, then copy in the device name, bounded by the size of
 * ifr_name. */
4384 memset(&ifr, 0, sizeof ifr);
4385 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4386 COVERAGE_INC(netdev_get_hwaddr);
4387 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4388 /* ENODEV probably means that a vif disappeared asynchronously and
4389 * hasn't been removed from the database yet, so reduce the log level
4390 * to INFO for that case. */
4391 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4392 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4393 netdev_name, strerror(errno));
/* An address family other than AF_UNSPEC or ARPHRD_ETHER is warned about.
 * The copy into 'ea' below appears to happen in that case as well --
 * confirm. */
4396 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4397 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4398 VLOG_WARN("%s device has unknown hardware address family %d",
4399 netdev_name, hwaddr_family);
4401 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* set_etheraddr(): sets the hardware address of the device named
 * 'netdev_name' to the ETH_ADDR_LEN bytes in 'mac' with the SIOCSIFHWADDR
 * ioctl on 'af_inet_sock'.  Every call increments the netdev_set_hwaddr
 * coverage counter.
 * NOTE(review): the declaration of 'ifr' and the return statements are not
 * visible here. */
4406 set_etheraddr(const char *netdev_name,
4407 const uint8_t mac[ETH_ADDR_LEN])
/* Zero the request, then fill in the device name, the address family and the
 * address bytes. */
4411 memset(&ifr, 0, sizeof ifr);
4412 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4413 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
4414 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4415 COVERAGE_INC(netdev_set_hwaddr);
/* A negative ioctl() result is logged at error level with errno as text; this
 * log call is not rate-limited. */
4416 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4417 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4418 netdev_name, strerror(errno));
/* netdev_linux_do_ethtool(): issues a SIOCETHTOOL ioctl on 'af_inet_sock' for
 * the device named 'name', with 'ecmd' attached as the request's data
 * pointer.  'cmd_name' is used in the warning text.  Every call increments
 * the netdev_ethtool coverage counter.
 * NOTE(review): where 'cmd' is stored into the request, the declaration of
 * 'ifr' and the return statements are not visible here. */
4425 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4426 int cmd, const char *cmd_name)
/* Zero the request, copy in the device name (bounded by the size of
 * ifr_name) and point ifr_data at the caller's command structure. */
4430 memset(&ifr, 0, sizeof ifr);
4431 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4432 ifr.ifr_data = (caddr_t) ecmd;
4435 COVERAGE_INC(netdev_ethtool);
/* A result of 0 from ioctl() is the success case. */
4436 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
/* Every errno other than EOPNOTSUPP gets a rate-limited warning. */
4439 if (errno != EOPNOTSUPP) {
4440 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4441 "failed: %s", cmd_name, name, strerror(errno));
4443 /* The device doesn't support this operation. That's pretty
4444 * common, so there's no point in logging anything. */
/* netdev_linux_do_ioctl(): copies 'name' into ifr->ifr_name (bounded by the
 * size of that member) and issues ioctl 'cmd' on 'af_inet_sock' with 'ifr' as
 * the argument.  A result of -1 is logged at debug level, rate-limited, using
 * 'cmd_name' to identify the request.  The other members of '*ifr' are used
 * as the caller left them.
 * NOTE(review): the return statements are not visible here. */
4451 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4452 const char *cmd_name)
4454 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4455 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4456 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
/* netdev_linux_get_ipv4(): runs the ioctl 'cmd' (named 'cmd_name' for
 * logging) for 'netdev' through netdev_linux_do_ioctl(), with the request's
 * address family preset to AF_INET, and stores the IPv4 address found in the
 * request's ifr_addr member into '*ip'.
 * NOTE(review): the declarations of 'ifr' and 'error', the condition guarding
 * the copy into '*ip' and the return statement are not visible here. */
4464 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4465 int cmd, const char *cmd_name)
4470 ifr.ifr_addr.sa_family = AF_INET;
4471 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
/* ifr_addr is reinterpreted as a sockaddr_in to reach the address. */
4473 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4474 *ip = sin->sin_addr;
4479 /* Returns an AF_PACKET raw socket or a negative errno value. */
4481 af_packet_sock(void)
4483 static int sock = INT_MIN;
4485 if (sock == INT_MIN) {
4486 sock = socket(AF_PACKET, SOCK_RAW, 0);
4488 set_nonblocking(sock);
4491 VLOG_ERR("failed to create packet socket: %s", strerror(errno));