2 * Copyright (c) 2009, 2010, 2011, 2012 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
24 #include <arpa/inet.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
30 #include <linux/types.h>
31 #include <linux/ethtool.h>
32 #include <linux/mii.h>
33 #include <linux/pkt_cls.h>
34 #include <linux/pkt_sched.h>
35 #include <linux/rtnetlink.h>
36 #include <linux/sockios.h>
37 #include <linux/version.h>
38 #include <sys/types.h>
39 #include <sys/ioctl.h>
40 #include <sys/socket.h>
41 #include <netpacket/packet.h>
43 #include <net/if_arp.h>
44 #include <net/if_packet.h>
45 #include <net/route.h>
46 #include <netinet/in.h>
53 #include "dpif-linux.h"
54 #include "dynamic-string.h"
55 #include "fatal-signal.h"
58 #include "netdev-provider.h"
59 #include "netdev-vport.h"
61 #include "netlink-notifier.h"
62 #include "netlink-socket.h"
64 #include "openflow/openflow.h"
66 #include "poll-loop.h"
67 #include "rtnetlink-link.h"
68 #include "socket-util.h"
74 VLOG_DEFINE_THIS_MODULE(netdev_linux);
76 COVERAGE_DEFINE(netdev_set_policing);
77 COVERAGE_DEFINE(netdev_arp_lookup);
78 COVERAGE_DEFINE(netdev_get_ifindex);
79 COVERAGE_DEFINE(netdev_get_hwaddr);
80 COVERAGE_DEFINE(netdev_set_hwaddr);
81 COVERAGE_DEFINE(netdev_ethtool);
83 /* These were introduced in Linux 2.6.14, so they might be missing if we have
85 #ifndef ADVERTISED_Pause
86 #define ADVERTISED_Pause (1 << 13)
88 #ifndef ADVERTISED_Asym_Pause
89 #define ADVERTISED_Asym_Pause (1 << 14)
92 /* These were introduced in Linux 2.6.24, so they might be missing if we
93 * have old headers. */
94 #ifndef ETHTOOL_GFLAGS
95 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
97 #ifndef ETHTOOL_SFLAGS
98 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
101 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
104 #define TC_RTAB_SIZE 1024
107 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
108 static int cache_notifier_refcount;
111 VALID_IFINDEX = 1 << 0,
112 VALID_ETHERADDR = 1 << 1,
116 VALID_POLICING = 1 << 5,
117 VALID_VPORT_STAT_ERROR = 1 << 6
125 /* Traffic control. */
127 /* An instance of a traffic control class. Always associated with a particular
130 * Each TC implementation subclasses this with whatever additional data it
133 const struct tc_ops *ops;
134 struct hmap queues; /* Contains "struct tc_queue"s.
135 * Read by generic TC layer.
136 * Written only by TC implementation. */
139 /* One traffic control queue.
141 * Each TC implementation subclasses this with whatever additional data it
144 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
145 unsigned int queue_id; /* OpenFlow queue ID. */
148 /* A particular kind of traffic control. Each implementation generally maps to
149 * one particular Linux qdisc class.
151 * The functions below return 0 if successful or a positive errno value on
152 * failure, except where otherwise noted. All of them must be provided, except
153 * where otherwise noted. */
155 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
156 * This is null for tc_ops_default and tc_ops_other, for which there are no
157 * appropriate values. */
158 const char *linux_name;
160 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
161 const char *ovs_name;
163 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
164 * queues. The queues are numbered 0 through n_queues - 1. */
165 unsigned int n_queues;
167 /* Called to install this TC class on 'netdev'. The implementation should
168 * make the Netlink calls required to set up 'netdev' with the right qdisc
169 * and configure it according to 'details'. The implementation may assume
170 * that the current qdisc is the default; that is, there is no need for it
171 * to delete the current qdisc before installing itself.
173 * The contents of 'details' should be documented as valid for 'ovs_name'
174 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
175 * (which is built as ovs-vswitchd.conf.db(8)).
177 * This function must return 0 if and only if it sets 'netdev->tc' to an
178 * initialized 'struct tc'.
180 * (This function is null for tc_ops_other, which cannot be installed. For
181 * other TC classes it should always be nonnull.) */
182 int (*tc_install)(struct netdev *netdev, const struct shash *details);
184 /* Called when the netdev code determines (through a Netlink query) that
185 * this TC class's qdisc is installed on 'netdev', but we didn't install
186 * it ourselves and so don't know any of the details.
188 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
189 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
190 * implementation should parse the other attributes of 'nlmsg' as
191 * necessary to determine its configuration. If necessary it should also
192 * use Netlink queries to determine the configuration of queues on
195 * This function must return 0 if and only if it sets 'netdev->tc' to an
196 * initialized 'struct tc'. */
197 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
199 /* Destroys the data structures allocated by the implementation as part of
200 * 'tc'. (This includes destroying 'tc->queues' by calling
203 * The implementation should not need to perform any Netlink calls. If
204 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
205 * (But it may not be desirable.)
207 * This function may be null if 'tc' is trivial. */
208 void (*tc_destroy)(struct tc *tc);
210 /* Retrieves details of 'netdev->tc' configuration into 'details'.
212 * The implementation should not need to perform any Netlink calls, because
213 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
214 * cached the configuration.
216 * The contents of 'details' should be documented as valid for 'ovs_name'
217 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
218 * (which is built as ovs-vswitchd.conf.db(8)).
220 * This function may be null if 'tc' is not configurable.
222 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
224 /* Reconfigures 'netdev->tc' according to 'details', performing any
225 * required Netlink calls to complete the reconfiguration.
227 * The contents of 'details' should be documented as valid for 'ovs_name'
228 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
229 * (which is built as ovs-vswitchd.conf.db(8)).
231 * This function may be null if 'tc' is not configurable.
233 int (*qdisc_set)(struct netdev *, const struct shash *details);
235 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
236 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
238 * The contents of 'details' should be documented as valid for 'ovs_name'
239 * in the "other_config" column in the "Queue" table in
240 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
242 * The implementation should not need to perform any Netlink calls, because
243 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
244 * cached the queue configuration.
246 * This function may be null if 'tc' does not have queues ('n_queues' is
248 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
249 struct shash *details);
251 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
252 * 'details', performing any required Netlink calls to complete the
253 * reconfiguration. The caller ensures that 'queue_id' is less than
256 * The contents of 'details' should be documented as valid for 'ovs_name'
257 * in the "other_config" column in the "Queue" table in
258 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
260 * This function may be null if 'tc' does not have queues or its queues are
261 * not configurable. */
262 int (*class_set)(struct netdev *, unsigned int queue_id,
263 const struct shash *details);
265 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
266 * tc_queue's within 'netdev->tc->queues'.
268 * This function may be null if 'tc' does not have queues or its queues
269 * cannot be deleted. */
270 int (*class_delete)(struct netdev *, struct tc_queue *queue);
272 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
273 * 'struct tc_queue's within 'netdev->tc->queues'.
275 * On success, initializes '*stats'.
277 * This function may be null if 'tc' does not have queues or if it cannot
278 * report queue statistics. */
279 int (*class_get_stats)(const struct netdev *netdev,
280 const struct tc_queue *queue,
281 struct netdev_queue_stats *stats);
283 /* Extracts queue stats from 'nlmsg', which is a response to a
284 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
286 * This function may be null if 'tc' does not have queues or if it cannot
287 * report queue statistics. */
288 int (*class_dump_stats)(const struct netdev *netdev,
289 const struct ofpbuf *nlmsg,
290 netdev_dump_queue_stats_cb *cb, void *aux);
294 tc_init(struct tc *tc, const struct tc_ops *ops)
297 hmap_init(&tc->queues);
301 tc_destroy(struct tc *tc)
303 hmap_destroy(&tc->queues);
306 static const struct tc_ops tc_ops_htb;
307 static const struct tc_ops tc_ops_hfsc;
308 static const struct tc_ops tc_ops_default;
309 static const struct tc_ops tc_ops_other;
311 static const struct tc_ops *tcs[] = {
312 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
313 &tc_ops_hfsc, /* Hierarchical fair service curve. */
314 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
315 &tc_ops_other, /* Some other qdisc. */
319 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
320 static unsigned int tc_get_major(unsigned int handle);
321 static unsigned int tc_get_minor(unsigned int handle);
323 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
324 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
325 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
327 static struct tcmsg *tc_make_request(const struct netdev *, int type,
328 unsigned int flags, struct ofpbuf *);
329 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
330 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
331 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
334 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
335 struct nlattr **options);
336 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
337 struct nlattr **options,
338 struct netdev_queue_stats *);
339 static int tc_query_class(const struct netdev *,
340 unsigned int handle, unsigned int parent,
341 struct ofpbuf **replyp);
342 static int tc_delete_class(const struct netdev *, unsigned int handle);
344 static int tc_del_qdisc(struct netdev *netdev);
345 static int tc_query_qdisc(const struct netdev *netdev);
347 static int tc_calc_cell_log(unsigned int mtu);
348 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
349 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
350 const struct tc_ratespec *rate);
351 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
353 struct netdev_dev_linux {
354 struct netdev_dev netdev_dev;
356 struct shash_node *shash_node;
357 unsigned int cache_valid;
358 unsigned int change_seq;
360 bool miimon; /* Link status of last poll. */
361 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
362 struct timer miimon_timer;
364 /* The following are figured out "on demand" only. They are only valid
365 * when the corresponding VALID_* bit in 'cache_valid' is set. */
367 uint8_t etheraddr[ETH_ADDR_LEN];
368 struct in_addr address, netmask;
371 unsigned int ifi_flags;
372 long long int carrier_resets;
373 uint32_t kbits_rate; /* Policing data. */
374 uint32_t kbits_burst;
375 int vport_stats_error; /* Cached error code from vport_get_stats().
376 0 or an errno value. */
380 struct tap_state tap;
384 struct netdev_linux {
385 struct netdev netdev;
389 /* Sockets used for ioctl operations. */
390 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
392 /* A Netlink routing socket that is not subscribed to any multicast groups. */
393 static struct nl_sock *rtnl_sock;
395 /* This is set pretty low because we probably won't learn anything from the
396 * additional log messages. */
397 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
399 static int netdev_linux_init(void);
401 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
402 int cmd, const char *cmd_name);
403 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
404 const char *cmd_name);
405 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
406 int cmd, const char *cmd_name);
407 static int get_flags(const struct netdev_dev *, unsigned int *flags);
408 static int set_flags(struct netdev *, unsigned int flags);
409 static int do_get_ifindex(const char *netdev_name);
410 static int get_ifindex(const struct netdev *, int *ifindexp);
411 static int do_set_addr(struct netdev *netdev,
412 int ioctl_nr, const char *ioctl_name,
413 struct in_addr addr);
414 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
415 static int set_etheraddr(const char *netdev_name, int hwaddr_family,
416 const uint8_t[ETH_ADDR_LEN]);
417 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
418 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
419 static int af_packet_sock(void);
420 static void netdev_linux_miimon_run(void);
421 static void netdev_linux_miimon_wait(void);
424 is_netdev_linux_class(const struct netdev_class *netdev_class)
426 return netdev_class->init == netdev_linux_init;
429 static struct netdev_dev_linux *
430 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
432 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
433 assert(is_netdev_linux_class(netdev_class));
435 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
438 static struct netdev_linux *
439 netdev_linux_cast(const struct netdev *netdev)
441 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
442 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
443 assert(is_netdev_linux_class(netdev_class));
445 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
449 netdev_linux_init(void)
451 static int status = -1;
453 /* Create AF_INET socket. */
454 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
455 status = af_inet_sock >= 0 ? 0 : errno;
457 VLOG_ERR("failed to create inet socket: %s", strerror(status));
460 /* Create rtnetlink socket. */
462 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
464 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
473 netdev_linux_run(void)
475 rtnetlink_link_run();
476 netdev_linux_miimon_run();
480 netdev_linux_wait(void)
482 rtnetlink_link_wait();
483 netdev_linux_miimon_wait();
487 netdev_dev_linux_changed(struct netdev_dev_linux *dev, unsigned int ifi_flags)
490 if (!dev->change_seq) {
/* IFF_RUNNING differs between the cached flags and the new ones, so the
 * carrier state changed: count it as a carrier reset. */
494 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
495 dev->carrier_resets++;
497 dev->ifi_flags = ifi_flags;
/* Clear every VALID_* bit so that the "on demand" cached attributes are
 * fetched again the next time they are needed. */
499 dev->cache_valid = 0;
503 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
504 void *aux OVS_UNUSED)
506 struct netdev_dev_linux *dev;
508 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
510 const struct netdev_class *netdev_class =
511 netdev_dev_get_class(base_dev);
513 if (is_netdev_linux_class(netdev_class)) {
514 dev = netdev_dev_linux_cast(base_dev);
515 netdev_dev_linux_changed(dev, change->ifi_flags);
519 struct shash device_shash;
520 struct shash_node *node;
522 shash_init(&device_shash);
523 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
524 SHASH_FOR_EACH (node, &device_shash) {
529 get_flags(&dev->netdev_dev, &flags);
530 netdev_dev_linux_changed(dev, flags);
532 shash_destroy(&device_shash);
537 cache_notifier_ref(void)
539 if (!cache_notifier_refcount) {
540 assert(!netdev_linux_cache_notifier);
542 netdev_linux_cache_notifier =
543 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
545 if (!netdev_linux_cache_notifier) {
549 cache_notifier_refcount++;
555 cache_notifier_unref(void)
557 assert(cache_notifier_refcount > 0);
558 if (!--cache_notifier_refcount) {
559 assert(netdev_linux_cache_notifier);
560 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
561 netdev_linux_cache_notifier = NULL;
565 /* Creates system and internal devices. */
567 netdev_linux_create(const struct netdev_class *class, const char *name,
568 struct netdev_dev **netdev_devp)
570 struct netdev_dev_linux *netdev_dev;
573 error = cache_notifier_ref();
578 netdev_dev = xzalloc(sizeof *netdev_dev);
579 netdev_dev->change_seq = 1;
580 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
581 get_flags(&netdev_dev->netdev_dev, &netdev_dev->ifi_flags);
583 *netdev_devp = &netdev_dev->netdev_dev;
587 /* For most types of netdevs we open the device for each call of
588 * netdev_open(). However, this is not the case with tap devices,
589 * since it is only possible to open the device once. In this
590 * situation we share a single file descriptor, and consequently
591 * buffers, across all readers. Therefore once data is read it will
592 * be unavailable to other reads for tap devices. */
594 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
595 const char *name, struct netdev_dev **netdev_devp)
597 struct netdev_dev_linux *netdev_dev;
598 struct tap_state *state;
599 static const char tap_dev[] = "/dev/net/tun";
603 netdev_dev = xzalloc(sizeof *netdev_dev);
604 state = &netdev_dev->state.tap;
606 error = cache_notifier_ref();
611 /* Open tap device. */
612 state->fd = open(tap_dev, O_RDWR);
615 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
616 goto error_unref_notifier;
619 /* Create tap device. */
620 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
621 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
622 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
623 VLOG_WARN("%s: creating tap device failed: %s", name,
626 goto error_unref_notifier;
629 /* Make non-blocking. */
630 error = set_nonblocking(state->fd);
632 goto error_unref_notifier;
635 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
636 *netdev_devp = &netdev_dev->netdev_dev;
639 error_unref_notifier:
640 cache_notifier_unref();
647 destroy_tap(struct netdev_dev_linux *netdev_dev)
649 struct tap_state *state = &netdev_dev->state.tap;
651 if (state->fd >= 0) {
656 /* Destroys the netdev device 'netdev_dev_'. */
658 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
660 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
661 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
663 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
664 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
667 if (class == &netdev_tap_class) {
668 destroy_tap(netdev_dev);
672 cache_notifier_unref();
676 netdev_linux_open(struct netdev_dev *netdev_dev_, struct netdev **netdevp)
678 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
679 struct netdev_linux *netdev;
680 enum netdev_flags flags;
683 /* Allocate network device. */
684 netdev = xzalloc(sizeof *netdev);
686 netdev_init(&netdev->netdev, netdev_dev_);
688 /* Verify that the device really exists, by attempting to read its flags.
689 * (The flags might be cached, in which case this won't actually do an
692 * Don't do this for "internal" netdevs, though, because those have to be
693 * created as netdev objects before they exist in the kernel, because
694 * creating them in the kernel happens by passing a netdev object to
695 * dpif_port_add(). */
696 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
697 error = netdev_get_flags(&netdev->netdev, &flags);
698 if (error == ENODEV) {
703 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
704 !netdev_dev->state.tap.opened) {
706 /* We assume that the first user of the tap device is the primary user
707 * and give them the tap FD. Subsequent users probably just expect
708 * this to be a system device so open it normally to avoid send/receive
709 * directions appearing to be reversed. */
710 netdev->fd = netdev_dev->state.tap.fd;
711 netdev_dev->state.tap.opened = true;
714 *netdevp = &netdev->netdev;
718 netdev_uninit(&netdev->netdev, true);
722 /* Closes and destroys 'netdev'. */
724 netdev_linux_close(struct netdev *netdev_)
726 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* A negative 'fd' means "no descriptor"; 0 is a valid descriptor, so test
 * with '>= 0' like the rest of this file does.  The descriptor of a "tap"
 * netdev is the one stored in netdev_dev->state.tap (see
 * netdev_linux_open()), so it is not handled here. */
728 if (netdev->fd >= 0 && strcmp(netdev_get_type(netdev_), "tap")) {
735 netdev_linux_listen(struct netdev *netdev_)
737 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
738 struct sockaddr_ll sll;
743 if (netdev->fd >= 0) {
747 /* Create file descriptor. */
748 fd = socket(PF_PACKET, SOCK_RAW, 0);
751 VLOG_ERR("failed to create raw socket (%s)", strerror(error));
755 /* Set non-blocking mode. */
756 error = set_nonblocking(fd);
761 /* Get ethernet device index. */
762 error = get_ifindex(&netdev->netdev, &ifindex);
767 /* Bind to specific ethernet device. */
768 memset(&sll, 0, sizeof sll);
769 sll.sll_family = AF_PACKET;
770 sll.sll_ifindex = ifindex;
771 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
772 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
774 VLOG_ERR("%s: failed to bind raw socket (%s)",
775 netdev_get_name(netdev_), strerror(error));
790 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
792 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
794 if (netdev->fd < 0) {
795 /* Device is not listening. */
/* MSG_TRUNC asks recv() for the real packet length even when it exceeds
 * 'size', so an oversized packet is reported as -EMSGSIZE below instead
 * of being silently truncated. */
800 ssize_t retval = recv(netdev->fd, data, size, MSG_TRUNC);
802 return retval <= size ? retval : -EMSGSIZE;
803 } else if (errno != EINTR) {
804 if (errno != EAGAIN) {
/* Argument order must follow the format string: device name first,
 * then the error description. */
805 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
806 netdev_get_name(netdev_), strerror(errno));
813 /* Registers with the poll loop to wake up from the next call to poll_block()
814 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
816 netdev_linux_recv_wait(struct netdev *netdev_)
818 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
819 if (netdev->fd >= 0) {
820 poll_fd_wait(netdev->fd, POLLIN);
824 /* Discards all packets waiting to be received from 'netdev'. */
826 netdev_linux_drain(struct netdev *netdev_)
828 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
829 if (netdev->fd < 0) {
831 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
833 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
834 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
838 drain_fd(netdev->fd, ifr.ifr_qlen);
841 return drain_rcvbuf(netdev->fd);
845 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
846 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
847 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
848 * the packet is too big or too small to transmit on the device.
850 * The caller retains ownership of 'buffer' in all cases.
852 * The kernel maintains a packet transmission queue, so the caller is not
853 * expected to do additional queuing of packets. */
855 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
857 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
861 if (netdev->fd < 0) {
862 /* Use our AF_PACKET socket to send to this device. */
863 struct sockaddr_ll sll;
870 sock = af_packet_sock();
875 error = get_ifindex(netdev_, &ifindex);
880 /* We don't bother setting most fields in sockaddr_ll because the
881 * kernel ignores them for SOCK_RAW. */
882 memset(&sll, 0, sizeof sll);
883 sll.sll_family = AF_PACKET;
884 sll.sll_ifindex = ifindex;
886 iov.iov_base = (void *) data;
890 msg.msg_namelen = sizeof sll;
893 msg.msg_control = NULL;
894 msg.msg_controllen = 0;
897 retval = sendmsg(sock, &msg, 0);
899 /* Use the netdev's own fd to send to this device. This is
900 * essential for tap devices, because packets sent to a tap device
901 * with an AF_PACKET socket will loop back to be *received* again
902 * on the tap device. */
903 retval = write(netdev->fd, data, size);
907 /* The Linux AF_PACKET implementation never blocks waiting for room
908 * for packets, instead returning ENOBUFS. Translate this into
909 * EAGAIN for the caller. */
910 if (errno == ENOBUFS) {
912 } else if (errno == EINTR) {
914 } else if (errno != EAGAIN) {
915 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
916 netdev_get_name(netdev_), strerror(errno));
919 } else if (retval != size) {
920 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
921 "%zu) on %s", retval, size, netdev_get_name(netdev_));
929 /* Registers with the poll loop to wake up from the next call to poll_block()
930 * when the packet transmission queue has sufficient room to transmit a packet
931 * with netdev_send().
933 * The kernel maintains a packet transmission queue, so the client is not
934 * expected to do additional queuing of packets. Thus, this function is
935 * unlikely to ever be used. It is included for completeness. */
937 netdev_linux_send_wait(struct netdev *netdev_)
939 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
940 if (netdev->fd < 0) {
942 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
943 poll_fd_wait(netdev->fd, POLLOUT);
945 /* TAP device always accepts packets. */
946 poll_immediate_wake();
950 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
951 * otherwise a positive errno value. */
953 netdev_linux_set_etheraddr(struct netdev *netdev_,
954 const uint8_t mac[ETH_ADDR_LEN])
956 struct netdev_dev_linux *netdev_dev =
957 netdev_dev_linux_cast(netdev_get_dev(netdev_));
960 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
961 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
962 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
964 netdev_dev->cache_valid |= VALID_ETHERADDR;
965 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
973 /* Copies 'netdev''s MAC address into the caller-provided 'mac' buffer,
974 * querying the kernel only when the cached address is not valid. */
976 netdev_linux_get_etheraddr(const struct netdev *netdev_,
977 uint8_t mac[ETH_ADDR_LEN])
979 struct netdev_dev_linux *netdev_dev =
980 netdev_dev_linux_cast(netdev_get_dev(netdev_));
981 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
982 int error = get_etheraddr(netdev_get_name(netdev_),
983 netdev_dev->etheraddr);
987 netdev_dev->cache_valid |= VALID_ETHERADDR;
989 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
993 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
994 * in bytes, not including the hardware header; thus, this is typically 1500
995 * bytes for Ethernet devices. */
997 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
999 struct netdev_dev_linux *netdev_dev =
1000 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1001 if (!(netdev_dev->cache_valid & VALID_MTU)) {
1005 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1006 SIOCGIFMTU, "SIOCGIFMTU");
1010 netdev_dev->mtu = ifr.ifr_mtu;
1011 netdev_dev->cache_valid |= VALID_MTU;
1013 *mtup = netdev_dev->mtu;
1017 /* Sets the maximum transmission unit (MTU) of the given device using the
1018 * Linux networking ioctl interface.
1021 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1023 struct netdev_dev_linux *netdev_dev =
1024 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1028 if (netdev_dev->cache_valid & VALID_MTU &&
1029 netdev_dev->mtu == mtu) {
1033 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1034 SIOCSIFMTU, "SIOCSIFMTU");
1039 netdev_dev->mtu = ifr.ifr_mtu;
1040 netdev_dev->cache_valid |= VALID_MTU;
1044 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1045 * On failure, returns a negative errno value. */
1047 netdev_linux_get_ifindex(const struct netdev *netdev)
1051 error = get_ifindex(netdev, &ifindex);
1052 return error ? -error : ifindex;
1056 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1058 struct netdev_dev_linux *netdev_dev =
1059 netdev_dev_linux_cast(netdev_get_dev(netdev_));
/* With MII monitoring enabled (a positive interval), report the link status
 * recorded by the last miimon poll; otherwise derive it from IFF_RUNNING in
 * the cached interface flags. */
1061 if (netdev_dev->miimon_interval > 0) {
1062 *carrier = netdev_dev->miimon;
1064 *carrier = (netdev_dev->ifi_flags & IFF_RUNNING) != 0;
1070 static long long int
1071 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1073 return netdev_dev_linux_cast(netdev_get_dev(netdev))->carrier_resets;
1077 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1078 struct mii_ioctl_data *data)
1083 memset(&ifr, 0, sizeof ifr);
1084 memcpy(&ifr.ifr_data, data, sizeof *data);
1085 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1086 memcpy(data, &ifr.ifr_data, sizeof *data);
1092 netdev_linux_get_miimon(const char *name, bool *miimon)
1094 struct mii_ioctl_data data;
1099 memset(&data, 0, sizeof data);
1100 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1102 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1103 data.reg_num = MII_BMSR;
1104 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1108 *miimon = !!(data.val_out & BMSR_LSTATUS);
1110 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1113 struct ethtool_cmd ecmd;
1115 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1118 memset(&ecmd, 0, sizeof ecmd);
1119 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1122 struct ethtool_value eval;
1124 memcpy(&eval, &ecmd, sizeof eval);
1125 *miimon = !!eval.data;
1127 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1135 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1136 long long int interval)
1138 struct netdev_dev_linux *netdev_dev;
1140 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
/* A positive interval is raised to at least 100; any other value becomes 0,
 * which netdev_linux_miimon_run() treats as "monitoring disabled". */
1142 interval = interval > 0 ? MAX(interval, 100) : 0;
1143 if (netdev_dev->miimon_interval != interval) {
1144 netdev_dev->miimon_interval = interval;
/* Expire the timer so that the timer_expired() check in
 * netdev_linux_miimon_run() passes on its next run instead of waiting
 * out the previous interval. */
1145 timer_set_expired(&netdev_dev->miimon_timer);
1152 netdev_linux_miimon_run(void)
1154 struct shash device_shash;
1155 struct shash_node *node;
1157 shash_init(&device_shash);
1158 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1159 SHASH_FOR_EACH (node, &device_shash) {
1160 struct netdev_dev_linux *dev = node->data;
1163 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1167 netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
1168 if (miimon != dev->miimon) {
1169 dev->miimon = miimon;
1170 netdev_dev_linux_changed(dev, dev->ifi_flags);
1173 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1176 shash_destroy(&device_shash);
1180 netdev_linux_miimon_wait(void)
1182 struct shash device_shash;
1183 struct shash_node *node;
1185 shash_init(&device_shash);
1186 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1187 SHASH_FOR_EACH (node, &device_shash) {
1188 struct netdev_dev_linux *dev = node->data;
1190 if (dev->miimon_interval > 0) {
1191 timer_wait(&dev->miimon_timer);
1194 shash_destroy(&device_shash);
1197 /* Check whether we can use RTM_GETLINK to get network device statistics.
1198 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1201 check_for_working_netlink_stats(void)
1203 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1204 * preferable, so if that works, we'll use it. */
1205 int ifindex = do_get_ifindex("lo");
1207 VLOG_WARN("failed to get ifindex for lo, "
1208 "obtaining netdev stats from proc");
1211 struct netdev_stats stats;
1212 int error = get_stats_via_netlink(ifindex, &stats);
1214 VLOG_DBG("obtaining netdev stats via rtnetlink");
1217 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1218 "via proc (you are probably running a pre-2.6.19 "
1219 "kernel)", strerror(error));
/* Helper taking two uint64_t pointers; netdev_tap_get_stats() applies it to
 * pairs of rx/tx counters.  NOTE(review): the body is not visible here --
 * presumably it exchanges '*a' and '*b'. */
1226 swap_uint64(uint64_t *a, uint64_t *b)
/* Fetches 'stats' for 'netdev_' through netdev_vport_get_stats() and caches
 * the resulting error code in the device.
 *
 * The query is attempted when no error is cached ('vport_stats_error' is 0)
 * or when the cached value is not marked valid (VALID_VPORT_STAT_ERROR unset
 * in 'cache_valid').  The outcome is stored in 'vport_stats_error' and the
 * valid bit is set.  The rate-limited warning reports a failed query
 * (its guarding condition is not visible here). */
1234 get_stats_via_vport(const struct netdev *netdev_,
1235 struct netdev_stats *stats)
1237 struct netdev_dev_linux *netdev_dev =
1238 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1240 if (!netdev_dev->vport_stats_error ||
1241 !(netdev_dev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1244 error = netdev_vport_get_stats(netdev_, stats);
1246 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1247 "(%s)", netdev_get_name(netdev_), strerror(error));
1249 netdev_dev->vport_stats_error = error;
1250 netdev_dev->cache_valid |= VALID_VPORT_STAT_ERROR;
/* Retrieves kernel statistics for 'netdev_' into 'stats'.
 *
 * On first use, check_for_working_netlink_stats() picks the source and the
 * answer is kept in a function-local static.  With netlink, the ifindex is
 * looked up and passed to get_stats_via_netlink(); otherwise
 * get_stats_via_proc() is called with the device name.  A rate-limited
 * warning carries the numeric error code (presumably logged only on
 * failure -- the guarding condition is not visible here). */
1255 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1256 struct netdev_stats *stats)
1258 static int use_netlink_stats = -1;
1261 if (use_netlink_stats < 0) {
1262 use_netlink_stats = check_for_working_netlink_stats();
1265 if (use_netlink_stats) {
1268 error = get_ifindex(netdev_, &ifindex);
1270 error = get_stats_via_netlink(ifindex, stats);
1273 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1277 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1278 netdev_get_name(netdev_), error);
/* Combines vport-layer and kernel statistics for 'netdev_'.
 *
 * get_stats_via_vport() fills 'stats' and netdev_linux_sys_get_stats() fills
 * the local 'dev_stats'.  The error, drop, multicast and collision counters
 * listed at the end are added from 'dev_stats' into 'stats'.  NOTE(review):
 * the bodies of the two 'vport_stats_error' branches are only partly visible
 * here, so which path performs the additions should be confirmed. */
1284 /* Retrieves current device stats for 'netdev-linux'. */
1286 netdev_linux_get_stats(const struct netdev *netdev_,
1287 struct netdev_stats *stats)
1289 struct netdev_dev_linux *netdev_dev =
1290 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1291 struct netdev_stats dev_stats;
1294 get_stats_via_vport(netdev_, stats);
1296 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1299 if (netdev_dev->vport_stats_error) {
1306 if (netdev_dev->vport_stats_error) {
1307 /* stats not available from OVS then use ioctl stats. */
1310 stats->rx_errors += dev_stats.rx_errors;
1311 stats->tx_errors += dev_stats.tx_errors;
1312 stats->rx_dropped += dev_stats.rx_dropped;
1313 stats->tx_dropped += dev_stats.tx_dropped;
1314 stats->multicast += dev_stats.multicast;
1315 stats->collisions += dev_stats.collisions;
1316 stats->rx_length_errors += dev_stats.rx_length_errors;
1317 stats->rx_over_errors += dev_stats.rx_over_errors;
1318 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1319 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1320 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1321 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1322 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1323 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1324 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1325 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1326 stats->tx_window_errors += dev_stats.tx_window_errors;
/* Statistics for tap devices: get_stats_via_vport() fills 'stats' and
 * netdev_linux_sys_get_stats() fills the local 'dev_stats'.
 *
 * Under a 'vport_stats_error' check, the rx/tx packet, byte, error and drop
 * counters are swapped and the detailed rx and tx error counters are zeroed.
 * The statements after that add the 'dev_stats' drops and errors crosswise
 * (tx into rx, rx into tx) together with multicast and collisions.
 * NOTE(review): the branch structure between these two groups is not fully
 * visible here. */
1331 /* Retrieves current device stats for 'netdev-tap' netdev or
1332 * netdev-internal. */
1334 netdev_tap_get_stats(const struct netdev *netdev_,
1335 struct netdev_stats *stats)
1337 struct netdev_dev_linux *netdev_dev =
1338 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1339 struct netdev_stats dev_stats;
1342 get_stats_via_vport(netdev_, stats);
1344 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1346 if (netdev_dev->vport_stats_error) {
1353 /* If this port is an internal port then the transmit and receive stats
1354 * will appear to be swapped relative to the other ports since we are the
1355 * one sending the data, not a remote computer. For consistency, we swap
1356 * them back here. This does not apply if we are getting stats from the
1357 * vport layer because it always tracks stats from the perspective of the
1359 if (netdev_dev->vport_stats_error) {
1361 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1362 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1363 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1364 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1365 stats->rx_length_errors = 0;
1366 stats->rx_over_errors = 0;
1367 stats->rx_crc_errors = 0;
1368 stats->rx_frame_errors = 0;
1369 stats->rx_fifo_errors = 0;
1370 stats->rx_missed_errors = 0;
1371 stats->tx_aborted_errors = 0;
1372 stats->tx_carrier_errors = 0;
1373 stats->tx_fifo_errors = 0;
1374 stats->tx_heartbeat_errors = 0;
1375 stats->tx_window_errors = 0;
1377 stats->rx_dropped += dev_stats.tx_dropped;
1378 stats->tx_dropped += dev_stats.rx_dropped;
1380 stats->rx_errors += dev_stats.tx_errors;
1381 stats->tx_errors += dev_stats.rx_errors;
1383 stats->multicast += dev_stats.multicast;
1384 stats->collisions += dev_stats.collisions;
/* Statistics for internal devices: only the vport layer is consulted, via
 * get_stats_via_vport().  Returns the device's cached 'vport_stats_error'. */
1390 netdev_internal_get_stats(const struct netdev *netdev_,
1391 struct netdev_stats *stats)
1393 struct netdev_dev_linux *netdev_dev =
1394 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1396 get_stats_via_vport(netdev_, stats);
1397 return netdev_dev->vport_stats_error;
/* Queries ETHTOOL_GSET and translates the result into NETDEV_F_* bits.
 *
 * 'ecmd.supported' maps to '*supported', 'ecmd.advertising' maps to
 * '*advertised', and speed, duplex and port map to '*current'; '*peer' is
 * always set to 0.  The link speed is assembled from 'speed_hi' (upper 16
 * bits) and 'speed'.  NOTE(review): the visible statements write through all
 * four output pointers without a null check -- confirm that the caller never
 * passes NULL at this level. */
1400 /* Stores the features supported by 'netdev' into each of '*current',
1401 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1402 * bitmap of NETDEV_* bits. Returns 0 if successful, otherwise a positive
1405 netdev_linux_get_features(const struct netdev *netdev,
1406 enum netdev_features *current,
1407 enum netdev_features *advertised,
1408 enum netdev_features *supported,
1409 enum netdev_features *peer)
1411 struct ethtool_cmd ecmd;
1415 memset(&ecmd, 0, sizeof ecmd);
1416 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1417 ETHTOOL_GSET, "ETHTOOL_GSET");
1422 /* Supported features. */
1424 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1425 *supported |= NETDEV_F_10MB_HD;
1427 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1428 *supported |= NETDEV_F_10MB_FD;
1430 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1431 *supported |= NETDEV_F_100MB_HD;
1433 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1434 *supported |= NETDEV_F_100MB_FD;
1436 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1437 *supported |= NETDEV_F_1GB_HD;
1439 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1440 *supported |= NETDEV_F_1GB_FD;
1442 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1443 *supported |= NETDEV_F_10GB_FD;
1445 if (ecmd.supported & SUPPORTED_TP) {
1446 *supported |= NETDEV_F_COPPER;
1448 if (ecmd.supported & SUPPORTED_FIBRE) {
1449 *supported |= NETDEV_F_FIBER;
1451 if (ecmd.supported & SUPPORTED_Autoneg) {
1452 *supported |= NETDEV_F_AUTONEG;
1454 if (ecmd.supported & SUPPORTED_Pause) {
1455 *supported |= NETDEV_F_PAUSE;
1457 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1458 *supported |= NETDEV_F_PAUSE_ASYM;
1461 /* Advertised features. */
1463 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1464 *advertised |= NETDEV_F_10MB_HD;
1466 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1467 *advertised |= NETDEV_F_10MB_FD;
1469 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1470 *advertised |= NETDEV_F_100MB_HD;
1472 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1473 *advertised |= NETDEV_F_100MB_FD;
1475 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1476 *advertised |= NETDEV_F_1GB_HD;
1478 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1479 *advertised |= NETDEV_F_1GB_FD;
1481 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1482 *advertised |= NETDEV_F_10GB_FD;
1484 if (ecmd.advertising & ADVERTISED_TP) {
1485 *advertised |= NETDEV_F_COPPER;
1487 if (ecmd.advertising & ADVERTISED_FIBRE) {
1488 *advertised |= NETDEV_F_FIBER;
1490 if (ecmd.advertising & ADVERTISED_Autoneg) {
1491 *advertised |= NETDEV_F_AUTONEG;
1493 if (ecmd.advertising & ADVERTISED_Pause) {
1494 *advertised |= NETDEV_F_PAUSE;
1496 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1497 *advertised |= NETDEV_F_PAUSE_ASYM;
1500 /* Current settings. */
1501 speed = (ecmd.speed_hi << 16) | ecmd.speed;
1502 if (speed == SPEED_10) {
1503 *current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1504 } else if (speed == SPEED_100) {
1505 *current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1506 } else if (speed == SPEED_1000) {
1507 *current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1508 } else if (speed == SPEED_10000) {
1509 *current = NETDEV_F_10GB_FD;
1510 } else if (speed == 40000) {
1511 *current = NETDEV_F_40GB_FD;
1512 } else if (speed == 100000) {
1513 *current = NETDEV_F_100GB_FD;
1514 } else if (speed == 1000000) {
1515 *current = NETDEV_F_1TB_FD;
1520 if (ecmd.port == PORT_TP) {
1521 *current |= NETDEV_F_COPPER;
1522 } else if (ecmd.port == PORT_FIBRE) {
1523 *current |= NETDEV_F_FIBER;
1527 *current |= NETDEV_F_AUTONEG;
1530 /* Peer advertisements. */
1531 *peer = 0; /* XXX */
/* Reads the current settings with ETHTOOL_GSET, rebuilds 'ecmd.advertising'
 * from scratch out of the NETDEV_F_* bits in 'advertise', and writes the
 * command back with ETHTOOL_SSET, whose status is returned.  Bits of
 * 'advertise' with no ADVERTISED_* counterpart below are ignored. */
1536 /* Set the features advertised by 'netdev' to 'advertise'. */
1538 netdev_linux_set_advertisements(struct netdev *netdev,
1539 enum netdev_features advertise)
1541 struct ethtool_cmd ecmd;
1544 memset(&ecmd, 0, sizeof ecmd);
1545 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1546 ETHTOOL_GSET, "ETHTOOL_GSET");
1551 ecmd.advertising = 0;
1552 if (advertise & NETDEV_F_10MB_HD) {
1553 ecmd.advertising |= ADVERTISED_10baseT_Half;
1555 if (advertise & NETDEV_F_10MB_FD) {
1556 ecmd.advertising |= ADVERTISED_10baseT_Full;
1558 if (advertise & NETDEV_F_100MB_HD) {
1559 ecmd.advertising |= ADVERTISED_100baseT_Half;
1561 if (advertise & NETDEV_F_100MB_FD) {
1562 ecmd.advertising |= ADVERTISED_100baseT_Full;
1564 if (advertise & NETDEV_F_1GB_HD) {
1565 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1567 if (advertise & NETDEV_F_1GB_FD) {
1568 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1570 if (advertise & NETDEV_F_10GB_FD) {
1571 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1573 if (advertise & NETDEV_F_COPPER) {
1574 ecmd.advertising |= ADVERTISED_TP;
1576 if (advertise & NETDEV_F_FIBER) {
1577 ecmd.advertising |= ADVERTISED_FIBRE;
1579 if (advertise & NETDEV_F_AUTONEG) {
1580 ecmd.advertising |= ADVERTISED_Autoneg;
1582 if (advertise & NETDEV_F_PAUSE) {
1583 ecmd.advertising |= ADVERTISED_Pause;
1585 if (advertise & NETDEV_F_PAUSE_ASYM) {
1586 ecmd.advertising |= ADVERTISED_Asym_Pause;
1588 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1589 ETHTOOL_SSET, "ETHTOOL_SSET");
/* 'kbits_burst' is normalized first: forced to 0 when 'kbits_rate' is 0 and
 * defaulted to 1000 when given as 0 with a nonzero rate.  If the normalized
 * pair equals the cached pair (VALID_POLICING set), the settings are assumed
 * unchanged.  Otherwise the existing ingress qdisc is removed, an ingress
 * qdisc and a policer are added, and the new pair is cached.  Each failing
 * step logs a rate-limited warning.  NOTE(review): whether the two "add"
 * steps are skipped for a zero rate is not visible here. */
1592 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1593 * successful, otherwise a positive errno value. */
1595 netdev_linux_set_policing(struct netdev *netdev,
1596 uint32_t kbits_rate, uint32_t kbits_burst)
1598 struct netdev_dev_linux *netdev_dev =
1599 netdev_dev_linux_cast(netdev_get_dev(netdev));
1600 const char *netdev_name = netdev_get_name(netdev);
1604 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1605 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1606 : kbits_burst); /* Stick with user-specified value. */
1608 if (netdev_dev->cache_valid & VALID_POLICING
1609 && netdev_dev->kbits_rate == kbits_rate
1610 && netdev_dev->kbits_burst == kbits_burst) {
1611 /* Assume that settings haven't changed since we last set them. */
1615 COVERAGE_INC(netdev_set_policing);
1616 /* Remove any existing ingress qdisc. */
1617 error = tc_add_del_ingress_qdisc(netdev, false);
1619 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1620 netdev_name, strerror(error));
1625 error = tc_add_del_ingress_qdisc(netdev, true);
1627 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1628 netdev_name, strerror(error));
1632 error = tc_add_policer(netdev, kbits_rate, kbits_burst);
1634 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1635 netdev_name, strerror(error));
1640 netdev_dev->kbits_rate = kbits_rate;
1641 netdev_dev->kbits_burst = kbits_burst;
1642 netdev_dev->cache_valid |= VALID_POLICING;
/* Adds to 'types' the 'ovs_name' of every entry in the NULL-terminated 'tcs'
 * table that has a 'tc_install' callback and a nonempty 'ovs_name'.  The
 * 'netdev' argument is unused. */
1648 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1651 const struct tc_ops **opsp;
1653 for (opsp = tcs; *opsp != NULL; opsp++) {
1654 const struct tc_ops *ops = *opsp;
1655 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1656 sset_add(types, ops->ovs_name);
/* Walks the NULL-terminated 'tcs' table comparing 'name' with each entry's
 * 'ovs_name' using strcmp().  NOTE(review): the return statements are not
 * visible here -- presumably the matching entry, or NULL when none matches
 * (netdev_linux_set_qos() tests the result for null). */
1662 static const struct tc_ops *
1663 tc_lookup_ovs_name(const char *name)
1665 const struct tc_ops **opsp;
1667 for (opsp = tcs; *opsp != NULL; opsp++) {
1668 const struct tc_ops *ops = *opsp;
1669 if (!strcmp(name, ops->ovs_name)) {
/* Walks the NULL-terminated 'tcs' table comparing 'name' with each entry's
 * 'linux_name' using strcmp().  Entries whose 'linux_name' is null are never
 * compared.  NOTE(review): the return statements are not visible here. */
1676 static const struct tc_ops *
1677 tc_lookup_linux_name(const char *name)
1679 const struct tc_ops **opsp;
1681 for (opsp = tcs; *opsp != NULL; opsp++) {
1682 const struct tc_ops *ops = *opsp;
1683 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Searches the bucket selected by 'hash' in the device's 'tc->queues' map
 * for a queue whose 'queue_id' equals 'queue_id'.  Dereferences
 * 'netdev_dev->tc', so the qdisc state must already be present. */
1690 static struct tc_queue *
1691 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1694 struct netdev_dev_linux *netdev_dev =
1695 netdev_dev_linux_cast(netdev_get_dev(netdev));
1696 struct tc_queue *queue;
1698 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1699 if (queue->queue_id == queue_id) {
/* Convenience wrapper around tc_find_queue__() that derives the hash from
 * 'queue_id' with hash_int(queue_id, 0). */
1706 static struct tc_queue *
1707 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1709 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Looks up the tc_ops registered under 'type' and copies its 'n_queues' into
 * 'caps'.  The 'netdev' argument is unused.  NOTE(review): the handling of an
 * unknown 'type' is not visible here. */
1713 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1715 struct netdev_qos_capabilities *caps)
1717 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1721 caps->n_queues = ops->n_queues;
/* After tc_query_qdisc(), stores the current qdisc's 'ovs_name' in '*typep'
 * and, when the ops provide a 'qdisc_get' callback, delegates to it to fill
 * 'details'. */
1726 netdev_linux_get_qos(const struct netdev *netdev,
1727 const char **typep, struct shash *details)
1729 struct netdev_dev_linux *netdev_dev =
1730 netdev_dev_linux_cast(netdev_get_dev(netdev));
1733 error = tc_query_qdisc(netdev);
1738 *typep = netdev_dev->tc->ops->ovs_name;
1739 return (netdev_dev->tc->ops->qdisc_get
1740 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
/* Switches 'netdev' to the QoS discipline named 'type'.
 *
 * The lookup must yield ops that have a 'tc_install' callback.  When the
 * device already uses those ops, the call is forwarded to 'qdisc_set' (or
 * returns 0 if there is none).  Otherwise the existing qdisc is deleted and
 * the new one installed.  The assertions state the expected invariants:
 * 'tc' is null after deletion, and non-null exactly when installation
 * succeeded. */
1745 netdev_linux_set_qos(struct netdev *netdev,
1746 const char *type, const struct shash *details)
1748 struct netdev_dev_linux *netdev_dev =
1749 netdev_dev_linux_cast(netdev_get_dev(netdev));
1750 const struct tc_ops *new_ops;
1753 new_ops = tc_lookup_ovs_name(type);
1754 if (!new_ops || !new_ops->tc_install) {
1758 error = tc_query_qdisc(netdev);
1763 if (new_ops == netdev_dev->tc->ops) {
1764 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1766 /* Delete existing qdisc. */
1767 error = tc_del_qdisc(netdev);
1771 assert(netdev_dev->tc == NULL);
1773 /* Install new qdisc. */
1774 error = new_ops->tc_install(netdev, details);
1775 assert((error == 0) == (netdev_dev->tc != NULL));
/* After tc_query_qdisc(), looks up 'queue_id' with tc_find_queue() and
 * forwards to the ops' class_get() callback to fill 'details'.
 * NOTE(review): the result for a missing queue is not visible here. */
1782 netdev_linux_get_queue(const struct netdev *netdev,
1783 unsigned int queue_id, struct shash *details)
1785 struct netdev_dev_linux *netdev_dev =
1786 netdev_dev_linux_cast(netdev_get_dev(netdev));
1789 error = tc_query_qdisc(netdev);
1793 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1795 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
/* Configures queue 'queue_id' of 'netdev' from 'details' by forwarding to the
 * qdisc's class_set() callback.  A 'queue_id' at or above the ops'
 * 'n_queues', or ops without class_set, take the separate branch tested below
 * (its result is not visible here). */
1801 netdev_linux_set_queue(struct netdev *netdev,
1802 unsigned int queue_id, const struct shash *details)
1804 struct netdev_dev_linux *netdev_dev =
1805 netdev_dev_linux_cast(netdev_get_dev(netdev));
1808 error = tc_query_qdisc(netdev);
1811 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1812 || !netdev_dev->tc->ops->class_set) {
1816 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
/* After tc_query_qdisc(), checks that the ops provide class_delete(), looks
 * up 'queue_id' with tc_find_queue(), and forwards the queue to
 * class_delete().  NOTE(review): the results for the unsupported and
 * missing-queue cases are not visible here. */
1820 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1822 struct netdev_dev_linux *netdev_dev =
1823 netdev_dev_linux_cast(netdev_get_dev(netdev));
1826 error = tc_query_qdisc(netdev);
1829 } else if (!netdev_dev->tc->ops->class_delete) {
1832 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1834 ? netdev_dev->tc->ops->class_delete(netdev, queue)
/* After tc_query_qdisc(), checks that the ops provide class_get_stats(),
 * looks up 'queue_id' with tc_find_queue(), and forwards the queue to
 * class_get_stats() to fill 'stats'.  NOTE(review): the results for the
 * unsupported and missing-queue cases are not visible here. */
1840 netdev_linux_get_queue_stats(const struct netdev *netdev,
1841 unsigned int queue_id,
1842 struct netdev_queue_stats *stats)
1844 struct netdev_dev_linux *netdev_dev =
1845 netdev_dev_linux_cast(netdev_get_dev(netdev));
1848 error = tc_query_qdisc(netdev);
1851 } else if (!netdev_dev->tc->ops->class_get_stats) {
1854 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1856 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
/* Builds an RTM_GETTCLASS request for 'netdev' with 'tcm_parent' 0, starts a
 * netlink dump of it on 'rtnl_sock' into 'dump', and releases the request
 * buffer.  netdev_linux_dump_queue_stats() treats a false result as
 * failure. */
1862 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1864 struct ofpbuf request;
1865 struct tcmsg *tcmsg;
1867 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1871 tcmsg->tcm_parent = 0;
1872 nl_dump_start(dump, rtnl_sock, &request);
1873 ofpbuf_uninit(&request);
/* Iterates over the queues in 'tc->queues'.  For each one, 'details' is
 * cleared and refilled through the ops' class_get() callback, and 'cb' is
 * invoked with the queue id, the details and 'aux'.  Requires ops that
 * provide class_get().  NOTE(review): how per-queue errors are accumulated
 * is not visible here. */
1878 netdev_linux_dump_queues(const struct netdev *netdev,
1879 netdev_dump_queues_cb *cb, void *aux)
1881 struct netdev_dev_linux *netdev_dev =
1882 netdev_dev_linux_cast(netdev_get_dev(netdev));
1883 struct tc_queue *queue;
1884 struct shash details;
1888 error = tc_query_qdisc(netdev);
1891 } else if (!netdev_dev->tc->ops->class_get) {
1896 shash_init(&details);
1897 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
1898 shash_clear(&details);
1900 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
1902 (*cb)(queue->queue_id, &details, aux);
1907 shash_destroy(&details);
/* Starts a class dump with start_queue_dump() and hands every reply message
 * to the ops' class_dump_stats() callback together with 'cb' and 'aux'.
 * Returns the nl_dump_done() status when it is nonzero, otherwise
 * 'last_error'.  Requires ops that provide class_dump_stats(). */
1913 netdev_linux_dump_queue_stats(const struct netdev *netdev,
1914 netdev_dump_queue_stats_cb *cb, void *aux)
1916 struct netdev_dev_linux *netdev_dev =
1917 netdev_dev_linux_cast(netdev_get_dev(netdev));
1918 struct nl_dump dump;
1923 error = tc_query_qdisc(netdev);
1926 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1931 if (!start_queue_dump(netdev, &dump)) {
1934 while (nl_dump_next(&dump, &msg)) {
1935 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1941 error = nl_dump_done(&dump);
1942 return error ? error : last_error;
/* Copies the device's cached IPv4 address and netmask into '*address' and
 * '*netmask'.  When VALID_IN4 is not set in 'cache_valid', the cache is
 * filled first with SIOCGIFADDR and SIOCGIFNETMASK.  Returns EADDRNOTAVAIL
 * when the address is INADDR_ANY, otherwise 0. */
1946 netdev_linux_get_in4(const struct netdev *netdev_,
1947 struct in_addr *address, struct in_addr *netmask)
1949 struct netdev_dev_linux *netdev_dev =
1950 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1952 if (!(netdev_dev->cache_valid & VALID_IN4)) {
1955 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
1956 SIOCGIFADDR, "SIOCGIFADDR");
1961 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
1962 SIOCGIFNETMASK, "SIOCGIFNETMASK");
1967 netdev_dev->cache_valid |= VALID_IN4;
1969 *address = netdev_dev->address;
1970 *netmask = netdev_dev->netmask;
1971 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
/* Assigns 'address' to 'netdev_' with SIOCSIFADDR, records 'address' and
 * 'netmask' in the device cache (setting VALID_IN4), and, unless 'address'
 * is INADDR_ANY, applies 'netmask' with SIOCSIFNETMASK.  NOTE(review): the
 * cache update is presumably conditional on the first ioctl succeeding --
 * the guarding condition is not visible here. */
1975 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
1976 struct in_addr netmask)
1978 struct netdev_dev_linux *netdev_dev =
1979 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1982 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
1984 netdev_dev->cache_valid |= VALID_IN4;
1985 netdev_dev->address = address;
1986 netdev_dev->netmask = netmask;
1987 if (address.s_addr != INADDR_ANY) {
1988 error = do_set_addr(netdev_, SIOCSIFNETMASK,
1989 "SIOCSIFNETMASK", netmask);
/* Parses one text line in the layout read by netdev_linux_get_in6() from
 * /proc/net/if_inet6: sixteen two-digit hex bytes stored into 'in6', four
 * hex fields that are skipped, and an interface name of at most 16
 * characters stored into 'ifname'. */
1996 parse_if_inet6_line(const char *line,
1997 struct in6_addr *in6, char ifname[16 + 1])
1999 uint8_t *s6 = in6->s6_addr;
2000 #define X8 "%2"SCNx8
2002 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
2003 "%*x %*x %*x %*x %16s\n",
2004 &s6[0], &s6[1], &s6[2], &s6[3],
2005 &s6[4], &s6[5], &s6[6], &s6[7],
2006 &s6[8], &s6[9], &s6[10], &s6[11],
2007 &s6[12], &s6[13], &s6[14], &s6[15],
/* The result is cached per device: when VALID_IN6 is not set, the cached
 * address is reset to in6addr_any and /proc/net/if_inet6 is scanned line by
 * line for an entry whose interface name equals the device name; a match
 * replaces the cached address.  VALID_IN6 is then set and the cached value
 * is copied to '*in6'. */
2011 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2012 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2014 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2016 struct netdev_dev_linux *netdev_dev =
2017 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2018 if (!(netdev_dev->cache_valid & VALID_IN6)) {
2022 netdev_dev->in6 = in6addr_any;
2024 file = fopen("/proc/net/if_inet6", "r");
2026 const char *name = netdev_get_name(netdev_);
2027 while (fgets(line, sizeof line, file)) {
2028 struct in6_addr in6_tmp;
2029 char ifname[16 + 1];
2030 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2031 && !strcmp(name, ifname))
2033 netdev_dev->in6 = in6_tmp;
2039 netdev_dev->cache_valid |= VALID_IN6;
2041 *in6 = netdev_dev->in6;
/* Fills '*sa' with an AF_INET socket address holding 'addr'.  A zeroed
 * 'struct sockaddr_in' is built locally, '*sa' is zeroed, and the local
 * value is copied over it with memcpy(). */
2046 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2048 struct sockaddr_in sin;
2049 memset(&sin, 0, sizeof sin);
2050 sin.sin_family = AF_INET;
2051 sin.sin_addr = addr;
2054 memset(sa, 0, sizeof *sa);
2055 memcpy(sa, &sin, sizeof sin);
/* Issues the address-setting ioctl 'ioctl_nr' for 'netdev': the interface
 * name and an AF_INET form of 'addr' are placed in an ifreq, which is passed
 * to netdev_linux_do_ioctl().  'ioctl_name' is the request's printable
 * name (presumably used for logging by the callee). */
2059 do_set_addr(struct netdev *netdev,
2060 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2063 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2064 make_in4_sockaddr(&ifr.ifr_addr, addr);
2066 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
/* Implemented with the SIOCADDRT ioctl on 'af_inet_sock': destination and
 * netmask are INADDR_ANY, the gateway is 'router', and the route flags are
 * RTF_UP | RTF_GATEWAY.  The 'netdev' argument is unused.  The warning
 * reports the errno of a failed ioctl. */
2070 /* Adds 'router' as a default IP gateway. */
2072 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2074 struct in_addr any = { INADDR_ANY };
2078 memset(&rt, 0, sizeof rt);
2079 make_in4_sockaddr(&rt.rt_dst, any);
2080 make_in4_sockaddr(&rt.rt_gateway, router);
2081 make_in4_sockaddr(&rt.rt_genmask, any);
2082 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2083 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2085 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
/* Scans /proc/net/route for a route that is up and whose masked destination
 * matches 'host'.
 *
 * '*netdev_name' starts out as NULL.  On a match, 'next_hop' is set to 0 for
 * a directly reachable host or to the route's gateway otherwise, and
 * '*netdev_name' receives an xstrdup() copy of the interface name, which the
 * caller presumably must free.  Lines that do not parse into the 11
 * expected fields are reported with a rate-limited warning. */
2091 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2094 static const char fn[] = "/proc/net/route";
2099 *netdev_name = NULL;
2100 stream = fopen(fn, "r");
2101 if (stream == NULL) {
2102 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2107 while (fgets(line, sizeof line, stream)) {
2110 ovs_be32 dest, gateway, mask;
2111 int refcnt, metric, mtu;
2112 unsigned int flags, use, window, irtt;
2115 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2117 iface, &dest, &gateway, &flags, &refcnt,
2118 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2120 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2124 if (!(flags & RTF_UP)) {
2125 /* Skip routes that aren't up. */
2129 /* The output of 'dest', 'mask', and 'gateway' were given in
2130 * network byte order, so we don't need any endian
2131 * conversions here. */
2132 if ((dest & mask) == (host->s_addr & mask)) {
2134 /* The host is directly reachable. */
2135 next_hop->s_addr = 0;
2137 /* To reach the host, we must go through a gateway. */
2138 next_hop->s_addr = gateway;
2140 *netdev_name = xstrdup(iface);
/* Queries driver information with ETHTOOL_GDRVINFO (the zeroed
 * 'struct ethtool_drvinfo' is passed through the ethtool helper via a cast)
 * and adds heap-allocated copies of the driver name, driver version and
 * firmware version to 'sh' under the keys "driver_name", "driver_version"
 * and "firmware_version". */
2152 netdev_linux_get_status(const struct netdev *netdev, struct shash *sh)
2154 struct ethtool_drvinfo drvinfo;
2157 memset(&drvinfo, 0, sizeof drvinfo);
2158 error = netdev_linux_do_ethtool(netdev_get_name(netdev),
2159 (struct ethtool_cmd *)&drvinfo,
2161 "ETHTOOL_GDRVINFO");
2163 shash_add(sh, "driver_name", xstrdup(drvinfo.driver));
2164 shash_add(sh, "driver_version", xstrdup(drvinfo.version));
2165 shash_add(sh, "firmware_version", xstrdup(drvinfo.fw_version));
/* Implemented with the SIOCGARP ioctl on 'af_inet_sock'.  The request names
 * the device and carries 'ip' as an AF_INET protocol address with an
 * ARPHRD_ETHER hardware family.  ENXIO results are not logged; other
 * failures produce a rate-limited warning. */
2171 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2172 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2173 * returns 0. Otherwise, it returns a positive errno value; in particular,
2174 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
2176 netdev_linux_arp_lookup(const struct netdev *netdev,
2177 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2180 struct sockaddr_in sin;
2183 memset(&r, 0, sizeof r);
2184 memset(&sin, 0, sizeof sin);
2185 sin.sin_family = AF_INET;
2186 sin.sin_addr.s_addr = ip;
2188 memcpy(&r.arp_pa, &sin, sizeof sin);
2189 r.arp_ha.sa_family = ARPHRD_ETHER;
2191 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2192 COVERAGE_INC(netdev_arp_lookup);
2193 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2195 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2196 } else if (retval != ENXIO) {
2197 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2198 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
/* Converts a set of 'enum netdev_flags' bits to interface (IFF_*) flags;
 * NETDEV_UP and NETDEV_PROMISC are the bits tested.  NOTE(review): the
 * assignments and the return statement are not visible here. */
2204 nd_to_iff_flags(enum netdev_flags nd)
2207 if (nd & NETDEV_UP) {
2210 if (nd & NETDEV_PROMISC) {
/* Converts interface (IFF_*) flags to 'enum netdev_flags' bits, starting
 * from 0; IFF_PROMISC maps to NETDEV_PROMISC.  NOTE(review): other mappings
 * and the return statement are not visible here. */
2217 iff_to_nd_flags(int iff)
2219 enum netdev_flags nd = 0;
2223 if (iff & IFF_PROMISC) {
2224 nd |= NETDEV_PROMISC;
/* Reports the device's current flags through '*old_flagsp', then clears the
 * 'off' bits and sets the 'on' bits.  The starting point is the cached
 * 'ifi_flags'.  Only when the computed value differs from it is set_flags()
 * called, after which the cache is refreshed with get_flags(). */
2230 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2231 enum netdev_flags on, enum netdev_flags *old_flagsp)
2233 struct netdev_dev_linux *netdev_dev;
2234 int old_flags, new_flags;
2237 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2238 old_flags = netdev_dev->ifi_flags;
2239 *old_flagsp = iff_to_nd_flags(old_flags);
2240 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2241 if (new_flags != old_flags) {
2242 error = set_flags(netdev, new_flags);
2243 get_flags(&netdev_dev->netdev_dev, &netdev_dev->ifi_flags);
/* Returns the 'change_seq' member of the device underlying 'netdev'. */
2249 netdev_linux_change_seq(const struct netdev *netdev)
2251 return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
/* Shared initializer body for the netdev classes defined after this macro.
 * NAME, CREATE, GET_STATS and SET_STATS are the per-class parameters; the
 * remaining visible members are the netdev_linux_* implementations listed
 * here, with NULL for get_config and set_config.  NOTE(review): several
 * member lines are not visible here, so positions cannot be checked against
 * 'struct netdev_class' from this view. */
2254 #define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS) \
2258 netdev_linux_init, \
2260 netdev_linux_wait, \
2263 netdev_linux_destroy, \
2264 NULL, /* get_config */ \
2265 NULL, /* set_config */ \
2267 netdev_linux_open, \
2268 netdev_linux_close, \
2270 netdev_linux_listen, \
2271 netdev_linux_recv, \
2272 netdev_linux_recv_wait, \
2273 netdev_linux_drain, \
2275 netdev_linux_send, \
2276 netdev_linux_send_wait, \
2278 netdev_linux_set_etheraddr, \
2279 netdev_linux_get_etheraddr, \
2280 netdev_linux_get_mtu, \
2281 netdev_linux_set_mtu, \
2282 netdev_linux_get_ifindex, \
2283 netdev_linux_get_carrier, \
2284 netdev_linux_get_carrier_resets, \
2285 netdev_linux_set_miimon_interval, \
2289 netdev_linux_get_features, \
2290 netdev_linux_set_advertisements, \
2292 netdev_linux_set_policing, \
2293 netdev_linux_get_qos_types, \
2294 netdev_linux_get_qos_capabilities, \
2295 netdev_linux_get_qos, \
2296 netdev_linux_set_qos, \
2297 netdev_linux_get_queue, \
2298 netdev_linux_set_queue, \
2299 netdev_linux_delete_queue, \
2300 netdev_linux_get_queue_stats, \
2301 netdev_linux_dump_queues, \
2302 netdev_linux_dump_queue_stats, \
2304 netdev_linux_get_in4, \
2305 netdev_linux_set_in4, \
2306 netdev_linux_get_in6, \
2307 netdev_linux_add_router, \
2308 netdev_linux_get_next_hop, \
2309 netdev_linux_get_status, \
2310 netdev_linux_arp_lookup, \
2312 netdev_linux_update_flags, \
2314 netdev_linux_change_seq \
/* Class created by netdev_linux_create(); statistics come from
 * netdev_linux_get_stats() and setting statistics is unsupported (NULL). */
2317 const struct netdev_class netdev_linux_class =
2320 netdev_linux_create,
2321 netdev_linux_get_stats,
2322 NULL); /* set_stats */
/* Class created by netdev_linux_create_tap(); statistics come from
 * netdev_tap_get_stats() and setting statistics is unsupported (NULL). */
2324 const struct netdev_class netdev_tap_class =
2327 netdev_linux_create_tap,
2328 netdev_tap_get_stats,
2329 NULL); /* set_stats */
/* Class created by netdev_linux_create(); statistics come from
 * netdev_internal_get_stats() and may be set via netdev_vport_set_stats(). */
2331 const struct netdev_class netdev_internal_class =
2334 netdev_linux_create,
2335 netdev_internal_get_stats,
2336 netdev_vport_set_stats);
/* Data for the HTB qdisc.  HTB_N_QUEUES is also the largest class minor
 * number that htb_parse_tcmsg__() maps to a queue id.  The fields after the
 * embedded 'struct tc_queue' hold one class's rates (bytes/s), burst (bytes)
 * and priority, as their own comments state.  NOTE(review): the struct
 * headers are not visible here; the first 'max_rate' presumably belongs to
 * 'struct htb'. */
2338 /* HTB traffic control class. */
2340 #define HTB_N_QUEUES 0xf000
2344 unsigned int max_rate; /* In bytes/s. */
2348 struct tc_queue tc_queue;
2349 unsigned int min_rate; /* In bytes/s. */
2350 unsigned int max_rate; /* In bytes/s. */
2351 unsigned int burst; /* In bytes. */
2352 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the 'struct htb' that embeds the device's 'tc' member, obtained
 * with CONTAINER_OF.  Only meaningful while the device's 'tc' points at the
 * 'tc' member of a 'struct htb'. */
2356 htb_get__(const struct netdev *netdev)
2358 struct netdev_dev_linux *netdev_dev =
2359 netdev_dev_linux_cast(netdev_get_dev(netdev));
2360 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
/* Allocates a 'struct htb' with xmalloc(), initializes its embedded 'tc'
 * with tc_ops_htb, records 'max_rate', and attaches it as the device's 'tc'.
 * NOTE(review): 'max_rate' arrives as uint64_t, while the 'max_rate' fields
 * declared for HTB in this file are 'unsigned int' -- confirm that the
 * narrowing is intended. */
2364 htb_install__(struct netdev *netdev, uint64_t max_rate)
2366 struct netdev_dev_linux *netdev_dev =
2367 netdev_dev_linux_cast(netdev_get_dev(netdev));
2370 htb = xmalloc(sizeof *htb);
2371 tc_init(&htb->tc, &tc_ops_htb);
2372 htb->max_rate = max_rate;
2374 netdev_dev->tc = &htb->tc;
/* Deletes any existing qdisc first (the result of tc_del_qdisc() is
 * ignored), then sends an RTM_NEWQDISC request with NLM_F_EXCL |
 * NLM_F_CREATE for handle 1:0 at TC_H_ROOT, kind "htb", carrying a
 * TCA_HTB_INIT option block whose 'rate2quantum' is 10.  Returns the
 * tc_transact() status. */
2377 /* Create an HTB qdisc.
2379 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2381 htb_setup_qdisc__(struct netdev *netdev)
2384 struct tc_htb_glob opt;
2385 struct ofpbuf request;
2386 struct tcmsg *tcmsg;
2388 tc_del_qdisc(netdev);
2390 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2391 NLM_F_EXCL | NLM_F_CREATE, &request);
2395 tcmsg->tcm_handle = tc_make_handle(1, 0);
2396 tcmsg->tcm_parent = TC_H_ROOT;
2398 nl_msg_put_string(&request, TCA_KIND, "htb");
2400 memset(&opt, 0, sizeof opt);
2401 opt.rate2quantum = 10;
2405 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2406 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2407 nl_msg_end_nested(&request, opt_offset);
2409 return tc_transact(&request, NULL);
/* Creates or replaces HTB class 'handle' under 'parent' from the values in
 * 'class'.  The device MTU is required: the rate and ceil tables come from
 * tc_fill_rate() and both buffers from tc_calc_buffer() using 'class->burst'.
 * The request is an RTM_NEWTCLASS message with NLM_F_CREATE carrying
 * TCA_HTB_PARMS plus the rate and ceil tables.  A failed transaction is
 * reported with a rate-limited warning listing all parameters. */
2412 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2413 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2415 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2416 unsigned int parent, struct htb_class *class)
2419 struct tc_htb_opt opt;
2420 struct ofpbuf request;
2421 struct tcmsg *tcmsg;
2425 error = netdev_get_mtu(netdev, &mtu);
2427 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2428 netdev_get_name(netdev));
2432 memset(&opt, 0, sizeof opt);
2433 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2434 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2435 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2436 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2437 opt.prio = class->priority;
2439 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2443 tcmsg->tcm_handle = handle;
2444 tcmsg->tcm_parent = parent;
2446 nl_msg_put_string(&request, TCA_KIND, "htb");
2447 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2448 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2449 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2450 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2451 nl_msg_end_nested(&request, opt_offset);
2453 error = tc_transact(&request, NULL);
2455 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2456 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2457 netdev_get_name(netdev),
2458 tc_get_major(handle), tc_get_minor(handle),
2459 tc_get_major(parent), tc_get_minor(parent),
2460 class->min_rate, class->max_rate,
2461 class->burst, class->priority, strerror(error));
/* Decodes the nested attributes in 'nl_options' against a policy that
 * requires TCA_HTB_PARMS to be at least sizeof(struct tc_htb_opt), then
 * fills '*class': min_rate from the rate, max_rate from the ceil, burst
 * converted from buffer ticks with tc_ticks_to_bytes(), and priority from
 * prio.  A parse failure logs a rate-limited warning.  NOTE(review): the
 * original comment that follows speaks of 'details', but the visible code
 * stores its results in '*class'. */
2466 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2467 * description of them into 'details'. The description complies with the
2468 * specification given in the vswitch database documentation for linux-htb
2471 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2473 static const struct nl_policy tca_htb_policy[] = {
2474 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2475 .min_len = sizeof(struct tc_htb_opt) },
2478 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2479 const struct tc_htb_opt *htb;
2481 if (!nl_parse_nested(nl_options, tca_htb_policy,
2482 attrs, ARRAY_SIZE(tca_htb_policy))) {
2483 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2487 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2488 class->min_rate = htb->rate.rate;
2489 class->max_rate = htb->ceil.rate;
2490 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2491 class->priority = htb->prio;
/* Parses a traffic-class message with tc_parse_class(), which also receives
 * 'stats'.  When 'queue_id' is nonnull, a handle with major 1 and a minor in
 * the range 1..HTB_N_QUEUES yields queue id 'minor - 1' (the result for other
 * handles is not visible here).  When 'options' is nonnull, the nested
 * attributes are decoded by htb_parse_tca_options__(). */
2496 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2497 struct htb_class *options,
2498 struct netdev_queue_stats *stats)
2500 struct nlattr *nl_options;
2501 unsigned int handle;
2504 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2505 if (!error && queue_id) {
2506 unsigned int major = tc_get_major(handle);
2507 unsigned int minor = tc_get_minor(handle);
2508 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2509 *queue_id = minor - 1;
2514 if (!error && options) {
2515 error = htb_parse_tca_options__(nl_options, options);
/* Derives the qdisc-level HTB parameters for 'hc' from 'details'.
 *
 * The "max-rate" entry is parsed as a decimal number and divided by 8.  When
 * it is absent or works out to 0, the rate is taken from the device's
 * current link features via netdev_get_features() and
 * netdev_features_to_bps(), again divided by 8.  'min_rate' is set equal to
 * 'max_rate'.  NOTE(review): the division by 8 presumably converts bits/s to
 * bytes/s -- confirm against the database documentation. */
2521 htb_parse_qdisc_details__(struct netdev *netdev,
2522 const struct shash *details, struct htb_class *hc)
2524 const char *max_rate_s;
2526 max_rate_s = shash_find_data(details, "max-rate");
2527 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2528 if (!hc->max_rate) {
2531 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2532 hc->max_rate = netdev_features_to_bps(current) / 8;
2534 hc->min_rate = hc->max_rate;
/* Derives one HTB class's parameters for 'hc' from the string-valued
 * 'details' entries "min-rate", "max-rate", "burst" and "priority".
 *
 * Rates and burst are parsed as decimal numbers and divided by 8.
 * 'min_rate' is raised to at least the device MTU and then capped at the
 * qdisc's 'max_rate'.  'max_rate' is raised to at least 'min_rate' and capped
 * the same way.  'burst' is raised to at least MTU + 64.  'priority' defaults
 * to 0.  A device without an MTU is reported with a rate-limited warning. */
2540 htb_parse_class_details__(struct netdev *netdev,
2541 const struct shash *details, struct htb_class *hc)
2543 const struct htb *htb = htb_get__(netdev);
2544 const char *min_rate_s = shash_find_data(details, "min-rate");
2545 const char *max_rate_s = shash_find_data(details, "max-rate");
2546 const char *burst_s = shash_find_data(details, "burst");
2547 const char *priority_s = shash_find_data(details, "priority");
2550 error = netdev_get_mtu(netdev, &mtu);
2552 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2553 netdev_get_name(netdev));
2557 /* HTB requires at least an mtu sized min-rate to send any traffic even
2558 * on uncongested links. */
2559 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2560 hc->min_rate = MAX(hc->min_rate, mtu);
2561 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2564 hc->max_rate = (max_rate_s
2565 ? strtoull(max_rate_s, NULL, 10) / 8
2567 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2568 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2572 * According to hints in the documentation that I've read, it is important
2573 * that 'burst' be at least as big as the largest frame that might be
2574 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2575 * but having it a bit too small is a problem. Since netdev_get_mtu()
2576 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2577 * the MTU. We actually add 64, instead of 14, as a guard against
2578 * additional headers get tacked on somewhere that we're not aware of. */
2579 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2580 hc->burst = MAX(hc->burst, mtu + 64);
2583 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Asks the kernel about class 'handle' (with parent 'parent') on 'netdev'
 * using tc_query_class(), then decodes the reply with htb_parse_tcmsg__()
 * into 'options' and 'stats'.  No queue ID is requested (NULL is passed).
 * The reply buffer is deleted once it has been parsed. */
2589 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2590 unsigned int parent, struct htb_class *options,
2591 struct netdev_queue_stats *stats)
2593 struct ofpbuf *reply;
2596 error = tc_query_class(netdev, handle, parent, &reply);
2598 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2599 ofpbuf_delete(reply);
/* Installs HTB on 'netdev' according to 'details'.
 *
 * Sets up the qdisc with htb_setup_qdisc__(), programs class 1:fffe under
 * 1:0 with the qdisc-level settings parsed from 'details', and records the
 * resulting max_rate in the in-memory state with htb_install__(). */
2605 htb_tc_install(struct netdev *netdev, const struct shash *details)
2609 error = htb_setup_qdisc__(netdev);
2611 struct htb_class hc;
2613 htb_parse_qdisc_details__(netdev, details, &hc);
2614 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2615 tc_make_handle(1, 0), &hc);
2617 htb_install__(netdev, hc.max_rate);
/* Returns the struct htb_class that embeds 'queue' as its 'tc_queue'
 * member. */
2623 static struct htb_class *
2624 htb_class_cast__(const struct tc_queue *queue)
2626 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Records the settings in 'hc' for queue 'queue_id' in 'netdev''s in-memory
 * HTB state.
 *
 * The queue is looked up with tc_find_queue__().  The code also has a path
 * that allocates a fresh htb_class and inserts it into htb->tc.queues
 * (presumably taken when the lookup finds nothing -- confirm against the
 * full source).  In either case min_rate, max_rate, burst and priority are
 * then copied from 'hc'. */
2630 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2631 const struct htb_class *hc)
2633 struct htb *htb = htb_get__(netdev);
2634 size_t hash = hash_int(queue_id, 0);
2635 struct tc_queue *queue;
2636 struct htb_class *hcp;
2638 queue = tc_find_queue__(netdev, queue_id, hash);
2640 hcp = htb_class_cast__(queue);
2642 hcp = xmalloc(sizeof *hcp);
2643 queue = &hcp->tc_queue;
2644 queue->queue_id = queue_id;
2645 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2648 hcp->min_rate = hc->min_rate;
2649 hcp->max_rate = hc->max_rate;
2650 hcp->burst = hc->burst;
2651 hcp->priority = hc->priority;
/* Builds the in-memory HTB state for 'netdev' from what the kernel reports.
 *
 * Class 1:fffe is queried for the qdisc-wide max_rate, which is passed to
 * htb_install__().  Then the classes are dumped and every class message that
 * htb_parse_tcmsg__() accepts is recorded with htb_update_queue__().
 * 'nlmsg' is unused. */
2655 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2658 struct nl_dump dump;
2659 struct htb_class hc;
2661 /* Get qdisc options. */
2663 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2664 htb_install__(netdev, hc.max_rate);
2667 if (!start_queue_dump(netdev, &dump)) {
2670 while (nl_dump_next(&dump, &msg)) {
2671 unsigned int queue_id;
2673 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2674 htb_update_queue__(netdev, queue_id, &hc);
2677 nl_dump_done(&dump);
/* Tears down the HTB state that embeds 'tc'.
 *
 * Every queue is unlinked from htb->tc.queues.  The _SAFE iterator is used
 * because the current node is removed inside the loop body. */
2683 htb_tc_destroy(struct tc *tc)
2685 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2686 struct htb_class *hc, *next;
2688 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2689 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Reports the qdisc-wide HTB configuration of 'netdev' by adding a
 * "max-rate" entry to 'details'.  The stored max_rate is multiplied by 8 and
 * formatted as a decimal string allocated with xasprintf(). */
2697 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2699 const struct htb *htb = htb_get__(netdev);
2700 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
/* Applies new qdisc-wide HTB settings from 'details' to 'netdev'.
 *
 * The settings are parsed with htb_parse_qdisc_details__() and class 1:fffe
 * (parent 1:0) is reprogrammed with them; the in-memory max_rate is then
 * updated (presumably only when the class setup succeeded -- the guard is
 * not visible here). */
2705 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2707 struct htb_class hc;
2710 htb_parse_qdisc_details__(netdev, details, &hc);
2711 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2712 tc_make_handle(1, 0), &hc);
2714 htb_get__(netdev)->max_rate = hc.max_rate;
/* Reports the configuration of HTB class 'queue' in 'details'.
 *
 * "min-rate" and "burst" are always added, each as 8 times the stored value.
 * "max-rate" is added only when it differs from min-rate.  "priority" is
 * added as stored.  'netdev' is unused. */
2720 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2721 const struct tc_queue *queue, struct shash *details)
2723 const struct htb_class *hc = htb_class_cast__(queue);
2725 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2726 if (hc->min_rate != hc->max_rate) {
2727 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2729 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2731 shash_add(details, "priority", xasprintf("%u", hc->priority));
/* Creates or reconfigures HTB queue 'queue_id' on 'netdev' from 'details'.
 *
 * The details are parsed with htb_parse_class_details__(), kernel class
 * 1:(queue_id + 1) is set up under parent 1:fffe, and the in-memory queue is
 * updated with htb_update_queue__(). */
2737 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2738 const struct shash *details)
2740 struct htb_class hc;
2743 error = htb_parse_class_details__(netdev, details, &hc);
2748 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2749 tc_make_handle(1, 0xfffe), &hc);
2754 htb_update_queue__(netdev, queue_id, &hc);
/* Deletes HTB class 'queue' from 'netdev'.
 *
 * Kernel class 1:(queue_id + 1) is deleted with tc_delete_class() and the
 * queue is unlinked from htb->tc.queues (presumably only when the kernel
 * deletion succeeded -- the guard is not visible here). */
2759 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2761 struct htb_class *hc = htb_class_cast__(queue);
2762 struct htb *htb = htb_get__(netdev);
2765 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2767 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches statistics for HTB class 'queue' into 'stats' by querying kernel
 * class 1:(queue_id + 1) under parent 1:fffe.  Class options are not
 * requested (NULL is passed).  Returns htb_query_class__()'s result. */
2774 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2775 struct netdev_queue_stats *stats)
2777 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2778 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one class message 'nlmsg' from a queue statistics dump.
 *
 * The handle and statistics are extracted with tc_parse_class().  If the
 * handle is 1:N with 1 <= N <= HTB_N_QUEUES, 'cb' is invoked with queue ID
 * N - 1, the statistics, and 'aux'; other handles are not reported.
 * 'netdev' is unused. */
2782 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2783 const struct ofpbuf *nlmsg,
2784 netdev_dump_queue_stats_cb *cb, void *aux)
2786 struct netdev_queue_stats stats;
2787 unsigned int handle, major, minor;
2790 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2795 major = tc_get_major(handle);
2796 minor = tc_get_minor(handle);
2797 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2798 (*cb)(minor - 1, &stats, aux);
/* Operations table for the HTB qdisc: kernel qdisc name "htb", OVS QoS type
 * "linux-htb", supporting up to HTB_N_QUEUES queues. */
2803 static const struct tc_ops tc_ops_htb = {
2804 "htb", /* linux_name */
2805 "linux-htb", /* ovs_name */
2806 HTB_N_QUEUES, /* n_queues */
2815 htb_class_get_stats, /* class_get_stats */
2816 htb_class_dump_stats /* class_dump_stats */
2819 /* "linux-hfsc" traffic control class. */
/* Largest class minor number treated as an HFSC queue: a handle 1:N with
 * 1 <= N <= HFSC_N_QUEUES maps to queue ID N - 1. */
2821 #define HFSC_N_QUEUES 0xf000
/* Embedded queue node; hfsc_class_cast__() recovers the enclosing
 * struct hfsc_class from a pointer to this member. */
2829 struct tc_queue tc_queue;
/* Returns the struct hfsc that embeds 'netdev''s device-level 'tc' pointer
 * as its 'tc' member.  NOTE(review): only meaningful while the device's
 * current tc really is an HFSC instance -- callers must ensure that. */
2834 static struct hfsc *
2835 hfsc_get__(const struct netdev *netdev)
2837 struct netdev_dev_linux *netdev_dev;
2838 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2839 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
/* Returns the struct hfsc_class that embeds 'queue' as its 'tc_queue'
 * member. */
2842 static struct hfsc_class *
2843 hfsc_class_cast__(const struct tc_queue *queue)
2845 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates a new struct hfsc for 'netdev', initializes its embedded 'tc'
 * with tc_ops_hfsc, stores 'max_rate' in it, and makes it the device's
 * current tc. */
2849 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2851 struct netdev_dev_linux * netdev_dev;
2854 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2855 hfsc = xmalloc(sizeof *hfsc);
2856 tc_init(&hfsc->tc, &tc_ops_hfsc);
2857 hfsc->max_rate = max_rate;
2858 netdev_dev->tc = &hfsc->tc;
/* Records the settings in 'hc' for queue 'queue_id' in 'netdev''s in-memory
 * HFSC state.
 *
 * The queue is looked up with tc_find_queue__().  The code also has a path
 * that allocates a fresh hfsc_class and inserts it into hfsc->tc.queues
 * (presumably taken when the lookup finds nothing -- confirm against the
 * full source).  In either case min_rate and max_rate are then copied from
 * 'hc'. */
2862 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2863 const struct hfsc_class *hc)
2867 struct hfsc_class *hcp;
2868 struct tc_queue *queue;
2870 hfsc = hfsc_get__(netdev);
2871 hash = hash_int(queue_id, 0);
2873 queue = tc_find_queue__(netdev, queue_id, hash);
2875 hcp = hfsc_class_cast__(queue);
2877 hcp = xmalloc(sizeof *hcp);
2878 queue = &hcp->tc_queue;
2879 queue->queue_id = queue_id;
2880 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
2883 hcp->min_rate = hc->min_rate;
2884 hcp->max_rate = hc->max_rate;
/* Decodes the nested HFSC class attributes in 'nl_options' into 'class'.
 *
 * The TCA_HFSC_RSC, TCA_HFSC_FSC and TCA_HFSC_USC service curves are read.
 * Only the configuration that this file itself generates is accepted:
 *
 *    - every curve must be linear (m1 == 0 and d == 0);
 *    - the RSC and FSC curves must have the same m2;
 *    - the RSC m2 must not exceed the USC m2.
 *
 * Each violation is logged with a rate-limited warning.  On acceptance
 * min_rate is the FSC m2 and max_rate is the USC m2. */
2888 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2890 const struct tc_service_curve *rsc, *fsc, *usc;
2891 static const struct nl_policy tca_hfsc_policy[] = {
2893 .type = NL_A_UNSPEC,
2895 .min_len = sizeof(struct tc_service_curve),
2898 .type = NL_A_UNSPEC,
2900 .min_len = sizeof(struct tc_service_curve),
2903 .type = NL_A_UNSPEC,
2905 .min_len = sizeof(struct tc_service_curve),
2908 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
2910 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
2911 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
2912 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
2916 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
2917 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
2918 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
2920 if (rsc->m1 != 0 || rsc->d != 0 ||
2921 fsc->m1 != 0 || fsc->d != 0 ||
2922 usc->m1 != 0 || usc->d != 0) {
2923 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2924 "Non-linear service curves are not supported.");
2928 if (rsc->m2 != fsc->m2) {
2929 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2930 "Real-time service curves are not supported ");
2934 if (rsc->m2 > usc->m2) {
2935 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2936 "Min-rate service curve is greater than "
2937 "the max-rate service curve.");
2941 class->min_rate = fsc->m2;
2942 class->max_rate = usc->m2;
/* Parses the tc class message 'tcmsg' with tc_parse_class().
 *
 * A handle of the form 1:N with 1 <= N <= HFSC_N_QUEUES is translated to
 * queue ID N - 1 in '*queue_id', and the class's TCA_OPTIONS are decoded
 * into 'options' with hfsc_parse_tca_options__().  (The guards that make
 * these two steps conditional on nonnull output arguments are presumably
 * the same as in htb_parse_tcmsg__() -- confirm against the full source.)
 * 'stats' is handed to tc_parse_class() unchanged. */
2947 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2948 struct hfsc_class *options,
2949 struct netdev_queue_stats *stats)
2952 unsigned int handle;
2953 struct nlattr *nl_options;
2955 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2961 unsigned int major, minor;
2963 major = tc_get_major(handle);
2964 minor = tc_get_minor(handle);
2965 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
2966 *queue_id = minor - 1;
2973 error = hfsc_parse_tca_options__(nl_options, options);
/* Asks the kernel about class 'handle' (with parent 'parent') on 'netdev'
 * using tc_query_class(), then decodes the reply with hfsc_parse_tcmsg__()
 * into 'options' and 'stats'.  No queue ID is requested (NULL is passed).
 * The reply buffer is deleted once it has been parsed. */
2980 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
2981 unsigned int parent, struct hfsc_class *options,
2982 struct netdev_queue_stats *stats)
2985 struct ofpbuf *reply;
2987 error = tc_query_class(netdev, handle, parent, &reply);
2992 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
2993 ofpbuf_delete(reply);
2998 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
2999 struct hfsc_class *class)
3002 const char *max_rate_s;
3004 max_rate_s = shash_find_data(details, "max-rate");
3005 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3010 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
3011 max_rate = netdev_features_to_bps(current) / 8;
3014 class->min_rate = max_rate;
3015 class->max_rate = max_rate;
/* Fills 'class' with per-class HFSC settings taken from the "min-rate" and
 * "max-rate" keys of 'details'.
 *
 * Both strings are parsed as decimal numbers and divided by 8 (presumably
 * bits -> bytes; the getters multiply by 8 again).  min_rate is raised to at
 * least 1 and capped at the qdisc's max_rate; max_rate is raised to at least
 * min_rate and capped at the qdisc's max_rate. */
3019 hfsc_parse_class_details__(struct netdev *netdev,
3020 const struct shash *details,
3021 struct hfsc_class * class)
3023 const struct hfsc *hfsc;
3024 uint32_t min_rate, max_rate;
3025 const char *min_rate_s, *max_rate_s;
3027 hfsc = hfsc_get__(netdev);
3028 min_rate_s = shash_find_data(details, "min-rate");
3029 max_rate_s = shash_find_data(details, "max-rate");
3031 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3032 min_rate = MAX(min_rate, 1);
3033 min_rate = MIN(min_rate, hfsc->max_rate);
3035 max_rate = (max_rate_s
3036 ? strtoull(max_rate_s, NULL, 10) / 8
3038 max_rate = MAX(max_rate, min_rate);
3039 max_rate = MIN(max_rate, hfsc->max_rate);
3041 class->min_rate = min_rate;
3042 class->max_rate = max_rate;
/* Replaces 'netdev''s root qdisc with HFSC.
 *
 * Any existing root qdisc is removed first with tc_del_qdisc().  The request
 * is an RTM_NEWQDISC with NLM_F_EXCL | NLM_F_CREATE for handle 1:0 at
 * TC_H_ROOT, carrying kind "hfsc" and a struct tc_hfsc_qopt that starts out
 * zeroed.  Returns the result of tc_transact(). */
3047 /* Create an HFSC qdisc.
3049 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3051 hfsc_setup_qdisc__(struct netdev * netdev)
3053 struct tcmsg *tcmsg;
3054 struct ofpbuf request;
3055 struct tc_hfsc_qopt opt;
3057 tc_del_qdisc(netdev);
3059 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3060 NLM_F_EXCL | NLM_F_CREATE, &request);
3066 tcmsg->tcm_handle = tc_make_handle(1, 0);
3067 tcmsg->tcm_parent = TC_H_ROOT;
3069 memset(&opt, 0, sizeof opt);
3072 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3073 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3075 return tc_transact(&request, NULL);
/* Programs HFSC class 'handle' under 'parent' on 'netdev' from 'class'.
 *
 * The request is an RTM_NEWTCLASS with NLM_F_CREATE.  Two service curves
 * are built: 'min' with m2 = class->min_rate and 'max' with
 * m2 = class->max_rate.  'min' is sent as both the RSC and FSC curves and
 * 'max' as the USC curve, which is exactly the shape that
 * hfsc_parse_tca_options__() accepts when reading classes back.  A failed
 * transaction is reported with a rate-limited warning. */
3078 /* Create an HFSC class.
3080 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3081 * sc rate <min_rate> ul rate <max_rate>" */
3083 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3084 unsigned int parent, struct hfsc_class *class)
3088 struct tcmsg *tcmsg;
3089 struct ofpbuf request;
3090 struct tc_service_curve min, max;
3092 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3098 tcmsg->tcm_handle = handle;
3099 tcmsg->tcm_parent = parent;
3103 min.m2 = class->min_rate;
3107 max.m2 = class->max_rate;
3109 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3110 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3111 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3112 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3113 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3114 nl_msg_end_nested(&request, opt_offset);
3116 error = tc_transact(&request, NULL);
3118 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3119 "min-rate %ubps, max-rate %ubps (%s)",
3120 netdev_get_name(netdev),
3121 tc_get_major(handle), tc_get_minor(handle),
3122 tc_get_major(parent), tc_get_minor(parent),
3123 class->min_rate, class->max_rate, strerror(error));
/* Installs HFSC on 'netdev' according to 'details'.
 *
 * Sets up the qdisc with hfsc_setup_qdisc__(), programs class 1:fffe under
 * 1:0 with the qdisc-level settings parsed from 'details', and records the
 * resulting max_rate in the in-memory state with hfsc_install__(). */
3130 hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3133 struct hfsc_class class;
3135 error = hfsc_setup_qdisc__(netdev);
3141 hfsc_parse_qdisc_details__(netdev, details, &class);
3142 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3143 tc_make_handle(1, 0), &class);
3149 hfsc_install__(netdev, class.max_rate);
/* Builds the in-memory HFSC state for 'netdev' from what the kernel reports.
 *
 * Class 1:fffe is queried for the qdisc-wide max_rate, which is passed to
 * hfsc_install__().  Then the classes are dumped and every class message
 * that hfsc_parse_tcmsg__() accepts is recorded with hfsc_update_queue__().
 * 'nlmsg' is unused. */
3154 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3157 struct nl_dump dump;
3158 struct hfsc_class hc;
3161 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3162 hfsc_install__(netdev, hc.max_rate);
3164 if (!start_queue_dump(netdev, &dump)) {
3168 while (nl_dump_next(&dump, &msg)) {
3169 unsigned int queue_id;
3171 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3172 hfsc_update_queue__(netdev, queue_id, &hc);
3176 nl_dump_done(&dump);
/* Tears down the HFSC state that embeds 'tc'.
 *
 * Every queue is unlinked from hfsc->tc.queues.  The _SAFE iterator is used
 * because the current node is removed inside the loop body. */
3181 hfsc_tc_destroy(struct tc *tc)
3184 struct hfsc_class *hc, *next;
3186 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3188 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3189 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Reports the qdisc-wide HFSC configuration of 'netdev' by adding a
 * "max-rate" entry to 'details'.  The stored max_rate is multiplied by 8 and
 * formatted as a decimal string allocated with xasprintf(). */
3198 hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3200 const struct hfsc *hfsc;
3201 hfsc = hfsc_get__(netdev);
3202 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
/* Applies new qdisc-wide HFSC settings from 'details' to 'netdev'.
 *
 * The settings are parsed with hfsc_parse_qdisc_details__() and class 1:fffe
 * (parent 1:0) is reprogrammed with them; the in-memory max_rate is then
 * updated (presumably only when the class setup succeeded -- the guard is
 * not visible here). */
3207 hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3210 struct hfsc_class class;
3212 hfsc_parse_qdisc_details__(netdev, details, &class);
3213 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3214 tc_make_handle(1, 0), &class);
3217 hfsc_get__(netdev)->max_rate = class.max_rate;
/* Reports the configuration of HFSC class 'queue' in 'details'.
 *
 * "min-rate" is always added as 8 times the stored value; "max-rate" is
 * added only when it differs from min-rate.  'netdev' is unused. */
3224 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3225 const struct tc_queue *queue, struct shash *details)
3227 const struct hfsc_class *hc;
3229 hc = hfsc_class_cast__(queue);
3230 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3231 if (hc->min_rate != hc->max_rate) {
3232 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
/* Creates or reconfigures HFSC queue 'queue_id' on 'netdev' from 'details'.
 *
 * The details are parsed with hfsc_parse_class_details__(), kernel class
 * 1:(queue_id + 1) is set up under parent 1:fffe, and the in-memory queue is
 * updated with hfsc_update_queue__(). */
3238 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3239 const struct shash *details)
3242 struct hfsc_class class;
3244 error = hfsc_parse_class_details__(netdev, details, &class);
3249 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3250 tc_make_handle(1, 0xfffe), &class);
3255 hfsc_update_queue__(netdev, queue_id, &class);
/* Deletes HFSC class 'queue' from 'netdev'.
 *
 * Kernel class 1:(queue_id + 1) is deleted with tc_delete_class() and the
 * queue is unlinked from hfsc->tc.queues (presumably only when the kernel
 * deletion succeeded -- the guard is not visible here). */
3260 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3264 struct hfsc_class *hc;
3266 hc = hfsc_class_cast__(queue);
3267 hfsc = hfsc_get__(netdev);
3269 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3271 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches statistics for HFSC class 'queue' into 'stats' by querying kernel
 * class 1:(queue_id + 1) under parent 1:fffe.  Class options are not
 * requested (NULL is passed).  Returns hfsc_query_class__()'s result. */
3278 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3279 struct netdev_queue_stats *stats)
3281 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3282 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one class message 'nlmsg' from a queue statistics dump.
 *
 * The handle and statistics are extracted with tc_parse_class().  If the
 * handle is 1:N with 1 <= N <= HFSC_N_QUEUES, 'cb' is invoked with queue ID
 * N - 1, the statistics, and 'aux'; other handles are not reported.
 * 'netdev' is unused. */
3286 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3287 const struct ofpbuf *nlmsg,
3288 netdev_dump_queue_stats_cb *cb, void *aux)
3290 struct netdev_queue_stats stats;
3291 unsigned int handle, major, minor;
3294 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3299 major = tc_get_major(handle);
3300 minor = tc_get_minor(handle);
3301 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3302 (*cb)(minor - 1, &stats, aux);
/* Operations table for the HFSC qdisc: kernel qdisc name "hfsc", OVS QoS
 * type "linux-hfsc", supporting up to HFSC_N_QUEUES queues.  Every
 * operation is implemented. */
3307 static const struct tc_ops tc_ops_hfsc = {
3308 "hfsc", /* linux_name */
3309 "linux-hfsc", /* ovs_name */
3310 HFSC_N_QUEUES, /* n_queues */
3311 hfsc_tc_install, /* tc_install */
3312 hfsc_tc_load, /* tc_load */
3313 hfsc_tc_destroy, /* tc_destroy */
3314 hfsc_qdisc_get, /* qdisc_get */
3315 hfsc_qdisc_set, /* qdisc_set */
3316 hfsc_class_get, /* class_get */
3317 hfsc_class_set, /* class_set */
3318 hfsc_class_delete, /* class_delete */
3319 hfsc_class_get_stats, /* class_get_stats */
3320 hfsc_class_dump_stats /* class_dump_stats */
3323 /* "linux-default" traffic control class.
3325 * This class represents the default, unnamed Linux qdisc. It corresponds to
3326 * the "" (empty string) QoS type in the OVS database. */
/* Makes 'netdev''s device use a 'struct tc' initialized with tc_ops_default.
 *
 * 'tc' has static storage duration, so the object is shared between calls
 * rather than owned by any one device (the allocation is presumably done
 * only on first use -- the guard is not visible here).  This fits with
 * tc_ops_default having no tc_destroy callback. */
3329 default_install__(struct netdev *netdev)
3331 struct netdev_dev_linux *netdev_dev =
3332 netdev_dev_linux_cast(netdev_get_dev(netdev));
3333 static struct tc *tc;
3336 tc = xmalloc(sizeof *tc);
3337 tc_init(tc, &tc_ops_default);
3339 netdev_dev->tc = tc;
/* tc_install implementation for the default qdisc: only attaches the
 * in-memory state with default_install__().  'details' is unused. */
3343 default_tc_install(struct netdev *netdev,
3344 const struct shash *details OVS_UNUSED)
3346 default_install__(netdev);
/* tc_load implementation for the default qdisc: only attaches the in-memory
 * state with default_install__().  'nlmsg' is unused. */
3351 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3353 default_install__(netdev);
/* Operations table for the default qdisc.  It has no kernel qdisc name, and
 * every operation from tc_destroy onward is NULL: the default qdisc has no
 * configurable parameters or queues. */
3357 static const struct tc_ops tc_ops_default = {
3358 NULL, /* linux_name */
3363 NULL, /* tc_destroy */
3364 NULL, /* qdisc_get */
3365 NULL, /* qdisc_set */
3366 NULL, /* class_get */
3367 NULL, /* class_set */
3368 NULL, /* class_delete */
3369 NULL, /* class_get_stats */
3370 NULL /* class_dump_stats */
3373 /* "linux-other" traffic control class. */
/* tc_load implementation for qdiscs that this file does not understand.
 *
 * Makes 'netdev''s device use a 'struct tc' initialized with tc_ops_other.
 * 'tc' has static storage duration, so the object is shared between calls
 * (the allocation is presumably done only on first use -- the guard is not
 * visible here).  'nlmsg' is unused. */
3378 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3380 struct netdev_dev_linux *netdev_dev =
3381 netdev_dev_linux_cast(netdev_get_dev(netdev));
3382 static struct tc *tc;
3385 tc = xmalloc(sizeof *tc);
3386 tc_init(tc, &tc_ops_other);
3388 netdev_dev->tc = tc;
/* Operations table for qdiscs of an unrecognized kind, exposed as OVS QoS
 * type "linux-other".  It has no kernel qdisc name and cannot be installed
 * (tc_install is NULL); it can only be loaded to represent whatever is
 * already configured.  All query and class operations are NULL. */
3392 static const struct tc_ops tc_ops_other = {
3393 NULL, /* linux_name */
3394 "linux-other", /* ovs_name */
3396 NULL, /* tc_install */
3398 NULL, /* tc_destroy */
3399 NULL, /* qdisc_get */
3400 NULL, /* qdisc_set */
3401 NULL, /* class_get */
3402 NULL, /* class_set */
3403 NULL, /* class_delete */
3404 NULL, /* class_get_stats */
3405 NULL /* class_dump_stats */
3408 /* Traffic control. */
3410 /* Number of kernel "tc" ticks per second. */
/* Derived from /proc/net/psched as a * c / b, where a, b and c are the first
 * three fields of that file; used by tc_ticks_to_bytes() and
 * tc_bytes_to_ticks(). */
3411 static double ticks_per_s;
3413 /* Number of kernel "jiffies" per second. This is used for the purpose of
3414 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3415 * one jiffy's worth of data.
3417 * There are two possibilities here:
3419 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3420 * approximate range of 100 to 1024. That means that we really need to
3421 * make sure that the qdisc can buffer that much data.
3423 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3424 * has finely granular timers and there's no need to fudge additional room
3425 * for buffers. (There's no extra effort needed to implement that: the
3426 * large 'buffer_hz' is used as a divisor, so practically any number will
3427 * come out as 0 in the division. Small integer results in the case of
3428 * really high dividends won't have any real effect anyhow.) */
/* Divisor applied to a byte rate by tc_buffer_per_jiffy(); logged together
 * with ticks_per_s once /proc/net/psched has been read. */
3430 static unsigned int buffer_hz;
3432 /* Returns tc handle 'major':'minor'. */
/* 'major' is shifted into the upper 16 bits before the two halves are
 * combined with TC_H_MAKE(). */
3434 tc_make_handle(unsigned int major, unsigned int minor)
3436 return TC_H_MAKE(major << 16, minor);
3439 /* Returns the major number from 'handle'. */
/* TC_H_MAJ() leaves the major part in the upper 16 bits, so it is shifted
 * down to yield a plain number; this is the inverse of tc_make_handle(). */
3441 tc_get_major(unsigned int handle)
3443 return TC_H_MAJ(handle) >> 16;
3446 /* Returns the minor number from 'handle'. */
/* Thin wrapper around TC_H_MIN(); no shift is applied, unlike in
 * tc_get_major(). */
3448 tc_get_minor(unsigned int handle)
3450 return TC_H_MIN(handle);
/* Starts a traffic-control netlink request for 'netdev' in 'request'.
 *
 * 'request' is initialized with room for 512 bytes and receives a netlink
 * header of the given 'type' with NLM_F_REQUEST | 'flags', followed by a
 * zero-filled struct tcmsg whose family is AF_UNSPEC and whose ifindex is
 * that of 'netdev' (looked up with get_ifindex()).  The caller must fill in
 * tcm_handle and tcm_parent and append any attributes.
 *
 * Returns a pointer to a struct tcmsg (presumably the one inside 'request',
 * or NULL when the ifindex lookup fails -- confirm against the full
 * source). */
3453 static struct tcmsg *
3454 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3455 struct ofpbuf *request)
3457 struct tcmsg *tcmsg;
3461 error = get_ifindex(netdev, &ifindex);
3466 ofpbuf_init(request, 512);
3467 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3468 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3469 tcmsg->tcm_family = AF_UNSPEC;
3470 tcmsg->tcm_ifindex = ifindex;
3471 /* Caller should fill in tcmsg->tcm_handle. */
3472 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on rtnl_sock with nl_sock_transact(), passing 'replyp'
 * through for the reply, and then uninitializes 'request' regardless of the
 * outcome, so callers must not reuse or free it afterward. */
3478 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3480 int error = nl_sock_transact(rtnl_sock, request, replyp);
3481 ofpbuf_uninit(request);
3485 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3486 * policing configuration.
3488 * This function is equivalent to running the following when 'add' is true:
3489 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3491 * This function is equivalent to running the following when 'add' is false:
3492 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3494 * The configuration and stats may be seen with the following command:
3495 * /sbin/tc -s qdisc show dev <devname>
3497 * Returns 0 if successful, otherwise a positive errno value. */
/* Builds and sends the ingress qdisc request for 'netdev'.
 *
 * 'add' selects RTM_NEWQDISC with NLM_F_EXCL | NLM_F_CREATE; otherwise
 * RTM_DELQDISC with no extra flags is used.  The qdisc is addressed as
 * handle ffff:0 with parent TC_H_INGRESS, kind "ingress", and an empty
 * TCA_OPTIONS attribute.  When deleting, ENOENT and EINVAL from the kernel
 * get special treatment (see the comment at that check). */
3500 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3502 struct ofpbuf request;
3503 struct tcmsg *tcmsg;
3505 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3506 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3508 tcmsg = tc_make_request(netdev, type, flags, &request);
3512 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3513 tcmsg->tcm_parent = TC_H_INGRESS;
3514 nl_msg_put_string(&request, TCA_KIND, "ingress");
3515 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3517 error = tc_transact(&request, NULL);
3519 /* If we're deleting the qdisc, don't worry about some of the
3520 * error conditions. */
3521 if (!add && (error == ENOENT || error == EINVAL)) {
3530 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3533 * This function is equivalent to running:
3534 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3535 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3538 * The configuration and stats may be seen with the following command:
3539 * /sbin/tc -s filter show <devname> eth0 parent ffff:
3541 * Returns 0 if successful, otherwise a positive errno value.
3544 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3546 struct tc_police tc_police;
3547 struct ofpbuf request;
3548 struct tcmsg *tcmsg;
3549 size_t basic_offset;
3550 size_t police_offset;
3554 memset(&tc_police, 0, sizeof tc_police);
3555 tc_police.action = TC_POLICE_SHOT;
3556 tc_police.mtu = mtu;
3557 tc_fill_rate(&tc_police.rate, kbits_rate/8 * 1000, mtu);
3558 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3559 kbits_burst * 1024);
3561 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3562 NLM_F_EXCL | NLM_F_CREATE, &request);
3566 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3567 tcmsg->tcm_info = tc_make_handle(49,
3568 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3570 nl_msg_put_string(&request, TCA_KIND, "basic");
3571 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3572 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3573 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3574 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3575 nl_msg_end_nested(&request, police_offset);
3576 nl_msg_end_nested(&request, basic_offset);
3578 error = tc_transact(&request, NULL);
/* Reads the kernel packet scheduler's timing parameters from
 * /proc/net/psched and derives 'ticks_per_s' (and 'buffer_hz') from them.
 *
 * The file is expected to hold four hexadecimal fields, referred to below as
 * a, b, c and d.  Failure to open or read the file, invalid parameters, and
 * unexpected parameters are each reported with VLOG_WARN; the values read
 * and the derived results are logged at debug level. */
3589 /* The values in psched are not individually very meaningful, but they are
3590 * important. The tables below show some values seen in the wild.
3594 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3595 * (Before that, there are hints that it was 1000000000.)
3597 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3601 * -----------------------------------
3602 * [1] 000c8000 000f4240 000f4240 00000064
3603 * [2] 000003e8 00000400 000f4240 3b9aca00
3604 * [3] 000003e8 00000400 000f4240 3b9aca00
3605 * [4] 000003e8 00000400 000f4240 00000064
3606 * [5] 000003e8 00000040 000f4240 3b9aca00
3607 * [6] 000003e8 00000040 000f4240 000000f9
3609 * a b c d ticks_per_s buffer_hz
3610 * ------- --------- ---------- ------------- ----------- -------------
3611 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3612 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3613 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3614 * [4] 1,000 1,024 1,000,000 100 976,562 100
3615 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3616 * [6] 1,000 64 1,000,000 249 15,625,000 249
3618 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3619 * [2] 2.6.26-1-686-bigmem from Debian lenny
3620 * [3] 2.6.26-2-sparc64 from Debian lenny
3621 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3622 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3623 * [6] 2.6.34 from kernel.org on KVM
3625 static const char fn[] = "/proc/net/psched";
3626 unsigned int a, b, c, d;
3632 stream = fopen(fn, "r");
3634 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3638 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3639 VLOG_WARN("%s: read failed", fn);
3643 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3647 VLOG_WARN("%s: invalid scheduler parameters", fn);
3651 ticks_per_s = (double) a * c / b;
3655 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3658 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3661 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3662 * rate of 'rate' bytes per second. */
3664 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3669 return (rate * ticks) / ticks_per_s;
3672 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3673 * rate of 'rate' bytes per second. */
/* A 'rate' of zero yields 0 rather than a division by zero.  'ticks_per_s'
 * is truncated to an integer and the product is formed as
 * unsigned long long, so it is a 64-bit integer computation. */
3675 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3680 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3683 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3684 * a transmission rate of 'rate' bytes per second. */
/* Integer division by 'buffer_hz': a very large 'buffer_hz' makes the result
 * 0, which is the intended outcome on kernels with fine-grained timers.
 * NOTE(review): 'buffer_hz' must be nonzero by the time the division runs --
 * confirm that it is always initialized first. */
3686 tc_buffer_per_jiffy(unsigned int rate)
3691 return rate / buffer_hz;
/* Policy used: TCA_KIND is a mandatory string and TCA_OPTIONS an optional
 * nested attribute, both looked up after the netlink header and the
 * struct tcmsg.  A message that does not satisfy the policy is reported
 * with a rate-limited warning. */
3694 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3695 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3696 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3697 * stores NULL into it if it is absent.
3699 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3702 * Returns 0 if successful, otherwise a positive errno value. */
3704 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3705 struct nlattr **options)
3707 static const struct nl_policy tca_policy[] = {
3708 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3709 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3711 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3713 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3714 tca_policy, ta, ARRAY_SIZE(ta))) {
3715 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3720 *kind = nl_attr_get_string(ta[TCA_KIND]);
3724 *options = ta[TCA_OPTIONS];
/* Details of the parsing below:
 *
 *    - TCA_OPTIONS and TCA_STATS2 are both mandatory nested attributes, so a
 *      class message lacking either one is rejected as a whole.
 *    - '*handlep' receives the raw tcm_handle from the struct tcmsg; callers
 *      split it with tc_get_major() and tc_get_minor().
 *    - Within TCA_STATS2, TCA_STATS_BASIC supplies tx_bytes and tx_packets
 *      and the 'drops' counter of TCA_STATS_QUEUE supplies tx_errors.
 *    - The function also contains a memset() that zeroes '*stats'
 *      (presumably on a failure path -- confirm against the full source). */
3739 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3740 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3741 * into '*options', and its queue statistics into '*stats'. Any of the output
3742 * arguments may be null.
3744 * Returns 0 if successful, otherwise a positive errno value. */
3746 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3747 struct nlattr **options, struct netdev_queue_stats *stats)
3749 static const struct nl_policy tca_policy[] = {
3750 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3751 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3753 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3755 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3756 tca_policy, ta, ARRAY_SIZE(ta))) {
3757 VLOG_WARN_RL(&rl, "failed to parse class message");
3762 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3763 *handlep = tc->tcm_handle;
3767 *options = ta[TCA_OPTIONS];
3771 const struct gnet_stats_queue *gsq;
3772 struct gnet_stats_basic gsb;
3774 static const struct nl_policy stats_policy[] = {
3775 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3776 .min_len = sizeof gsb },
3777 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3778 .min_len = sizeof *gsq },
3780 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3782 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3783 sa, ARRAY_SIZE(sa))) {
3784 VLOG_WARN_RL(&rl, "failed to parse class stats");
3788 /* Alignment issues screw up the length of struct gnet_stats_basic on
3789 * some arch/bitsize combinations. Newer versions of Linux have a
3790 * struct gnet_stats_basic_packed, but we can't depend on that. The
3791 * easiest thing to do is just to make a copy. */
3792 memset(&gsb, 0, sizeof gsb);
3793 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3794 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3795 stats->tx_bytes = gsb.bytes;
3796 stats->tx_packets = gsb.packets;
3798 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3799 stats->tx_errors = gsq->drops;
3809 memset(stats, 0, sizeof *stats);
3814 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
 * on 'netdev', passing 'replyp' to tc_transact() to receive the reply. */
/* Builds an RTM_GETTCLASS request with NLM_F_ECHO for class 'handle' under
 * 'parent' on 'netdev' and transacts it, passing 'replyp' through for the
 * reply.  A failed query is reported with a rate-limited warning that names
 * the device, the class and its parent. */
3817 tc_query_class(const struct netdev *netdev,
3818 unsigned int handle, unsigned int parent,
3819 struct ofpbuf **replyp)
3821 struct ofpbuf request;
3822 struct tcmsg *tcmsg;
3825 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3829 tcmsg->tcm_handle = handle;
3830 tcmsg->tcm_parent = parent;
3832 error = tc_transact(&request, replyp);
3834 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3835 netdev_get_name(netdev),
3836 tc_get_major(handle), tc_get_minor(handle),
3837 tc_get_major(parent), tc_get_minor(parent),
/* Builds an RTM_DELTCLASS request (no extra flags) for class 'handle' on
 * 'netdev', with the parent left as 0, and transacts it without asking for
 * a reply.  A failed deletion is reported with a rate-limited warning that
 * names the device and the class. */
3843 /* Equivalent to "tc class del dev <name> handle <handle>". */
3845 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3847 struct ofpbuf request;
3848 struct tcmsg *tcmsg;
3851 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3855 tcmsg->tcm_handle = handle;
3856 tcmsg->tcm_parent = 0;
3858 error = tc_transact(&request, NULL);
3860 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3861 netdev_get_name(netdev),
3862 tc_get_major(handle), tc_get_minor(handle),
/* Removes 'netdev''s root qdisc (handle 1:0, parent TC_H_ROOT) with an
 * RTM_DELQDISC request.
 *
 * EINVAL from the kernel gets special treatment (see the comment at that
 * check).  When the outcome counts as success and the device has in-memory
 * tc state, that state's tc_destroy callback is invoked if it has one and
 * the device's 'tc' pointer is cleared, so that the qdisc is re-queried on
 * next use. */
3868 /* Equivalent to "tc qdisc del dev <name> root". */
3870 tc_del_qdisc(struct netdev *netdev)
3872 struct netdev_dev_linux *netdev_dev =
3873 netdev_dev_linux_cast(netdev_get_dev(netdev));
3874 struct ofpbuf request;
3875 struct tcmsg *tcmsg;
3878 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
3882 tcmsg->tcm_handle = tc_make_handle(1, 0);
3883 tcmsg->tcm_parent = TC_H_ROOT;
3885 error = tc_transact(&request, NULL);
3886 if (error == EINVAL) {
3887 /* EINVAL probably means that the default qdisc was in use, in which
3888 * case we've accomplished our purpose. */
3891 if (!error && netdev_dev->tc) {
3892 if (netdev_dev->tc->ops->tc_destroy) {
3893 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3895 netdev_dev->tc = NULL;
/* Summary of how the tc implementation is chosen below:
 *
 *    - Nothing is done when the device already has in-memory tc state.
 *    - Otherwise the kernel is asked for the qdisc with handle 1:0.
 *    - If that succeeds, the qdisc kind is parsed and looked up with
 *      tc_lookup_linux_name(); tc_ops_other is used when the message cannot
 *      be parsed or the kind is unknown (the latter is logged at INFO level
 *      with its own rate limiter).
 *    - ENOENT selects tc_ops_default.
 *    - Any other error is logged and selects tc_ops_other.
 *
 * The chosen implementation's tc_load callback is then invoked; the assert
 * checks that the device has tc state exactly when loading succeeded.  The
 * query error takes precedence over the load error in the return value. */
3900 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3901 * kernel to determine what they are. Returns 0 if successful, otherwise a
3902 * positive errno value. */
3904 tc_query_qdisc(const struct netdev *netdev)
3906 struct netdev_dev_linux *netdev_dev =
3907 netdev_dev_linux_cast(netdev_get_dev(netdev));
3908 struct ofpbuf request, *qdisc;
3909 const struct tc_ops *ops;
3910 struct tcmsg *tcmsg;
3914 if (netdev_dev->tc) {
3918 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3919 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3920 * 2.6.35 without that fix backported to it.
3922 * To avoid the OOPS, we must not make a request that would attempt to dump
3923 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3924 * few others. There are a few ways that I can see to do this, but most of
3925 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3926 * technique chosen here is to assume that any non-default qdisc that we
3927 * create will have a class with handle 1:0. The built-in qdiscs only have
3928 * a class with handle 0:0.
3930 * We could check for Linux 2.6.35+ and use a more straightforward method
3932 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
3936 tcmsg->tcm_handle = tc_make_handle(1, 0);
3937 tcmsg->tcm_parent = 0;
3939 /* Figure out what tc class to instantiate. */
3940 error = tc_transact(&request, &qdisc);
3944 error = tc_parse_qdisc(qdisc, &kind, NULL);
3946 ops = &tc_ops_other;
3948 ops = tc_lookup_linux_name(kind);
3950 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3951 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3953 ops = &tc_ops_other;
3956 } else if (error == ENOENT) {
3957 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3958 * other entity that doesn't have a handle 1:0. We will assume
3959 * that it's the system default qdisc. */
3960 ops = &tc_ops_default;
3963 /* Who knows? Maybe the device got deleted. */
3964 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3965 netdev_get_name(netdev), strerror(error));
3966 ops = &tc_ops_other;
3969 /* Instantiate it. */
3970 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
3971 assert((load_error == 0) == (netdev_dev->tc != NULL));
3972 ofpbuf_delete(qdisc);
3974 return error ? error : load_error;
3977 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3978 approximate the time to transmit packets of various lengths. For an MTU of
3979 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3980 represents two possible packet lengths; for a MTU of 513 through 1024, four
3981 possible lengths; and so on.
3983 Returns, for the specified 'mtu', the number of bits that packet lengths
3984 need to be shifted right to fit within such a 256-entry table. */
/* The Ethernet and VLAN header lengths are added to 'mtu' before the shift
 * count is computed, so 'mtu' itself is treated as a payload size. */
3986 tc_calc_cell_log(unsigned int mtu)
/* Presumably the fallback used when no usable 'mtu' was supplied -- TODO
 * confirm the guarding condition. */
3991 mtu = ETH_PAYLOAD_MAX;
/* Account for the Ethernet and VLAN headers on top of the payload. */
3993 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
/* 'cell_log' grows by one per iteration while the size is 256 or more. */
3995 for (cell_log = 0; mtu >= 256; cell_log++) {
4002 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
 * of 'mtu'. */
/* Populates '*rate' starting from an all-zero structure: 'cell_log' is derived
 * from 'mtu' by tc_calc_cell_log() and the minimum packet unit 'mpu' is set to
 * ETH_TOTAL_MIN. */
4005 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
/* Zero everything first so that members not assigned below read as 0. */
4007 memset(rate, 0, sizeof *rate);
4008 rate->cell_log = tc_calc_cell_log(mtu);
/* The next two assignments are deliberately commented out; the members keep
 * the 0 written by memset() above. */
4009 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4010 /* rate->cell_align = 0; */ /* distro headers. */
4011 rate->mpu = ETH_TOTAL_MIN;
4015 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4016 * attribute of the specified "type".
4018 * See tc_calc_cell_log() above for a description of "rtab"s. */
/* Entry 'i' is tc_bytes_to_ticks() for a packet of (i + 1) << cell_log bytes,
 * with the packet size raised to the rate's 'mpu' when it would be smaller. */
4020 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
/* Reserve TC_RTAB_SIZE bytes of attribute payload; the loop fills them in. */
4025 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4026 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4027 unsigned packet_size = (i + 1) << rate->cell_log;
/* Sizes below 'mpu' are charged as if they were 'mpu' bytes long. */
4028 if (packet_size < rate->mpu) {
4029 packet_size = rate->mpu;
4031 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
4035 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
4036 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
4037 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
 * 0 selects the minimum burst computed from 'Bps' and 'mtu'.) */
/* Returns tc_bytes_to_ticks() applied to 'Bps' and the larger of 'burst_bytes'
 * and a minimum burst, where the minimum is tc_buffer_per_jiffy(Bps) plus
 * 'mtu'. */
4040 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
/* Lower bound on the burst: the per-jiffy buffer for this rate plus 'mtu'. */
4042 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
/* A 'burst_bytes' smaller than the lower bound is replaced by the bound. */
4043 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
4046 /* Linux-only functions declared in netdev-linux.h */
4048 /* Returns a fd for an AF_INET socket or a negative errno value. */
/* netdev_linux_init() is called first.  A nonzero result from it is negated
 * and returned; otherwise the value of 'af_inet_sock' is returned. */
4050 netdev_linux_get_af_inet_sock(void)
4052 int error = netdev_linux_init();
4053 return error ? -error : af_inet_sock;
4056 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4057 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
/* Works in three steps: read the current flags with ETHTOOL_GFLAGS, write the
 * updated flags with ETHTOOL_SFLAGS, then read the flags back and log a
 * rate-limited warning when they differ from what was written.  'flag_name'
 * is used only in that warning. */
4059 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4060 const char *flag_name, bool enable)
4062 const char *netdev_name = netdev_get_name(netdev);
4063 struct ethtool_value evalue;
/* Step 1: fetch the current flags.  netdev_linux_do_ethtool() takes a
 * struct ethtool_cmd pointer, so the ethtool_value is passed through a cast. */
4067 memset(&evalue, 0, sizeof evalue);
4068 error = netdev_linux_do_ethtool(netdev_name,
4069 (struct ethtool_cmd *)&evalue,
4070 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* Step 2: set or clear 'flag' while keeping every other bit, remember the
 * requested value in 'new_flags', and write it back. */
4075 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4076 error = netdev_linux_do_ethtool(netdev_name,
4077 (struct ethtool_cmd *)&evalue,
4078 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
/* Step 3: read the flags again to check whether the change took effect. */
4083 memset(&evalue, 0, sizeof evalue);
4084 error = netdev_linux_do_ethtool(netdev_name,
4085 (struct ethtool_cmd *)&evalue,
4086 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* A mismatch means the device did not accept the requested flag value. */
4091 if (new_flags != evalue.data) {
4092 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4093 "device %s failed", enable ? "enable" : "disable",
4094 flag_name, netdev_name);
4101 /* Utility functions. */
4103 /* Copies 'src' into 'dst', performing format conversion in the process. */
/* Each counter listed below is assigned from 'src' to the member of 'dst' that
 * has the same name; any width or type conversion is whatever plain assignment
 * between the two member types performs. */
4105 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4106 const struct rtnl_link_stats *src)
/* Packet, byte, error and drop totals for each direction. */
4108 dst->rx_packets = src->rx_packets;
4109 dst->tx_packets = src->tx_packets;
4110 dst->rx_bytes = src->rx_bytes;
4111 dst->tx_bytes = src->tx_bytes;
4112 dst->rx_errors = src->rx_errors;
4113 dst->tx_errors = src->tx_errors;
4114 dst->rx_dropped = src->rx_dropped;
4115 dst->tx_dropped = src->tx_dropped;
4116 dst->multicast = src->multicast;
4117 dst->collisions = src->collisions;
/* Detailed receive error counters. */
4118 dst->rx_length_errors = src->rx_length_errors;
4119 dst->rx_over_errors = src->rx_over_errors;
4120 dst->rx_crc_errors = src->rx_crc_errors;
4121 dst->rx_frame_errors = src->rx_frame_errors;
4122 dst->rx_fifo_errors = src->rx_fifo_errors;
4123 dst->rx_missed_errors = src->rx_missed_errors;
/* Detailed transmit error counters. */
4124 dst->tx_aborted_errors = src->tx_aborted_errors;
4125 dst->tx_carrier_errors = src->tx_carrier_errors;
4126 dst->tx_fifo_errors = src->tx_fifo_errors;
4127 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4128 dst->tx_window_errors = src->tx_window_errors;
/* Fetches statistics for the interface with index 'ifindex' by sending an
 * RTM_GETLINK request on 'rtnl_sock' and converting the IFLA_STATS attribute
 * of the reply into '*stats'.  A reply that parses but carries no IFLA_STATS
 * attribute is logged with a rate-limited warning. */
4132 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4134 /* Policy for RTNLGRP_LINK messages.
4136 * There are *many* more fields in these messages, but currently we only
4137 * care about these fields. */
4138 static const struct nl_policy rtnlgrp_link_policy[] = {
4139 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4140 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4141 .min_len = sizeof(struct rtnl_link_stats) },
4144 struct ofpbuf request;
4145 struct ofpbuf *reply;
4146 struct ifinfomsg *ifi;
4147 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
/* Build the request: a netlink header followed by a zeroed ifinfomsg that
 * selects the interface by its index. */
4150 ofpbuf_init(&request, 0);
4151 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4152 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4153 ifi->ifi_family = PF_UNSPEC;
4154 ifi->ifi_index = ifindex;
/* The request buffer is released right after the transaction. */
4155 error = nl_sock_transact(rtnl_sock, &request, &reply);
4156 ofpbuf_uninit(&request);
/* Attribute parsing starts after the netlink header and the ifinfomsg. */
4161 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4162 rtnlgrp_link_policy,
4163 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4164 ofpbuf_delete(reply);
/* IFLA_STATS is optional in the policy, so its presence is checked here. */
4168 if (!attrs[IFLA_STATS]) {
4169 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4170 ofpbuf_delete(reply);
/* Convert the attribute payload into '*stats'. */
4174 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4176 ofpbuf_delete(reply);
/* Fills '*stats' for the device named 'netdev_name' by reading the file
 * /proc/net/dev line by line.  For the matching device, the counters listed
 * in the final branch are set to UINT64_MAX (presumably marking them as not
 * available from this source -- TODO confirm).  Rate-limited warnings are
 * logged when the file cannot be opened, when a line does not parse, and when
 * no statistics are found for the device. */
4182 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4184 static const char fn[] = "/proc/net/dev";
4189 stream = fopen(fn, "r");
4191 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4196 while (fgets(line, sizeof line, stream)) {
/* X64 is the scanf conversion for one uint64_t counter.  A successful parse
 * assigns 15 items. */
4199 #define X64 "%"SCNu64
4202 X64 X64 X64 X64 X64 X64 X64 "%*u"
4203 X64 X64 X64 X64 X64 X64 X64 "%*u",
4209 &stats->rx_fifo_errors,
4210 &stats->rx_frame_errors,
4216 &stats->tx_fifo_errors,
4218 &stats->tx_carrier_errors) != 15) {
4219 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4220 } else if (!strcmp(devname, netdev_name)) {
4221 stats->rx_length_errors = UINT64_MAX;
4222 stats->rx_over_errors = UINT64_MAX;
4223 stats->rx_crc_errors = UINT64_MAX;
4224 stats->rx_missed_errors = UINT64_MAX;
4225 stats->tx_aborted_errors = UINT64_MAX;
4226 stats->tx_heartbeat_errors = UINT64_MAX;
4227 stats->tx_window_errors = UINT64_MAX;
4233 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Reads the interface flags of 'dev' with the SIOCGIFFLAGS ioctl, issued
 * through netdev_linux_do_ioctl() under the device's name, and stores the
 * 'ifr_flags' value in '*flags'. */
4239 get_flags(const struct netdev_dev *dev, unsigned int *flags)
4245 error = netdev_linux_do_ioctl(dev->name, &ifr, SIOCGIFFLAGS,
4248 *flags = ifr.ifr_flags;
/* Sets the interface flags of 'netdev' to 'flags' with the SIOCSIFFLAGS ioctl
 * and returns the result of netdev_linux_do_ioctl(). */
4254 set_flags(struct netdev *netdev, unsigned int flags)
4258 ifr.ifr_flags = flags;
4259 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
/* Looks up the interface index of the device named 'netdev_name' with the
 * SIOCGIFINDEX ioctl on 'af_inet_sock'.  A failing ioctl is logged with a
 * rate-limited warning; otherwise the 'ifr_ifindex' value filled in by the
 * ioctl is returned. */
4264 do_get_ifindex(const char *netdev_name)
/* The device is selected by name; ovs_strzcpy() is given the size of the
 * 'ifr_name' buffer as its bound. */
4268 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4269 COVERAGE_INC(netdev_get_ifindex);
4270 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4271 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4272 netdev_name, strerror(errno));
4275 return ifr.ifr_ifindex;
/* Stores the interface index of 'netdev_' in '*ifindexp'.  The index cached
 * on the device is used when VALID_IFINDEX is set in 'cache_valid'; otherwise
 * it is looked up with do_get_ifindex() and cached. */
4279 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4281 struct netdev_dev_linux *netdev_dev =
4282 netdev_dev_linux_cast(netdev_get_dev(netdev_));
/* Cache miss: ask the kernel and remember the answer. */
4284 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4285 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4289 netdev_dev->cache_valid |= VALID_IFINDEX;
4290 netdev_dev->ifindex = ifindex;
4292 *ifindexp = netdev_dev->ifindex;
/* Reads the hardware address of the device named 'netdev_name' with the
 * SIOCGIFHWADDR ioctl on 'af_inet_sock' and copies its first ETH_ADDR_LEN
 * bytes into 'ea'.  A failing ioctl is logged at INFO level for ENODEV and at
 * ERR level otherwise; an unexpected address family is logged as a warning. */
4297 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4302 memset(&ifr, 0, sizeof ifr);
4303 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4304 COVERAGE_INC(netdev_get_hwaddr);
4305 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4306 /* ENODEV probably means that a vif disappeared asynchronously and
4307 * hasn't been removed from the database yet, so reduce the log level
4308 * to INFO for that case. */
4309 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4310 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4311 netdev_name, strerror(errno));
/* NOTE(review): netdevice(7) documents 'sa_family' here as an ARPHRD_xxx
 * device type, so the AF_UNSPEC comparison relies on that constant's numeric
 * value -- confirm this is intended. */
4314 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4315 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4316 VLOG_WARN("%s device has unknown hardware address family %d",
4317 netdev_name, hwaddr_family);
4319 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware address of the device named 'netdev_name' to the
 * ETH_ADDR_LEN bytes in 'mac', tagged with address family 'hwaddr_family',
 * using the SIOCSIFHWADDR ioctl on 'af_inet_sock'.  A failing ioctl is logged
 * at ERR level. */
4324 set_etheraddr(const char *netdev_name, int hwaddr_family,
4325 const uint8_t mac[ETH_ADDR_LEN])
/* Zero the request, then fill in the device name, family and address. */
4329 memset(&ifr, 0, sizeof ifr);
4330 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4331 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4332 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4333 COVERAGE_INC(netdev_set_hwaddr);
4334 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4335 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4336 netdev_name, strerror(errno));
/* Runs an ethtool operation on the device named 'name' by passing 'ecmd' to
 * the SIOCETHTOOL ioctl on 'af_inet_sock'.  'cmd_name' is used only in the
 * log message.  'cmd' is presumably the ethtool command code placed in 'ecmd'
 * before the ioctl -- TODO confirm.  A failure other than EOPNOTSUPP is
 * logged with a rate-limited warning. */
4343 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4344 int cmd, const char *cmd_name)
/* The ioctl receives the ethtool structure through the 'ifr_data' pointer. */
4348 memset(&ifr, 0, sizeof ifr);
4349 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4350 ifr.ifr_data = (caddr_t) ecmd;
4353 COVERAGE_INC(netdev_ethtool);
4354 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4357 if (errno != EOPNOTSUPP) {
4358 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4359 "failed: %s", cmd_name, name, strerror(errno));
4361 /* The device doesn't support this operation. That's pretty
4362 * common, so there's no point in logging anything. */
/* Copies 'name' into 'ifr->ifr_name' and issues ioctl 'cmd' with 'ifr' on
 * 'af_inet_sock'.  A failing ioctl is logged at debug level, rate-limited,
 * with 'cmd_name' identifying the request in the message. */
4369 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4370 const char *cmd_name)
/* The caller's 'ifr' is modified: its name member is overwritten here. */
4372 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4373 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4374 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
/* Obtains an IPv4 address for 'netdev' by issuing ioctl 'cmd' (named
 * 'cmd_name' for logging) through netdev_linux_do_ioctl(), then stores the
 * 'sin_addr' member of the returned 'ifr_addr' in '*ip'. */
4382 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4383 int cmd, const char *cmd_name)
/* The request is marked as AF_INET before the ioctl is issued. */
4388 ifr.ifr_addr.sa_family = AF_INET;
4389 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
/* 'ifr_addr' is reinterpreted as a sockaddr_in to reach the IPv4 address. */
4391 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4392 *ip = sin->sin_addr;
4397 /* Returns an AF_PACKET raw socket or a negative errno value. */
4399 af_packet_sock(void)
4401 static int sock = INT_MIN;
4403 if (sock == INT_MIN) {
4404 sock = socket(AF_PACKET, SOCK_RAW, 0);
4406 set_nonblocking(sock);
4409 VLOG_ERR("failed to create packet socket: %s", strerror(errno));