2 * Copyright (c) 2009, 2010, 2011, 2012 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
24 #include <arpa/inet.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
30 #include <linux/types.h>
31 #include <linux/ethtool.h>
32 #include <linux/mii.h>
33 #include <linux/pkt_cls.h>
34 #include <linux/pkt_sched.h>
35 #include <linux/rtnetlink.h>
36 #include <linux/sockios.h>
37 #include <linux/version.h>
38 #include <sys/types.h>
39 #include <sys/ioctl.h>
40 #include <sys/socket.h>
41 #include <netpacket/packet.h>
43 #include <net/if_arp.h>
44 #include <net/if_packet.h>
45 #include <net/route.h>
46 #include <netinet/in.h>
53 #include "dpif-linux.h"
54 #include "dynamic-string.h"
55 #include "fatal-signal.h"
58 #include "netdev-provider.h"
59 #include "netdev-vport.h"
61 #include "netlink-notifier.h"
62 #include "netlink-socket.h"
64 #include "openflow/openflow.h"
66 #include "poll-loop.h"
67 #include "rtnetlink-link.h"
68 #include "socket-util.h"
74 VLOG_DEFINE_THIS_MODULE(netdev_linux);
76 COVERAGE_DEFINE(netdev_set_policing);
77 COVERAGE_DEFINE(netdev_arp_lookup);
78 COVERAGE_DEFINE(netdev_get_ifindex);
79 COVERAGE_DEFINE(netdev_get_hwaddr);
80 COVERAGE_DEFINE(netdev_set_hwaddr);
81 COVERAGE_DEFINE(netdev_ethtool);
84 /* These were introduced in Linux 2.6.14, so they might be missing if we have old headers. */
86 #ifndef ADVERTISED_Pause
87 #define ADVERTISED_Pause (1 << 13)
89 #ifndef ADVERTISED_Asym_Pause
90 #define ADVERTISED_Asym_Pause (1 << 14)
93 /* These were introduced in Linux 2.6.24, so they might be missing if we
94 * have old headers. */
95 #ifndef ETHTOOL_GFLAGS
96 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
98 #ifndef ETHTOOL_SFLAGS
99 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
102 /* This was introduced in Linux 2.6.25, so it might be missing if we have old headers. */
105 #define TC_RTAB_SIZE 1024
108 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
109 static int cache_notifier_refcount;
112 VALID_IFINDEX = 1 << 0,
113 VALID_ETHERADDR = 1 << 1,
117 VALID_POLICING = 1 << 5,
118 VALID_VPORT_STAT_ERROR = 1 << 6,
119 VALID_DRVINFO = 1 << 7,
127 /* Traffic control. */
129 /* An instance of a traffic control class. Always associated with a particular
132 * Each TC implementation subclasses this with whatever additional data it
135 const struct tc_ops *ops;
136 struct hmap queues; /* Contains "struct tc_queue"s.
137 * Read by generic TC layer.
138 * Written only by TC implementation. */
141 /* One traffic control queue.
143 * Each TC implementation subclasses this with whatever additional data it
146 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
147 unsigned int queue_id; /* OpenFlow queue ID. */
150 /* A particular kind of traffic control. Each implementation generally maps to
151 * one particular Linux qdisc class.
153 * The functions below return 0 if successful or a positive errno value on
154 * failure, except where otherwise noted. All of them must be provided, except
155 * where otherwise noted. */
157 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
158 * This is null for tc_ops_default and tc_ops_other, for which there are no
159 * appropriate values. */
160 const char *linux_name;
162 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
163 const char *ovs_name;
165 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
166 * queues. The queues are numbered 0 through n_queues - 1. */
167 unsigned int n_queues;
169 /* Called to install this TC class on 'netdev'. The implementation should
170 * make the Netlink calls required to set up 'netdev' with the right qdisc
171 * and configure it according to 'details'. The implementation may assume
172 * that the current qdisc is the default; that is, there is no need for it
173 * to delete the current qdisc before installing itself.
175 * The contents of 'details' should be documented as valid for 'ovs_name'
176 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
177 * (which is built as ovs-vswitchd.conf.db(8)).
179 * This function must return 0 if and only if it sets 'netdev->tc' to an
180 * initialized 'struct tc'.
182 * (This function is null for tc_ops_other, which cannot be installed. For
183 * other TC classes it should always be nonnull.) */
184 int (*tc_install)(struct netdev *netdev, const struct shash *details);
186 /* Called when the netdev code determines (through a Netlink query) that
187 * this TC class's qdisc is installed on 'netdev', but we didn't install
188 * it ourselves and so don't know any of the details.
190 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
191 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
192 * implementation should parse the other attributes of 'nlmsg' as
193 * necessary to determine its configuration. If necessary it should also
194 * use Netlink queries to determine the configuration of queues on
197 * This function must return 0 if and only if it sets 'netdev->tc' to an
198 * initialized 'struct tc'. */
199 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
201 /* Destroys the data structures allocated by the implementation as part of
202 * 'tc'. (This includes destroying 'tc->queues' by calling
205 * The implementation should not need to perform any Netlink calls. If
206 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
207 * (But it may not be desirable.)
209 * This function may be null if 'tc' is trivial. */
210 void (*tc_destroy)(struct tc *tc);
212 /* Retrieves details of 'netdev->tc' configuration into 'details'.
214 * The implementation should not need to perform any Netlink calls, because
215 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
216 * cached the configuration.
218 * The contents of 'details' should be documented as valid for 'ovs_name'
219 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
220 * (which is built as ovs-vswitchd.conf.db(8)).
222 * This function may be null if 'tc' is not configurable.
224 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
226 /* Reconfigures 'netdev->tc' according to 'details', performing any
227 * required Netlink calls to complete the reconfiguration.
229 * The contents of 'details' should be documented as valid for 'ovs_name'
230 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
231 * (which is built as ovs-vswitchd.conf.db(8)).
233 * This function may be null if 'tc' is not configurable.
235 int (*qdisc_set)(struct netdev *, const struct shash *details);
237 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
238 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
240 * The contents of 'details' should be documented as valid for 'ovs_name'
241 * in the "other_config" column in the "Queue" table in
242 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
244 * The implementation should not need to perform any Netlink calls, because
245 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
246 * cached the queue configuration.
248 * This function may be null if 'tc' does not have queues ('n_queues' is
250 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
251 struct shash *details);
253 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
254 * 'details', performing any required Netlink calls to complete the
255 * reconfiguration. The caller ensures that 'queue_id' is less than
258 * The contents of 'details' should be documented as valid for 'ovs_name'
259 * in the "other_config" column in the "Queue" table in
260 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
262 * This function may be null if 'tc' does not have queues or its queues are
263 * not configurable. */
264 int (*class_set)(struct netdev *, unsigned int queue_id,
265 const struct shash *details);
267 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
268 * tc_queue's within 'netdev->tc->queues'.
270 * This function may be null if 'tc' does not have queues or its queues
271 * cannot be deleted. */
272 int (*class_delete)(struct netdev *, struct tc_queue *queue);
274 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
275 * 'struct tc_queue's within 'netdev->tc->queues'.
277 * On success, initializes '*stats'.
279 * This function may be null if 'tc' does not have queues or if it cannot
280 * report queue statistics. */
281 int (*class_get_stats)(const struct netdev *netdev,
282 const struct tc_queue *queue,
283 struct netdev_queue_stats *stats);
285 /* Extracts queue stats from 'nlmsg', which is a response to a
286 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
288 * This function may be null if 'tc' does not have queues or if it cannot
289 * report queue statistics. */
290 int (*class_dump_stats)(const struct netdev *netdev,
291 const struct ofpbuf *nlmsg,
292 netdev_dump_queue_stats_cb *cb, void *aux);
/* Initializes 'tc'.  The visible part of the body initializes the
 * 'tc->queues' hmap.  NOTE(review): how 'ops' is recorded in 'tc' is not
 * visible in this excerpt. */
296 tc_init(struct tc *tc, const struct tc_ops *ops)
299 hmap_init(&tc->queues);
/* Releases the 'tc->queues' hmap of 'tc' with hmap_destroy(). */
303 tc_destroy(struct tc *tc)
305 hmap_destroy(&tc->queues);
308 static const struct tc_ops tc_ops_htb;
309 static const struct tc_ops tc_ops_hfsc;
310 static const struct tc_ops tc_ops_default;
311 static const struct tc_ops tc_ops_other;
313 static const struct tc_ops *tcs[] = {
314 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
315 &tc_ops_hfsc, /* Hierarchical fair service curve. */
316 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
317 &tc_ops_other, /* Some other qdisc. */
321 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
322 static unsigned int tc_get_major(unsigned int handle);
323 static unsigned int tc_get_minor(unsigned int handle);
325 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
326 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
327 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
329 static struct tcmsg *tc_make_request(const struct netdev *, int type,
330 unsigned int flags, struct ofpbuf *);
331 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
332 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
333 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
336 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
337 struct nlattr **options);
338 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
339 struct nlattr **options,
340 struct netdev_queue_stats *);
341 static int tc_query_class(const struct netdev *,
342 unsigned int handle, unsigned int parent,
343 struct ofpbuf **replyp);
344 static int tc_delete_class(const struct netdev *, unsigned int handle);
346 static int tc_del_qdisc(struct netdev *netdev);
347 static int tc_query_qdisc(const struct netdev *netdev);
349 static int tc_calc_cell_log(unsigned int mtu);
350 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
351 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
352 const struct tc_ratespec *rate);
353 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
355 struct netdev_dev_linux {
356 struct netdev_dev netdev_dev;
358 struct shash_node *shash_node;
359 unsigned int cache_valid;
360 unsigned int change_seq;
362 bool miimon; /* Link status of last poll. */
363 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
364 struct timer miimon_timer;
366 /* The following are figured out "on demand" only. They are only valid
367 * when the corresponding VALID_* bit in 'cache_valid' is set. */
369 uint8_t etheraddr[ETH_ADDR_LEN];
370 struct in_addr address, netmask;
373 unsigned int ifi_flags;
374 long long int carrier_resets;
375 uint32_t kbits_rate; /* Policing data. */
376 uint32_t kbits_burst;
377 int vport_stats_error; /* Cached error code from vport_get_stats().
378 0 or an errno value. */
379 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
383 struct tap_state tap;
387 struct netdev_linux {
388 struct netdev netdev;
392 /* Sockets used for ioctl operations. */
393 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
395 /* A Netlink routing socket that is not subscribed to any multicast groups. */
396 static struct nl_sock *rtnl_sock;
398 /* This is set pretty low because we probably won't learn anything from the
399 * additional log messages. */
400 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
402 static int netdev_linux_init(void);
404 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
405 int cmd, const char *cmd_name);
406 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
407 const char *cmd_name);
408 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
409 int cmd, const char *cmd_name);
410 static int get_flags(const struct netdev_dev *, unsigned int *flags);
411 static int set_flags(struct netdev *, unsigned int flags);
412 static int do_get_ifindex(const char *netdev_name);
413 static int get_ifindex(const struct netdev *, int *ifindexp);
414 static int do_set_addr(struct netdev *netdev,
415 int ioctl_nr, const char *ioctl_name,
416 struct in_addr addr);
417 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
418 static int set_etheraddr(const char *netdev_name, int hwaddr_family,
419 const uint8_t[ETH_ADDR_LEN]);
420 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
421 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
422 static int af_packet_sock(void);
423 static void netdev_linux_miimon_run(void);
424 static void netdev_linux_miimon_wait(void);
/* Returns true if 'netdev_class' uses netdev_linux_init() as its 'init'
 * member.  The cast helpers in this file assert on this test before
 * downcasting. */
427 is_netdev_linux_class(const struct netdev_class *netdev_class)
429 return netdev_class->init == netdev_linux_init;
/* Converts the generic 'netdev_dev' into the struct netdev_dev_linux that
 * embeds it, asserting first that its class passes
 * is_netdev_linux_class(). */
432 static struct netdev_dev_linux *
433 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
435 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
436 assert(is_netdev_linux_class(netdev_class));
438 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
/* Converts the generic 'netdev' into the struct netdev_linux that embeds it,
 * asserting first that the class of its underlying device passes
 * is_netdev_linux_class(). */
441 static struct netdev_linux *
442 netdev_linux_cast(const struct netdev *netdev)
444 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
445 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
446 assert(is_netdev_linux_class(netdev_class));
448 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
/* Sets up the file-scope sockets used by this module: the AF_INET datagram
 * socket 'af_inet_sock' and the NETLINK_ROUTE socket 'rtnl_sock'.
 *
 * 'status' is static, so its value persists across calls.
 * NOTE(review): the conditions guarding the statements below and the return
 * statement are not visible in this excerpt. */
452 netdev_linux_init(void)
454 static int status = -1;
456 /* Create AF_INET socket. */
457 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
/* 0 on success, otherwise the errno left by socket(). */
458 status = af_inet_sock >= 0 ? 0 : errno;
460 VLOG_ERR("failed to create inet socket: %s", strerror(status));
463 /* Create rtnetlink socket. */
465 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
467 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
/* Performs this module's periodic work: runs rtnetlink link processing and
 * then the MII monitor pass. */
476 netdev_linux_run(void)
478 rtnetlink_link_run();
479 netdev_linux_miimon_run();
/* Wait-side counterpart of netdev_linux_run(): calls the rtnetlink link and
 * MII monitor wait functions. */
483 netdev_linux_wait(void)
485 rtnetlink_link_wait();
486 netdev_linux_miimon_wait();
/* Fills 'netdev_dev->drvinfo' through an ethtool request and records that it
 * is cached by setting VALID_DRVINFO in 'cache_valid'.  The cache bit is
 * tested first.  NOTE(review): the early-return and error-handling lines are
 * not visible in this excerpt. */
490 netdev_linux_get_drvinfo(struct netdev_dev_linux *netdev_dev)
495 if (netdev_dev->cache_valid & VALID_DRVINFO) {
/* Start from a zeroed structure before asking the driver to fill it in. */
499 memset(&netdev_dev->drvinfo, 0, sizeof netdev_dev->drvinfo);
/* The ethtool helper takes a 'struct ethtool_cmd *', hence the cast. */
500 error = netdev_linux_do_ethtool(netdev_dev->netdev_dev.name,
501 (struct ethtool_cmd *)&netdev_dev->drvinfo,
505 netdev_dev->cache_valid |= VALID_DRVINFO;
/* Records a change to 'dev'.
 *
 * Counts a carrier reset when the IFF_RUNNING bit differs between the stored
 * flags and 'ifi_flags', stores 'ifi_flags', and clears every bit of
 * 'cache_valid' that is not set in 'mask'.
 * NOTE(review): the handling of 'change_seq' is only partly visible here. */
511 netdev_dev_linux_changed(struct netdev_dev_linux *dev,
512 unsigned int ifi_flags,
516 if (!dev->change_seq) {
/* XOR isolates the flag bits that changed. */
520 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
521 dev->carrier_resets++;
523 dev->ifi_flags = ifi_flags;
/* Only the cache bits named in 'mask' stay valid. */
525 dev->cache_valid &= mask;
/* Applies the rtnetlink notification 'change' to 'dev'.
 *
 * One call keeps VALID_DRVINFO cached while dropping the other cache bits;
 * the other call passes mask 0 and so drops every cached value.
 * NOTE(review): the second call is presumably the non-RTM_NEWLINK branch; the
 * branch structure is not fully visible in this excerpt. */
529 netdev_dev_linux_update(struct netdev_dev_linux *dev,
530 const struct rtnetlink_link_change *change)
532 if (change->nlmsg_type == RTM_NEWLINK) {
534 netdev_dev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
536 netdev_dev_linux_changed(dev, change->ifi_flags, 0);
/* Callback passed to rtnetlink_link_notifier_create() by
 * cache_notifier_ref().  'aux' is unused.
 *
 * Two paths are visible.  One looks up the device named by 'change->ifname'
 * and, if it belongs to a netdev-linux class, applies the change to it.  The
 * other walks every device of 'netdev_linux_class', re-reads its flags and
 * invalidates all of its cached values.
 * NOTE(review): the conditions selecting between the two paths are not visible
 * in this excerpt. */
541 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
542 void *aux OVS_UNUSED)
544 struct netdev_dev_linux *dev;
546 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
548 const struct netdev_class *netdev_class =
549 netdev_dev_get_class(base_dev);
/* Devices of other classes are not ours to update. */
551 if (is_netdev_linux_class(netdev_class)) {
552 dev = netdev_dev_linux_cast(base_dev);
553 netdev_dev_linux_update(dev, change);
557 struct shash device_shash;
558 struct shash_node *node;
560 shash_init(&device_shash);
561 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
562 SHASH_FOR_EACH (node, &device_shash) {
/* The return value of get_flags() is not checked here. */
567 get_flags(&dev->netdev_dev, &flags);
568 netdev_dev_linux_changed(dev, flags, 0);
570 shash_destroy(&device_shash);
/* Takes a reference on the shared rtnetlink link notifier.  When the
 * reference count is zero the notifier is created first, with
 * netdev_linux_cache_cb() as its callback.  Callers treat the return value as
 * an error code.  NOTE(review): the failure return is not visible here. */
575 cache_notifier_ref(void)
577 if (!cache_notifier_refcount) {
/* No references means the notifier must not exist yet. */
578 assert(!netdev_linux_cache_notifier);
580 netdev_linux_cache_notifier =
581 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
583 if (!netdev_linux_cache_notifier) {
587 cache_notifier_refcount++;
/* Drops one reference on the shared rtnetlink link notifier.  When the count
 * reaches zero the notifier is destroyed and the global pointer is reset to
 * NULL. */
593 cache_notifier_unref(void)
595 assert(cache_notifier_refcount > 0);
596 if (!--cache_notifier_refcount) {
597 assert(netdev_linux_cache_notifier);
598 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
599 netdev_linux_cache_notifier = NULL;
603 /* Creates system and internal devices. */
/* Allocates a zeroed netdev_dev_linux named 'name' of class 'class', after
 * taking a cache-notifier reference, and stores a pointer to its embedded
 * netdev_dev in '*netdev_devp'. */
605 netdev_linux_create(const struct netdev_class *class, const char *name,
606 struct netdev_dev **netdev_devp)
608 struct netdev_dev_linux *netdev_dev;
611 error = cache_notifier_ref();
616 netdev_dev = xzalloc(sizeof *netdev_dev);
/* Starts at a nonzero value; netdev_dev_linux_changed() tests 'change_seq'
 * for zero. */
617 netdev_dev->change_seq = 1;
618 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
/* Seeds the stored flags; the return value of get_flags() is not checked. */
619 get_flags(&netdev_dev->netdev_dev, &netdev_dev->ifi_flags);
621 *netdev_devp = &netdev_dev->netdev_dev;
625 /* For most types of netdevs we open the device for each call of
626 * netdev_open(). However, this is not the case with tap devices,
627 * since it is only possible to open the device once. In this
628 * situation we share a single file descriptor, and consequently
629 * buffers, across all readers. Therefore once data is read it will
630 * be unavailable to other reads for tap devices. */
/* Creates a tap device named 'name' and stores it in '*netdev_devp'.
 *
 * Opens /dev/net/tun, issues TUNSETIFF with IFF_TAP | IFF_NO_PI, makes the
 * descriptor non-blocking, and initializes the device with
 * 'netdev_tap_class'.  The 'class' argument is unused.
 *
 * Failures after the notifier reference is taken jump to
 * 'error_unref_notifier', which drops that reference.
 * NOTE(review): whether the tap fd is closed on those failure paths is not
 * visible in this excerpt -- confirm. */
632 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
633 const char *name, struct netdev_dev **netdev_devp)
635 struct netdev_dev_linux *netdev_dev;
636 struct tap_state *state;
637 static const char tap_dev[] = "/dev/net/tun";
641 netdev_dev = xzalloc(sizeof *netdev_dev);
642 state = &netdev_dev->state.tap;
644 error = cache_notifier_ref();
649 /* Open tap device. */
650 state->fd = open(tap_dev, O_RDWR);
653 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
654 goto error_unref_notifier;
657 /* Create tap device. */
658 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
659 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
660 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
661 VLOG_WARN("%s: creating tap device failed: %s", name,
664 goto error_unref_notifier;
667 /* Make non-blocking. */
668 error = set_nonblocking(state->fd);
670 goto error_unref_notifier;
673 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
674 *netdev_devp = &netdev_dev->netdev_dev;
677 error_unref_notifier:
678 cache_notifier_unref();
/* Tears down the tap-specific state of 'netdev_dev'.  Acts only when the
 * stored tap fd is non-negative.  NOTE(review): the action taken on the fd is
 * not visible in this excerpt. */
685 destroy_tap(struct netdev_dev_linux *netdev_dev)
687 struct tap_state *state = &netdev_dev->state.tap;
689 if (state->fd >= 0) {
694 /* Destroys the netdev device 'netdev_dev_'. */
/* Releases what 'netdev_dev_' owns: its traffic-control state (through the
 * optional 'tc_destroy' hook), its tap state when the class is
 * 'netdev_tap_class', and one cache-notifier reference. */
696 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
698 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
699 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
/* 'tc_destroy' may be null, so test it before calling. */
701 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
702 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
705 if (class == &netdev_tap_class) {
706 destroy_tap(netdev_dev);
710 cache_notifier_unref();
/* Opens a new netdev handle on 'netdev_dev_' and stores it in '*netdevp'.
 *
 * For every class except 'netdev_internal_class' the device's flags are read
 * to check that it exists.  The first opener of a "tap" device receives the
 * shared tap fd.  The netdev_uninit() call at the end is the cleanup for a
 * failed open.  NOTE(review): the initial value given to 'netdev->fd' and the
 * return statements are not visible in this excerpt. */
714 netdev_linux_open(struct netdev_dev *netdev_dev_, struct netdev **netdevp)
716 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
717 struct netdev_linux *netdev;
718 enum netdev_flags flags;
721 /* Allocate network device. */
722 netdev = xzalloc(sizeof *netdev);
724 netdev_init(&netdev->netdev, netdev_dev_);
726 /* Verify that the device really exists, by attempting to read its flags.
727 * (The flags might be cached, in which case this won't actually do an
730 * Don't do this for "internal" netdevs, though, because those have to be
731 * created as netdev objects before they exist in the kernel, because
732 * creating them in the kernel happens by passing a netdev object to
733 * dpif_port_add(). */
734 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
735 error = netdev_get_flags(&netdev->netdev, &flags);
736 if (error == ENODEV) {
/* Only the first opener of a tap device gets the tap fd. */
741 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
742 !netdev_dev->state.tap.opened) {
744 /* We assume that the first user of the tap device is the primary user
745 * and give them the tap FD. Subsequent users probably just expect
746 * this to be a system device so open it normally to avoid send/receive
747 * directions appearing to be reversed. */
748 netdev->fd = netdev_dev->state.tap.fd;
749 netdev_dev->state.tap.opened = true;
752 *netdevp = &netdev->netdev;
756 netdev_uninit(&netdev->netdev, true);
760 /* Closes and destroys 'netdev'. */
/* Closes the handle 'netdev_'.  The visible condition selects handles whose
 * fd is positive and whose device type is not "tap"; tap handles share the
 * device-level fd handed out by netdev_linux_open().
 * NOTE(review): this tests 'fd > 0' while the other functions in this file
 * treat 'fd >= 0' as an open descriptor -- confirm that fd 0 cannot occur. */
762 netdev_linux_close(struct netdev *netdev_)
764 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
766 if (netdev->fd > 0 && strcmp(netdev_get_type(netdev_), "tap")) {
/* Prepares 'netdev_' for receiving packets.
 *
 * Creates a PF_PACKET raw socket, makes it non-blocking, and binds it to the
 * device's ifindex for all protocols (ETH_P_ALL).  An existing non-negative
 * 'netdev->fd' is tested first.  NOTE(review): the error-path cleanup and the
 * final assignment of 'fd' are not visible in this excerpt. */
773 netdev_linux_listen(struct netdev *netdev_)
775 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
776 struct sockaddr_ll sll;
781 if (netdev->fd >= 0) {
785 /* Create file descriptor. */
786 fd = socket(PF_PACKET, SOCK_RAW, 0);
789 VLOG_ERR("failed to create raw socket (%s)", strerror(error));
793 /* Set non-blocking mode. */
794 error = set_nonblocking(fd);
799 /* Get ethernet device index. */
800 error = get_ifindex(&netdev->netdev, &ifindex);
805 /* Bind to specific ethernet device. */
806 memset(&sll, 0, sizeof sll);
807 sll.sll_family = AF_PACKET;
808 sll.sll_ifindex = ifindex;
/* The protocol field is stored in network byte order. */
809 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
810 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
812 VLOG_ERR("%s: failed to bind raw socket (%s)",
813 netdev_get_name(netdev_), strerror(error));
/* Receives one packet from 'netdev_' into the 'size'-byte buffer 'data'.
 *
 * Tap devices are read with read(); all other devices use recv() with
 * MSG_TRUNC.  A result that fits in 'size' is returned as the byte count; a
 * larger one yields -EMSGSIZE.  A handle with a negative fd is not listening.
 * Errors other than EINTR and EAGAIN are logged, rate-limited.
 * NOTE(review): the retry loop and the remaining return statements are not
 * visible in this excerpt. */
828 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
830 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
832 if (netdev->fd < 0) {
833 /* Device is not listening. */
840 retval = (netdev_->netdev_dev->netdev_class == &netdev_tap_class
841 ? read(netdev->fd, data, size)
842 : recv(netdev->fd, data, size, MSG_TRUNC));
844 return retval <= size ? retval : -EMSGSIZE;
845 } else if (errno != EINTR) {
846 if (errno != EAGAIN) {
/* Argument order follows the format string: device name, then error text
 * (the same order netdev_linux_send() uses). */
847 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
848 netdev_get_name(netdev_), strerror(errno));
855 /* Registers with the poll loop to wake up from the next call to poll_block()
856 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
/* Registers POLLIN interest in the handle's fd with the poll loop.  Does
 * nothing when the fd is negative. */
858 netdev_linux_recv_wait(struct netdev *netdev_)
860 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
861 if (netdev->fd >= 0) {
862 poll_fd_wait(netdev->fd, POLLIN);
866 /* Discards all packets waiting to be received from 'netdev'. */
/* Discards pending received packets.  Three cases on the handle's state:
 * a negative fd; a "tap" device, drained with drain_fd() using the queue
 * length fetched via SIOCGIFTXQLEN; and any other device, drained with
 * drain_rcvbuf().  NOTE(review): the return values of the first two cases are
 * not visible in this excerpt. */
868 netdev_linux_drain(struct netdev *netdev_)
870 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
871 if (netdev->fd < 0) {
873 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
875 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
876 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
/* The queue length bounds how much is drained from the tap fd. */
880 drain_fd(netdev->fd, ifr.ifr_qlen);
883 return drain_rcvbuf(netdev->fd);
887 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
888 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
889 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
890 * the packet is too big or too small to transmit on the device.
892 * The caller retains ownership of 'buffer' in all cases.
894 * The kernel maintains a packet transmission queue, so the caller is not
895 * expected to do additional queuing of packets. */
/* Transmits the 'size' bytes at 'data' on 'netdev_'.
 *
 * Two transmit paths: when the handle has no fd of its own, the packet goes
 * out with sendmsg() on the AF_PACKET socket from af_packet_sock(), addressed
 * by the device's ifindex; otherwise it is written directly to the handle's
 * fd.  The errno value and short writes are then examined and logged.
 * NOTE(review): the loop structure and return statements are not visible in
 * this excerpt. */
897 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
899 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
903 if (netdev->fd < 0) {
904 /* Use our AF_PACKET socket to send to this device. */
905 struct sockaddr_ll sll;
912 sock = af_packet_sock();
917 error = get_ifindex(netdev_, &ifindex);
922 /* We don't bother setting most fields in sockaddr_ll because the
923 * kernel ignores them for SOCK_RAW. */
924 memset(&sll, 0, sizeof sll);
925 sll.sll_family = AF_PACKET;
926 sll.sll_ifindex = ifindex;
/* The iovec member is not const-qualified, hence the cast. */
928 iov.iov_base = (void *) data;
932 msg.msg_namelen = sizeof sll;
935 msg.msg_control = NULL;
936 msg.msg_controllen = 0;
939 retval = sendmsg(sock, &msg, 0);
941 /* Use the netdev's own fd to send to this device. This is
942 * essential for tap devices, because packets sent to a tap device
943 * with an AF_PACKET socket will loop back to be *received* again
944 * on the tap device. */
945 retval = write(netdev->fd, data, size);
949 /* The Linux AF_PACKET implementation never blocks waiting for room
950 * for packets, instead returning ENOBUFS. Translate this into
951 * EAGAIN for the caller. */
952 if (errno == ENOBUFS) {
954 } else if (errno == EINTR) {
956 } else if (errno != EAGAIN) {
957 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
958 netdev_get_name(netdev_), strerror(errno));
/* A short write is reported as a partial packet. */
961 } else if (retval != size) {
962 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
963 "%zu) on %s", retval, size, netdev_get_name(netdev_));
971 /* Registers with the poll loop to wake up from the next call to poll_block()
972 * when the packet transmission queue has sufficient room to transmit a packet
973 * with netdev_send().
975 * The kernel maintains a packet transmission queue, so the client is not
976 * expected to do additional queuing of packets. Thus, this function is
977 * unlikely to ever be used. It is included for completeness. */
/* Arranges a poll-loop wakeup for sending.  Non-tap devices wait for POLLOUT
 * on the handle's fd; tap devices wake immediately.  NOTE(review): the action
 * for a negative fd is not visible in this excerpt. */
979 netdev_linux_send_wait(struct netdev *netdev_)
981 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
982 if (netdev->fd < 0) {
/* strcmp() is nonzero here, i.e. the type is something other than "tap". */
984 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
985 poll_fd_wait(netdev->fd, POLLOUT);
987 /* TAP device always accepts packets.*/
988 poll_immediate_wake();
992 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
993 * otherwise a positive errno value. */
/* Sets the MAC address of 'netdev_' to 'mac'.  The kernel call is made only
 * when no address is cached or the cached address differs from 'mac'; the new
 * address is then stored in the cache. */
995 netdev_linux_set_etheraddr(struct netdev *netdev_,
996 const uint8_t mac[ETH_ADDR_LEN])
998 struct netdev_dev_linux *netdev_dev =
999 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1002 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
1003 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
1004 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
1006 netdev_dev->cache_valid |= VALID_ETHERADDR;
1007 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
1015 /* Copies 'netdev''s MAC address into 'mac', which the caller supplies and
1016 * which must have room for ETH_ADDR_LEN bytes. */
/* Copies the MAC address of 'netdev_' into the caller's buffer 'mac'.  The
 * address is fetched with get_etheraddr() and cached when no cached value is
 * valid. */
1018 netdev_linux_get_etheraddr(const struct netdev *netdev_,
1019 uint8_t mac[ETH_ADDR_LEN])
1021 struct netdev_dev_linux *netdev_dev =
1022 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1023 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
1024 int error = get_etheraddr(netdev_get_name(netdev_),
1025 netdev_dev->etheraddr);
1029 netdev_dev->cache_valid |= VALID_ETHERADDR;
1031 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
1035 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1036 * in bytes, not including the hardware header; thus, this is typically 1500
1037 * bytes for Ethernet devices. */
/* Stores the MTU of 'netdev_' in '*mtup'.  When no cached MTU is valid it is
 * queried with SIOCGIFMTU and cached. */
1039 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1041 struct netdev_dev_linux *netdev_dev =
1042 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1043 if (!(netdev_dev->cache_valid & VALID_MTU)) {
1047 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1048 SIOCGIFMTU, "SIOCGIFMTU");
1052 netdev_dev->mtu = ifr.ifr_mtu;
1053 netdev_dev->cache_valid |= VALID_MTU;
1055 *mtup = netdev_dev->mtu;
1059 /* Sets the maximum transmission unit (MTU) of the given device using the
1060 * Linux networking ioctl interface. */
/* Sets the MTU of 'netdev_' to 'mtu' with SIOCSIFMTU.  A cached MTU equal to
 * 'mtu' is tested first; afterwards the value left in the ifreq is stored in
 * the cache and marked valid. */
1063 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1065 struct netdev_dev_linux *netdev_dev =
1066 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1070 if (netdev_dev->cache_valid & VALID_MTU &&
1071 netdev_dev->mtu == mtu) {
1075 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1076 SIOCSIFMTU, "SIOCSIFMTU");
1081 netdev_dev->mtu = ifr.ifr_mtu;
1082 netdev_dev->cache_valid |= VALID_MTU;
1086 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1087 * On failure, returns a negative errno value. */
/* Wraps get_ifindex(), folding its two results into one: the ifindex on
 * success, or the negated error code on failure. */
1089 netdev_linux_get_ifindex(const struct netdev *netdev)
1093 error = get_ifindex(netdev, &ifindex);
1094 return error ? -error : ifindex;
/* Stores the carrier state of 'netdev_' in '*carrier'.  With MII monitoring
 * enabled (positive 'miimon_interval') this is the last polled MII link
 * status; the other visible assignment derives it from the IFF_RUNNING bit of
 * the stored interface flags. */
1098 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1100 struct netdev_dev_linux *netdev_dev =
1101 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1103 if (netdev_dev->miimon_interval > 0) {
1104 *carrier = netdev_dev->miimon;
1106 *carrier = (netdev_dev->ifi_flags & IFF_RUNNING) != 0;
/* Returns the 'carrier_resets' counter of the device underlying 'netdev',
 * which netdev_dev_linux_changed() increments on each IFF_RUNNING change. */
1112 static long long int
1113 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1115 return netdev_dev_linux_cast(netdev_get_dev(netdev))->carrier_resets;
/* Issues the MII ioctl 'cmd' (named 'cmd_name' for logging) on device 'name'.
 * '*data' is copied into the 'ifr_data' area of a zeroed ifreq before the
 * call and copied back out afterwards. */
1119 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1120 struct mii_ioctl_data *data)
1125 memset(&ifr, 0, sizeof ifr);
1126 memcpy(&ifr.ifr_data, data, sizeof *data);
1127 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
/* Copied back regardless of 'error', as far as the visible lines show. */
1128 memcpy(data, &ifr.ifr_data, sizeof *data);
/* Determines the link status of device 'name' and stores it in '*miimon'.
 *
 * First queries the PHY with SIOCGMIIPHY and reads the MII_BMSR register with
 * SIOCGMIIREG, taking the BMSR_LSTATUS bit as the link state.  A fallback
 * path uses ethtool's ETHTOOL_GLINK instead.  NOTE(review): the conditions
 * connecting these steps are not visible in this excerpt. */
1134 netdev_linux_get_miimon(const char *name, bool *miimon)
1136 struct mii_ioctl_data data;
1141 memset(&data, 0, sizeof data);
1142 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1144 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1145 data.reg_num = MII_BMSR;
1146 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1150 *miimon = !!(data.val_out & BMSR_LSTATUS);
1152 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1155 struct ethtool_cmd ecmd;
1157 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1160 memset(&ecmd, 0, sizeof ecmd);
1161 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1164 struct ethtool_value eval;
/* The reply is reinterpreted as a struct ethtool_value by copying. */
1166 memcpy(&eval, &ecmd, sizeof eval);
1167 *miimon = !!eval.data;
1169 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
/* Sets the MII monitoring interval of 'netdev_'.  Positive values are raised
 * to at least 100 and non-positive values become 0 (monitoring disabled).
 * When the interval actually changes, the timer is marked expired so that the
 * next miimon run polls immediately.
 * NOTE(review): the unit is presumably milliseconds -- confirm. */
1177 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1178 long long int interval)
1180 struct netdev_dev_linux *netdev_dev;
1182 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
1184 interval = interval > 0 ? MAX(interval, 100) : 0;
1185 if (netdev_dev->miimon_interval != interval) {
1186 netdev_dev->miimon_interval = interval;
1187 timer_set_expired(&netdev_dev->miimon_timer);
/* Polls MII link status on every 'netdev_linux_class' device whose monitoring
 * is enabled and whose timer has expired.  A changed status is stored and
 * reported through netdev_dev_linux_changed() with mask 0; the timer is then
 * re-armed for 'miimon_interval'. */
1194 netdev_linux_miimon_run(void)
1196 struct shash device_shash;
1197 struct shash_node *node;
1199 shash_init(&device_shash);
1200 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1201 SHASH_FOR_EACH (node, &device_shash) {
1202 struct netdev_dev_linux *dev = node->data;
/* Monitoring disabled, or not yet due. */
1205 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1209 netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
1210 if (miimon != dev->miimon) {
1211 dev->miimon = miimon;
1212 netdev_dev_linux_changed(dev, dev->ifi_flags, 0);
1215 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1218 shash_destroy(&device_shash);
/* Calls timer_wait() on the miimon timer of every 'netdev_linux_class' device
 * whose miimon interval is positive. */
1222 netdev_linux_miimon_wait(void)
1224 struct shash device_shash;
1225 struct shash_node *node;
1227 shash_init(&device_shash);
1228 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1229 SHASH_FOR_EACH (node, &device_shash) {
1230 struct netdev_dev_linux *dev = node->data;
1232 if (dev->miimon_interval > 0) {
1233 timer_wait(&dev->miimon_timer);
1236 shash_destroy(&device_shash);
/* Check whether we can use RTM_GETLINK to get network device statistics.
 * In pre-2.6.19 kernels, this was only available if wireless extensions were
 * enabled. */
/* Probes get_stats_via_netlink() on the ifindex of "lo" and logs which
 * statistics source will be used (rtnetlink, or proc as the fallback).
 *
 * NOTE(review): the conditions and return statements are not visible in this
 * excerpt; the caller stores the result in 'use_netlink_stats'. */
1243 check_for_working_netlink_stats(void)
1245 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1246 * preferable, so if that works, we'll use it. */
1247 int ifindex = do_get_ifindex("lo");
1249 VLOG_WARN("failed to get ifindex for lo, "
1250 "obtaining netdev stats from proc");
1253 struct netdev_stats stats;
1254 int error = get_stats_via_netlink(ifindex, &stats);
1256 VLOG_DBG("obtaining netdev stats via rtnetlink");
1259 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1260 "via proc (you are probably running a pre-2.6.19 "
1261 "kernel)", strerror(error));
/* Exchanges the 64-bit values stored at 'a' and 'b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    const uint64_t first = *a;
    const uint64_t second = *b;

    *a = second;
    *b = first;
}
/* Fetches statistics for 'netdev_' from the vport layer into '*stats'.
 *
 * netdev_vport_get_stats() is attempted only when no failure is cached:
 * either 'vport_stats_error' is 0 or VALID_VPORT_STAT_ERROR is not yet set in
 * 'cache_valid'.  The result code is then cached in 'vport_stats_error' and
 * the cache bit is set. */
1276 get_stats_via_vport(const struct netdev *netdev_,
1277 struct netdev_stats *stats)
1279 struct netdev_dev_linux *netdev_dev =
1280 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1282 if (!netdev_dev->vport_stats_error ||
1283 !(netdev_dev->cache_valid & VALID_VPORT_STAT_ERROR)) {
1286 error = netdev_vport_get_stats(netdev_, stats);
1288 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1289 "(%s)", netdev_get_name(netdev_), strerror(error));
1291 netdev_dev->vport_stats_error = error;
1292 netdev_dev->cache_valid |= VALID_VPORT_STAT_ERROR;
/* Reads kernel statistics for 'netdev_' into '*stats'.
 *
 * The source is chosen once by check_for_working_netlink_stats() and kept in
 * the function-local static 'use_netlink_stats': either rtnetlink
 * (get_ifindex() followed by get_stats_via_netlink()) or
 * get_stats_via_proc().  A failure is logged with its numeric error code. */
1297 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1298 struct netdev_stats *stats)
1300 static int use_netlink_stats = -1;
1303 if (use_netlink_stats < 0) {
1304 use_netlink_stats = check_for_working_netlink_stats();
1307 if (use_netlink_stats) {
1310 error = get_ifindex(netdev_, &ifindex);
1312 error = get_stats_via_netlink(ifindex, stats);
1315 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1319 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1320 netdev_get_name(netdev_), error);
1326 /* Retrieves current device stats for 'netdev-linux'.
 *
 * '*stats' is first filled from the vport layer, then the kernel's own
 * counters are read into 'dev_stats'.  The two are combined depending on
 * 'vport_stats_error': the visible accumulation adds the kernel's error, drop,
 * multicast and collision counters to '*stats'.
 *
 * NOTE(review): the statements inside the 'vport_stats_error' branches are not
 * visible in this excerpt; confirm which branch the additions belong to. */
1328 netdev_linux_get_stats(const struct netdev *netdev_,
1329 struct netdev_stats *stats)
1331 struct netdev_dev_linux *netdev_dev =
1332 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1333 struct netdev_stats dev_stats;
1336 get_stats_via_vport(netdev_, stats);
1338 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1341 if (netdev_dev->vport_stats_error) {
1348 if (netdev_dev->vport_stats_error) {
1349 /* stats not available from OVS then use ioctl stats. */
1352 stats->rx_errors += dev_stats.rx_errors;
1353 stats->tx_errors += dev_stats.tx_errors;
1354 stats->rx_dropped += dev_stats.rx_dropped;
1355 stats->tx_dropped += dev_stats.tx_dropped;
1356 stats->multicast += dev_stats.multicast;
1357 stats->collisions += dev_stats.collisions;
1358 stats->rx_length_errors += dev_stats.rx_length_errors;
1359 stats->rx_over_errors += dev_stats.rx_over_errors;
1360 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1361 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1362 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1363 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1364 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1365 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1366 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1367 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1368 stats->tx_window_errors += dev_stats.tx_window_errors;
1373 /* Retrieves current device stats for 'netdev-tap' netdev or
1374 * netdev-internal. */
1376 netdev_tap_get_stats(const struct netdev *netdev_,
1377 struct netdev_stats *stats)
1379 struct netdev_dev_linux *netdev_dev =
1380 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1381 struct netdev_stats dev_stats;
1384 get_stats_via_vport(netdev_, stats);
1386 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1388 if (netdev_dev->vport_stats_error) {
1395 /* If this port is an internal port then the transmit and receive stats
1396 * will appear to be swapped relative to the other ports since we are the
1397 * one sending the data, not a remote computer. For consistency, we swap
1398 * them back here. This does not apply if we are getting stats from the
1399 * vport layer because it always tracks stats from the perspective of the
1401 if (netdev_dev->vport_stats_error) {
1403 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1404 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1405 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1406 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1407 stats->rx_length_errors = 0;
1408 stats->rx_over_errors = 0;
1409 stats->rx_crc_errors = 0;
1410 stats->rx_frame_errors = 0;
1411 stats->rx_fifo_errors = 0;
1412 stats->rx_missed_errors = 0;
1413 stats->tx_aborted_errors = 0;
1414 stats->tx_carrier_errors = 0;
1415 stats->tx_fifo_errors = 0;
1416 stats->tx_heartbeat_errors = 0;
1417 stats->tx_window_errors = 0;
1419 stats->rx_dropped += dev_stats.tx_dropped;
1420 stats->tx_dropped += dev_stats.rx_dropped;
1422 stats->rx_errors += dev_stats.tx_errors;
1423 stats->tx_errors += dev_stats.rx_errors;
1425 stats->multicast += dev_stats.multicast;
1426 stats->collisions += dev_stats.collisions;
1432 netdev_internal_get_stats(const struct netdev *netdev_,
1433 struct netdev_stats *stats)
1435 struct netdev_dev_linux *netdev_dev =
1436 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1438 get_stats_via_vport(netdev_, stats);
1439 return netdev_dev->vport_stats_error;
/* Stores the features supported by 'netdev' into each of '*current',
 * '*advertised', '*supported', and '*peer'.  (The implementation writes
 * through all four pointers, so callers must pass non-null pointers.)  Each
 * value is a bitmap of NETDEV_* bits.  Returns 0 if successful, otherwise a
 * positive errno value. */
/* Translates the ETHTOOL_GSET reply for 'netdev' into NETDEV_F_* bitmaps:
 * 'ecmd.supported' into '*supported', 'ecmd.advertising' into '*advertised',
 * and the current speed, duplex and port into '*current'.  '*peer' is always
 * set to 0. */
1447 netdev_linux_get_features(const struct netdev *netdev,
1448 enum netdev_features *current,
1449 enum netdev_features *advertised,
1450 enum netdev_features *supported,
1451 enum netdev_features *peer)
1453 struct ethtool_cmd ecmd;
1457 memset(&ecmd, 0, sizeof ecmd);
1458 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1459 ETHTOOL_GSET, "ETHTOOL_GSET");
1464 /* Supported features. */
1466 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1467 *supported |= NETDEV_F_10MB_HD;
1469 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1470 *supported |= NETDEV_F_10MB_FD;
1472 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1473 *supported |= NETDEV_F_100MB_HD;
1475 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1476 *supported |= NETDEV_F_100MB_FD;
1478 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1479 *supported |= NETDEV_F_1GB_HD;
1481 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1482 *supported |= NETDEV_F_1GB_FD;
1484 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1485 *supported |= NETDEV_F_10GB_FD;
1487 if (ecmd.supported & SUPPORTED_TP) {
1488 *supported |= NETDEV_F_COPPER;
1490 if (ecmd.supported & SUPPORTED_FIBRE) {
1491 *supported |= NETDEV_F_FIBER;
1493 if (ecmd.supported & SUPPORTED_Autoneg) {
1494 *supported |= NETDEV_F_AUTONEG;
1496 if (ecmd.supported & SUPPORTED_Pause) {
1497 *supported |= NETDEV_F_PAUSE;
1499 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1500 *supported |= NETDEV_F_PAUSE_ASYM;
1503 /* Advertised features. */
1505 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1506 *advertised |= NETDEV_F_10MB_HD;
1508 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1509 *advertised |= NETDEV_F_10MB_FD;
1511 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1512 *advertised |= NETDEV_F_100MB_HD;
1514 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1515 *advertised |= NETDEV_F_100MB_FD;
1517 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1518 *advertised |= NETDEV_F_1GB_HD;
1520 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1521 *advertised |= NETDEV_F_1GB_FD;
1523 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1524 *advertised |= NETDEV_F_10GB_FD;
1526 if (ecmd.advertising & ADVERTISED_TP) {
1527 *advertised |= NETDEV_F_COPPER;
1529 if (ecmd.advertising & ADVERTISED_FIBRE) {
1530 *advertised |= NETDEV_F_FIBER;
1532 if (ecmd.advertising & ADVERTISED_Autoneg) {
1533 *advertised |= NETDEV_F_AUTONEG;
1535 if (ecmd.advertising & ADVERTISED_Pause) {
1536 *advertised |= NETDEV_F_PAUSE;
1538 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1539 *advertised |= NETDEV_F_PAUSE_ASYM;
1542 /* Current settings. */
/* Speeds up to SPEED_1000 distinguish half and full duplex via 'ecmd.duplex';
 * 10G and faster are always reported as full duplex. */
1544 if (speed == SPEED_10) {
1545 *current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
1546 } else if (speed == SPEED_100) {
1547 *current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
1548 } else if (speed == SPEED_1000) {
1549 *current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
1550 } else if (speed == SPEED_10000) {
1551 *current = NETDEV_F_10GB_FD;
1552 } else if (speed == 40000) {
1553 *current = NETDEV_F_40GB_FD;
1554 } else if (speed == 100000) {
1555 *current = NETDEV_F_100GB_FD;
1556 } else if (speed == 1000000) {
1557 *current = NETDEV_F_1TB_FD;
1562 if (ecmd.port == PORT_TP) {
1563 *current |= NETDEV_F_COPPER;
1564 } else if (ecmd.port == PORT_FIBRE) {
1565 *current |= NETDEV_F_FIBER;
1569 *current |= NETDEV_F_AUTONEG;
1572 /* Peer advertisements. */
1573 *peer = 0; /* XXX */
1578 /* Set the features advertised by 'netdev' to 'advertise'.
 *
 * Reads the current settings with ETHTOOL_GSET, rebuilds 'ecmd.advertising'
 * from scratch out of the NETDEV_F_* bits in 'advertise', and writes the
 * result back with ETHTOOL_SSET, whose result is returned. */
1580 netdev_linux_set_advertisements(struct netdev *netdev,
1581 enum netdev_features advertise)
1583 struct ethtool_cmd ecmd;
1586 memset(&ecmd, 0, sizeof ecmd);
1587 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1588 ETHTOOL_GSET, "ETHTOOL_GSET");
1593 ecmd.advertising = 0;
1594 if (advertise & NETDEV_F_10MB_HD) {
1595 ecmd.advertising |= ADVERTISED_10baseT_Half;
1597 if (advertise & NETDEV_F_10MB_FD) {
1598 ecmd.advertising |= ADVERTISED_10baseT_Full;
1600 if (advertise & NETDEV_F_100MB_HD) {
1601 ecmd.advertising |= ADVERTISED_100baseT_Half;
1603 if (advertise & NETDEV_F_100MB_FD) {
1604 ecmd.advertising |= ADVERTISED_100baseT_Full;
1606 if (advertise & NETDEV_F_1GB_HD) {
1607 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1609 if (advertise & NETDEV_F_1GB_FD) {
1610 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1612 if (advertise & NETDEV_F_10GB_FD) {
1613 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1615 if (advertise & NETDEV_F_COPPER) {
1616 ecmd.advertising |= ADVERTISED_TP;
1618 if (advertise & NETDEV_F_FIBER) {
1619 ecmd.advertising |= ADVERTISED_FIBRE;
1621 if (advertise & NETDEV_F_AUTONEG) {
1622 ecmd.advertising |= ADVERTISED_Autoneg;
1624 if (advertise & NETDEV_F_PAUSE) {
1625 ecmd.advertising |= ADVERTISED_Pause;
1627 if (advertise & NETDEV_F_PAUSE_ASYM) {
1628 ecmd.advertising |= ADVERTISED_Asym_Pause;
1630 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1631 ETHTOOL_SSET, "ETHTOOL_SSET");
1634 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1635 * successful, otherwise a positive errno value.
 *
 * 'kbits_burst' is forced to 0 when 'kbits_rate' is 0 and defaults to 1000
 * when given as 0.  A request matching the cached rate and burst
 * (VALID_POLICING) is treated as already applied.  Otherwise any existing
 * ingress qdisc is removed, a new one and a policer are added, and the new
 * values are cached. */
1637 netdev_linux_set_policing(struct netdev *netdev,
1638 uint32_t kbits_rate, uint32_t kbits_burst)
1640 struct netdev_dev_linux *netdev_dev =
1641 netdev_dev_linux_cast(netdev_get_dev(netdev));
1642 const char *netdev_name = netdev_get_name(netdev);
1646 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1647 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1648 : kbits_burst); /* Stick with user-specified value. */
1650 if (netdev_dev->cache_valid & VALID_POLICING
1651 && netdev_dev->kbits_rate == kbits_rate
1652 && netdev_dev->kbits_burst == kbits_burst) {
1653 /* Assume that settings haven't changed since we last set them. */
1657 COVERAGE_INC(netdev_set_policing);
1658 /* Remove any existing ingress qdisc. */
1659 error = tc_add_del_ingress_qdisc(netdev, false);
1661 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1662 netdev_name, strerror(error));
1667 error = tc_add_del_ingress_qdisc(netdev, true);
1669 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1670 netdev_name, strerror(error));
1674 error = tc_add_policer(netdev, kbits_rate, kbits_burst);
1676 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1677 netdev_name, strerror(error));
/* The cache is updated only after the tc operations above. */
1682 netdev_dev->kbits_rate = kbits_rate;
1683 netdev_dev->kbits_burst = kbits_burst;
1684 netdev_dev->cache_valid |= VALID_POLICING;
/* Adds to 'types' the OVS name of every entry in 'tcs' that has a
 * 'tc_install' callback and a non-empty 'ovs_name'.  'netdev' is unused. */
1690 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1693 const struct tc_ops **opsp;
1695 for (opsp = tcs; *opsp != NULL; opsp++) {
1696 const struct tc_ops *ops = *opsp;
1697 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1698 sset_add(types, ops->ovs_name);
/* Searches the NULL-terminated 'tcs' array for the entry whose 'ovs_name'
 * equals 'name'.  (The return statements are not visible in this excerpt;
 * callers test the result against NULL.) */
1704 static const struct tc_ops *
1705 tc_lookup_ovs_name(const char *name)
1707 const struct tc_ops **opsp;
1709 for (opsp = tcs; *opsp != NULL; opsp++) {
1710 const struct tc_ops *ops = *opsp;
1711 if (!strcmp(name, ops->ovs_name)) {
/* Searches the NULL-terminated 'tcs' array for the entry whose 'linux_name'
 * equals 'name'.  Entries with a null 'linux_name' never match. */
1718 static const struct tc_ops *
1719 tc_lookup_linux_name(const char *name)
1721 const struct tc_ops **opsp;
1723 for (opsp = tcs; *opsp != NULL; opsp++) {
1724 const struct tc_ops *ops = *opsp;
1725 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Searches the hash bucket selected by 'hash' in the 'queues' map of
 * 'netdev''s traffic-control state for the queue numbered 'queue_id'.
 * Dereferences 'netdev_dev->tc' without a null check. */
1732 static struct tc_queue *
1733 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1736 struct netdev_dev_linux *netdev_dev =
1737 netdev_dev_linux_cast(netdev_get_dev(netdev));
1738 struct tc_queue *queue;
1740 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1741 if (queue->queue_id == queue_id) {
/* Looks up queue 'queue_id' on 'netdev', hashing the id with hash_int() and a
 * zero basis to select the bucket to search. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    const uint32_t bucket_hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, bucket_hash);
}
/* Reports the capabilities of QoS type 'type': 'caps->n_queues' is taken from
 * the matching tc_ops entry.  'netdev' is unused.
 * NOTE(review): handling of an unknown 'type' is not visible in this
 * excerpt. */
1755 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1757 struct netdev_qos_capabilities *caps)
1759 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1763 caps->n_queues = ops->n_queues;
/* Refreshes the qdisc state of 'netdev' with tc_query_qdisc(), stores the OVS
 * name of the installed traffic-control type in '*typep', and fills 'details'
 * through the type's 'qdisc_get' callback when it has one. */
1768 netdev_linux_get_qos(const struct netdev *netdev,
1769 const char **typep, struct shash *details)
1771 struct netdev_dev_linux *netdev_dev =
1772 netdev_dev_linux_cast(netdev_get_dev(netdev));
1775 error = tc_query_qdisc(netdev);
1780 *typep = netdev_dev->tc->ops->ovs_name;
1781 return (netdev_dev->tc->ops->qdisc_get
1782 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
/* Configures QoS type 'type' with 'details' on 'netdev'.
 *
 * 'type' must name a tc_ops entry that has a 'tc_install' callback.  When the
 * requested type is the one already installed, the request is passed to its
 * 'qdisc_set' callback (or 0 is returned when there is none).  Otherwise the
 * existing qdisc is deleted and the new one installed; the asserts check that
 * 'netdev_dev->tc' is NULL after deletion and non-NULL exactly when
 * installation succeeds. */
1787 netdev_linux_set_qos(struct netdev *netdev,
1788 const char *type, const struct shash *details)
1790 struct netdev_dev_linux *netdev_dev =
1791 netdev_dev_linux_cast(netdev_get_dev(netdev));
1792 const struct tc_ops *new_ops;
1795 new_ops = tc_lookup_ovs_name(type);
1796 if (!new_ops || !new_ops->tc_install) {
1800 error = tc_query_qdisc(netdev);
1805 if (new_ops == netdev_dev->tc->ops) {
1806 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1808 /* Delete existing qdisc. */
1809 error = tc_del_qdisc(netdev);
1813 assert(netdev_dev->tc == NULL);
1815 /* Install new qdisc. */
1816 error = new_ops->tc_install(netdev, details);
1817 assert((error == 0) == (netdev_dev->tc != NULL));
/* After refreshing qdisc state with tc_query_qdisc(), looks up queue
 * 'queue_id' on 'netdev' and fills 'details' through the installed type's
 * 'class_get' callback.
 * NOTE(review): the condition guarding the 'class_get' call and the fallback
 * value are not visible in this excerpt. */
1824 netdev_linux_get_queue(const struct netdev *netdev,
1825 unsigned int queue_id, struct shash *details)
1827 struct netdev_dev_linux *netdev_dev =
1828 netdev_dev_linux_cast(netdev_get_dev(netdev));
1831 error = tc_query_qdisc(netdev);
1835 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1837 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
/* Configures queue 'queue_id' on 'netdev' with 'details' through the installed
 * type's 'class_set' callback.  A 'queue_id' at or above the type's
 * 'n_queues', or a type without 'class_set', takes a separate branch whose
 * body is not visible in this excerpt. */
1843 netdev_linux_set_queue(struct netdev *netdev,
1844 unsigned int queue_id, const struct shash *details)
1846 struct netdev_dev_linux *netdev_dev =
1847 netdev_dev_linux_cast(netdev_get_dev(netdev));
1850 error = tc_query_qdisc(netdev);
1853 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1854 || !netdev_dev->tc->ops->class_set) {
1858 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
/* Deletes queue 'queue_id' from 'netdev' through the installed type's
 * 'class_delete' callback, after refreshing qdisc state with
 * tc_query_qdisc().  A type without 'class_delete' takes a separate branch
 * whose body is not visible in this excerpt. */
1862 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1864 struct netdev_dev_linux *netdev_dev =
1865 netdev_dev_linux_cast(netdev_get_dev(netdev));
1868 error = tc_query_qdisc(netdev);
1871 } else if (!netdev_dev->tc->ops->class_delete) {
1874 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1876 ? netdev_dev->tc->ops->class_delete(netdev, queue)
/* Retrieves statistics for queue 'queue_id' on 'netdev' into '*stats' through
 * the installed type's 'class_get_stats' callback, after refreshing qdisc
 * state with tc_query_qdisc().  A type without 'class_get_stats' takes a
 * separate branch whose body is not visible in this excerpt. */
1882 netdev_linux_get_queue_stats(const struct netdev *netdev,
1883 unsigned int queue_id,
1884 struct netdev_queue_stats *stats)
1886 struct netdev_dev_linux *netdev_dev =
1887 netdev_dev_linux_cast(netdev_get_dev(netdev));
1890 error = tc_query_qdisc(netdev);
1893 } else if (!netdev_dev->tc->ops->class_get_stats) {
1896 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1898 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
/* Starts a Netlink dump of the traffic classes of 'netdev' on 'rtnl_sock':
 * builds an RTM_GETTCLASS request with 'tcm_parent' 0, hands it to
 * nl_dump_start(), and releases the request buffer.  The caller in this file
 * treats a false result as failure. */
1904 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1906 struct ofpbuf request;
1907 struct tcmsg *tcmsg;
1909 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1913 tcmsg->tcm_parent = 0;
1914 nl_dump_start(dump, rtnl_sock, &request);
1915 ofpbuf_uninit(&request);
/* Iterates over every queue in 'netdev''s 'tc->queues' map, fetches its
 * configuration into a scratch 'details' table with the installed type's
 * 'class_get' callback, and passes the queue id, the details and 'aux' to
 * 'cb'.  'details' is cleared before each queue and destroyed at the end. */
1920 netdev_linux_dump_queues(const struct netdev *netdev,
1921 netdev_dump_queues_cb *cb, void *aux)
1923 struct netdev_dev_linux *netdev_dev =
1924 netdev_dev_linux_cast(netdev_get_dev(netdev));
1925 struct tc_queue *queue;
1926 struct shash details;
1930 error = tc_query_qdisc(netdev);
1933 } else if (!netdev_dev->tc->ops->class_get) {
1938 shash_init(&details);
1939 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
1940 shash_clear(&details);
1942 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
1944 (*cb)(queue->queue_id, &details, aux);
1949 shash_destroy(&details);
/* Dumps per-queue statistics for 'netdev': starts a class dump with
 * start_queue_dump() and hands each reply message, together with 'cb' and
 * 'aux', to the installed type's 'class_dump_stats' callback.
 *
 * The return value prefers an error from nl_dump_done() and otherwise
 * reports 'last_error'. */
1955 netdev_linux_dump_queue_stats(const struct netdev *netdev,
1956 netdev_dump_queue_stats_cb *cb, void *aux)
1958 struct netdev_dev_linux *netdev_dev =
1959 netdev_dev_linux_cast(netdev_get_dev(netdev));
1960 struct nl_dump dump;
1965 error = tc_query_qdisc(netdev);
1968 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1973 if (!start_queue_dump(netdev, &dump)) {
1976 while (nl_dump_next(&dump, &msg)) {
1977 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1983 error = nl_dump_done(&dump);
1984 return error ? error : last_error;
/* Stores the IPv4 address and netmask of 'netdev_' in '*address' and
 * '*netmask'.
 *
 * The values are cached in the device: when VALID_IN4 is not set they are
 * queried with SIOCGIFADDR and SIOCGIFNETMASK and the cache bit is set.
 * Returns EADDRNOTAVAIL when the address is INADDR_ANY, otherwise 0 (error
 * returns from the queries are not visible in this excerpt). */
1988 netdev_linux_get_in4(const struct netdev *netdev_,
1989 struct in_addr *address, struct in_addr *netmask)
1991 struct netdev_dev_linux *netdev_dev =
1992 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1994 if (!(netdev_dev->cache_valid & VALID_IN4)) {
1997 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
1998 SIOCGIFADDR, "SIOCGIFADDR");
2003 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
2004 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2009 netdev_dev->cache_valid |= VALID_IN4;
2011 *address = netdev_dev->address;
2012 *netmask = netdev_dev->netmask;
2013 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
/* Assigns IPv4 'address' and 'netmask' to 'netdev_'.
 *
 * The address is set with SIOCSIFADDR; the device's cached address and
 * netmask are then updated and VALID_IN4 is set.  The netmask is pushed to
 * the kernel with SIOCSIFNETMASK only when 'address' is not INADDR_ANY.
 * NOTE(review): the cached netmask is written before the SIOCSIFNETMASK call,
 * so a failure there would leave the cache ahead of the kernel. */
2017 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2018 struct in_addr netmask)
2020 struct netdev_dev_linux *netdev_dev =
2021 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2024 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2026 netdev_dev->cache_valid |= VALID_IN4;
2027 netdev_dev->address = address;
2028 netdev_dev->netmask = netmask;
2029 if (address.s_addr != INADDR_ANY) {
2030 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2031 "SIOCSIFNETMASK", netmask);
/* Parses one line of /proc/net/if_inet6.
 *
 * A line consists of 32 hex digits (the address), four hex fields that are
 * skipped, and an interface name of at most 16 characters.  On success stores
 * the 16 address bytes in '*in6' and the name in 'ifname' and returns true;
 * returns false if the line does not have that form. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *octets = in6->s6_addr;
    int n_converted;

#define X8 "%2"SCNx8
    n_converted = sscanf(line,
                         " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                         "%*x %*x %*x %*x %16s\n",
                         &octets[0], &octets[1], &octets[2], &octets[3],
                         &octets[4], &octets[5], &octets[6], &octets[7],
                         &octets[8], &octets[9], &octets[10], &octets[11],
                         &octets[12], &octets[13], &octets[14], &octets[15],
                         ifname);

    /* 16 address bytes plus the interface name. */
    return n_converted == 17;
}
2053 /* Stores the IPv6 address cached for 'netdev_' into '*in6'.  'in6' is
2054 * written unconditionally, so it must be non-null.
 *
 * When VALID_IN6 is not set, the cached address is first reset to
 * 'in6addr_any' and /proc/net/if_inet6 is scanned for a line whose interface
 * name equals the device name; a match replaces the cached address.  The
 * cache bit is then set.
 *
 * NOTE(review): the return statement is not visible in this excerpt.  The
 * previous wording ("returns true ... otherwise false", "if 'in6' is
 * non-null") did not match the visible code; confirm the return contract. */
2056 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2058 struct netdev_dev_linux *netdev_dev =
2059 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2060 if (!(netdev_dev->cache_valid & VALID_IN6)) {
2064 netdev_dev->in6 = in6addr_any;
2066 file = fopen("/proc/net/if_inet6", "r");
2068 const char *name = netdev_get_name(netdev_);
2069 while (fgets(line, sizeof line, file)) {
2070 struct in6_addr in6_tmp;
2071 char ifname[16 + 1];
2072 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2073 && !strcmp(name, ifname))
2075 netdev_dev->in6 = in6_tmp;
2081 netdev_dev->cache_valid |= VALID_IN6;
2083 *in6 = netdev_dev->in6;
/* Initializes '*sa' as an AF_INET socket address holding 'addr' with port 0.
 *
 * The address is assembled in a zeroed 'struct sockaddr_in' and then copied
 * over a zeroed '*sa', so no stale bytes survive in either structure. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    memset(&in4, 0, sizeof in4);
    in4.sin_family = AF_INET;
    in4.sin_port = 0;
    in4.sin_addr = addr;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &in4, sizeof in4);
}
/* Issues address-setting ioctl 'ioctl_nr' (named 'ioctl_name' for logging) on
 * 'netdev', passing 'addr' as an AF_INET socket address in 'ifr.ifr_addr'.
 * Returns the result of netdev_linux_do_ioctl(). */
2101 do_set_addr(struct netdev *netdev,
2102 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2105 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2106 make_in4_sockaddr(&ifr.ifr_addr, addr);
2108 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2112 /* Adds 'router' as a default IP gateway.
 *
 * The route has destination and genmask INADDR_ANY, gateway 'router' and
 * flags RTF_UP | RTF_GATEWAY, and is installed with SIOCADDRT on
 * 'af_inet_sock'.  A failure is logged.  'netdev' is unused. */
2114 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2116 struct in_addr any = { INADDR_ANY };
2120 memset(&rt, 0, sizeof rt);
2121 make_in4_sockaddr(&rt.rt_dst, any);
2122 make_in4_sockaddr(&rt.rt_gateway, router);
2123 make_in4_sockaddr(&rt.rt_genmask, any);
2124 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2125 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2127 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
2133 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2136 static const char fn[] = "/proc/net/route";
2141 *netdev_name = NULL;
2142 stream = fopen(fn, "r");
2143 if (stream == NULL) {
2144 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2149 while (fgets(line, sizeof line, stream)) {
2152 ovs_be32 dest, gateway, mask;
2153 int refcnt, metric, mtu;
2154 unsigned int flags, use, window, irtt;
2157 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2159 iface, &dest, &gateway, &flags, &refcnt,
2160 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2162 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2166 if (!(flags & RTF_UP)) {
2167 /* Skip routes that aren't up. */
2171 /* The output of 'dest', 'mask', and 'gateway' were given in
2172 * network byte order, so we don't need need any endian
2173 * conversions here. */
2174 if ((dest & mask) == (host->s_addr & mask)) {
2176 /* The host is directly reachable. */
2177 next_hop->s_addr = 0;
2179 /* To reach the host, we must go through a gateway. */
2180 next_hop->s_addr = gateway;
2182 *netdev_name = xstrdup(iface);
/* Adds driver information for 'netdev' to 'sh': the driver name, driver
 * version and firmware version obtained by netdev_linux_get_drvinfo(), each
 * stored as a copy made with xstrdup(). */
2194 netdev_linux_get_status(const struct netdev *netdev, struct shash *sh)
2197 struct netdev_dev_linux *netdev_dev =
2198 netdev_dev_linux_cast(netdev_get_dev(netdev));
2200 error = netdev_linux_get_drvinfo(netdev_dev);
2202 shash_add(sh, "driver_name", xstrdup(netdev_dev->drvinfo.driver));
2203 shash_add(sh, "driver_version", xstrdup(netdev_dev->drvinfo.version));
2204 shash_add(sh, "firmware_version", xstrdup(netdev_dev->drvinfo.fw_version));
/* Adds a fixed "driver_name" of "openvswitch" to 'sh'.  'netdev' is unused. */
2210 netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED, struct shash *sh)
2212 shash_add(sh, "driver_name", xstrdup("openvswitch"));
2216 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2217 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2218 * returns 0. Otherwise, it returns a positive errno value; in particular,
2219 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'.
 *
 * The lookup is a SIOCGARP ioctl on 'af_inet_sock'.  Failures other than
 * ENXIO are logged. */
2221 netdev_linux_arp_lookup(const struct netdev *netdev,
2222 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2225 struct sockaddr_in sin;
2228 memset(&r, 0, sizeof r);
2229 memset(&sin, 0, sizeof sin);
2230 sin.sin_family = AF_INET;
2231 sin.sin_addr.s_addr = ip;
2233 memcpy(&r.arp_pa, &sin, sizeof sin);
2234 r.arp_ha.sa_family = ARPHRD_ETHER;
2236 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2237 COVERAGE_INC(netdev_arp_lookup);
2238 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2240 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2241 } else if (retval != ENXIO) {
2242 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2243 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
/* Converts netdev flags 'nd' to interface flags, considering NETDEV_UP and
 * NETDEV_PROMISC.  (The statements that build the result are not visible in
 * this excerpt; presumably they set the matching IFF_* bits.) */
2249 nd_to_iff_flags(enum netdev_flags nd)
2252 if (nd & NETDEV_UP) {
2255 if (nd & NETDEV_PROMISC) {
/* Converts interface flags 'iff' to netdev flags, starting from 0.
 * IFF_PROMISC maps to NETDEV_PROMISC; other mappings are not visible in this
 * excerpt. */
2262 iff_to_nd_flags(int iff)
2264 enum netdev_flags nd = 0;
2268 if (iff & IFF_PROMISC) {
2269 nd |= NETDEV_PROMISC;
/* Clears the 'off' flags and sets the 'on' flags on 'netdev', storing the
 * previous flags (converted to netdev flags) in '*old_flagsp'.
 *
 * The starting point is the cached 'ifi_flags'.  The kernel is touched only
 * when the computed flags differ from the cached ones; in that case the
 * cached flags are re-read with get_flags() after set_flags(). */
2275 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2276 enum netdev_flags on, enum netdev_flags *old_flagsp)
2278 struct netdev_dev_linux *netdev_dev;
2279 int old_flags, new_flags;
2282 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2283 old_flags = netdev_dev->ifi_flags;
2284 *old_flagsp = iff_to_nd_flags(old_flags);
2285 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2286 if (new_flags != old_flags) {
2287 error = set_flags(netdev, new_flags);
2288 get_flags(&netdev_dev->netdev_dev, &netdev_dev->ifi_flags);
2294 netdev_linux_change_seq(const struct netdev *netdev)
2296 return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
/* Expands to the initializer shared by the netdev classes defined in this
 * file.  The classes differ only in the macro arguments (NAME, CREATE,
 * GET_STATS, SET_STATS, ...); every other slot is filled with the common
 * netdev_linux_* implementation listed here. */
2299 #define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS, \
2304 netdev_linux_init, \
2306 netdev_linux_wait, \
2309 netdev_linux_destroy, \
2310 NULL, /* get_config */ \
2311 NULL, /* set_config */ \
2313 netdev_linux_open, \
2314 netdev_linux_close, \
2316 netdev_linux_listen, \
2317 netdev_linux_recv, \
2318 netdev_linux_recv_wait, \
2319 netdev_linux_drain, \
2321 netdev_linux_send, \
2322 netdev_linux_send_wait, \
2324 netdev_linux_set_etheraddr, \
2325 netdev_linux_get_etheraddr, \
2326 netdev_linux_get_mtu, \
2327 netdev_linux_set_mtu, \
2328 netdev_linux_get_ifindex, \
2329 netdev_linux_get_carrier, \
2330 netdev_linux_get_carrier_resets, \
2331 netdev_linux_set_miimon_interval, \
2335 netdev_linux_get_features, \
2336 netdev_linux_set_advertisements, \
2338 netdev_linux_set_policing, \
2339 netdev_linux_get_qos_types, \
2340 netdev_linux_get_qos_capabilities, \
2341 netdev_linux_get_qos, \
2342 netdev_linux_set_qos, \
2343 netdev_linux_get_queue, \
2344 netdev_linux_set_queue, \
2345 netdev_linux_delete_queue, \
2346 netdev_linux_get_queue_stats, \
2347 netdev_linux_dump_queues, \
2348 netdev_linux_dump_queue_stats, \
2350 netdev_linux_get_in4, \
2351 netdev_linux_set_in4, \
2352 netdev_linux_get_in6, \
2353 netdev_linux_add_router, \
2354 netdev_linux_get_next_hop, \
2356 netdev_linux_arp_lookup, \
2358 netdev_linux_update_flags, \
2360 netdev_linux_change_seq \
/* Netdev class whose devices are created by netdev_linux_create(), report
 * statistics through netdev_linux_get_stats(), have no 'set_stats' hook, and
 * report status through netdev_linux_get_status(). */
2363 const struct netdev_class netdev_linux_class =
2366 netdev_linux_create,
2367 netdev_linux_get_stats,
2368 NULL, /* set_stats */
2369 netdev_linux_get_status);
/* Netdev class whose devices are created by netdev_linux_create_tap(), report
 * statistics through netdev_tap_get_stats(), have no 'set_stats' hook, and
 * report status through netdev_linux_get_status(). */
2371 const struct netdev_class netdev_tap_class =
2374 netdev_linux_create_tap,
2375 netdev_tap_get_stats,
2376 NULL, /* set_stats */
2377 netdev_linux_get_status);
/* Netdev class whose devices are created by netdev_linux_create(), report
 * statistics through netdev_internal_get_stats() (vport layer only), accept
 * statistics through netdev_vport_set_stats(), and report status through
 * netdev_internal_get_status(). */
2379 const struct netdev_class netdev_internal_class =
2382 netdev_linux_create,
2383 netdev_internal_get_stats,
2384 netdev_vport_set_stats,
2385 netdev_internal_get_status);
2387 /* HTB traffic control class. */
/* Largest class minor number that htb_parse_tcmsg__() accepts as a queue
 * (queue ids are the minor number minus one). */
2389 #define HTB_N_QUEUES 0xf000
/* Qdisc-wide maximum rate, used by htb_parse_class_details__() as the upper
 * bound for per-class rates. */
2393 unsigned int max_rate; /* In bytes/s. */
/* Per-class (per-queue) HTB parameters. */
2397 struct tc_queue tc_queue;
2398 unsigned int min_rate; /* In bytes/s. */
2399 unsigned int max_rate; /* In bytes/s. */
2400 unsigned int burst; /* In bytes. */
2401 unsigned int priority; /* Lower values are higher priorities. */
2405 htb_get__(const struct netdev *netdev)
2407 struct netdev_dev_linux *netdev_dev =
2408 netdev_dev_linux_cast(netdev_get_dev(netdev));
2409 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
/* Allocates a 'struct htb' with xmalloc(), initializes its embedded 'tc' for
 * 'tc_ops_htb', records 'max_rate', and attaches it to 'netdev' as the
 * device's traffic-control state.
 * NOTE(review): 'max_rate' is a uint64_t here, while the 'max_rate' fields
 * visible in this file are 'unsigned int'; confirm that large values cannot
 * be truncated. */
2413 htb_install__(struct netdev *netdev, uint64_t max_rate)
2415 struct netdev_dev_linux *netdev_dev =
2416 netdev_dev_linux_cast(netdev_get_dev(netdev));
2419 htb = xmalloc(sizeof *htb);
2420 tc_init(&htb->tc, &tc_ops_htb);
2421 htb->max_rate = max_rate;
2423 netdev_dev->tc = &htb->tc;
2426 /* Create an HTB qdisc.
 *
 * Any existing qdisc is deleted first.  The new qdisc is requested as the
 * root qdisc with handle 1:0 and 'rate2quantum' 10, and the result of the
 * Netlink transaction is returned.
 *
2428 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2430 htb_setup_qdisc__(struct netdev *netdev)
2433 struct tc_htb_glob opt;
2434 struct ofpbuf request;
2435 struct tcmsg *tcmsg;
2437 tc_del_qdisc(netdev);
2439 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2440 NLM_F_EXCL | NLM_F_CREATE, &request);
2444 tcmsg->tcm_handle = tc_make_handle(1, 0);
2445 tcmsg->tcm_parent = TC_H_ROOT;
2447 nl_msg_put_string(&request, TCA_KIND, "htb");
2449 memset(&opt, 0, sizeof opt);
2450 opt.rate2quantum = 10;
2454 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2455 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2456 nl_msg_end_nested(&request, opt_offset);
2458 return tc_transact(&request, NULL);
2461 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2462 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>".
 *
 * The device's MTU is needed to fill in the rate tables and buffer sizes, so
 * a device whose MTU cannot be obtained is rejected with a warning.  A failed
 * Netlink transaction is logged together with all class parameters. */
2464 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2465 unsigned int parent, struct htb_class *class)
2468 struct tc_htb_opt opt;
2469 struct ofpbuf request;
2470 struct tcmsg *tcmsg;
2474 error = netdev_get_mtu(netdev, &mtu);
2476 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2477 netdev_get_name(netdev));
2481 memset(&opt, 0, sizeof opt);
2482 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2483 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2484 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2485 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2486 opt.prio = class->priority;
2488 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2492 tcmsg->tcm_handle = handle;
2493 tcmsg->tcm_parent = parent;
2495 nl_msg_put_string(&request, TCA_KIND, "htb");
2496 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2497 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2498 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2499 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2500 nl_msg_end_nested(&request, opt_offset);
2502 error = tc_transact(&request, NULL);
2504 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2505 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2506 netdev_get_name(netdev),
2507 tc_get_major(handle), tc_get_minor(handle),
2508 tc_get_major(parent), tc_get_minor(parent),
2509 class->min_rate, class->max_rate,
2510 class->burst, class->priority, strerror(error));
/* Parses Netlink attributes in 'nl_options' for HTB parameters and stores a
 * description of them into 'class'.  The description complies with the
 * specification given in the vswitch database documentation for linux-htb
 * queue details. */
/* Extracts the mandatory TCA_HTB_PARMS attribute (at least a 'struct
 * tc_htb_opt' in size) from 'nl_options' and copies its rate, ceiling and
 * priority into 'class'; the burst is derived from the rate and buffer with
 * tc_ticks_to_bytes().  A parse failure is logged. */
2520 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2522 static const struct nl_policy tca_htb_policy[] = {
2523 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2524 .min_len = sizeof(struct tc_htb_opt) },
2527 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2528 const struct tc_htb_opt *htb;
2530 if (!nl_parse_nested(nl_options, tca_htb_policy,
2531 attrs, ARRAY_SIZE(tca_htb_policy))) {
2532 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2536 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2537 class->min_rate = htb->rate.rate;
2538 class->max_rate = htb->ceil.rate;
2539 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2540 class->priority = htb->prio;
/* Parses a traffic-class Netlink message 'tcmsg' with tc_parse_class().
 *
 * Each output is optional.  When 'queue_id' is non-null, a class handle with
 * major number 1 and a minor number in 1...HTB_N_QUEUES yields queue id
 * 'minor - 1' (other handles are handled on lines not visible in this
 * excerpt).  When 'options' is non-null, the HTB options are decoded into it
 * with htb_parse_tca_options__().  'stats' is passed through to
 * tc_parse_class(). */
2545 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2546 struct htb_class *options,
2547 struct netdev_queue_stats *stats)
2549 struct nlattr *nl_options;
2550 unsigned int handle;
2553 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2554 if (!error && queue_id) {
2555 unsigned int major = tc_get_major(handle);
2556 unsigned int minor = tc_get_minor(handle);
2557 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2558 *queue_id = minor - 1;
2563 if (!error && options) {
2564 error = htb_parse_tca_options__(nl_options, options);
2570 htb_parse_qdisc_details__(struct netdev *netdev,
2571 const struct shash *details, struct htb_class *hc)
2573 const char *max_rate_s;
2575 max_rate_s = shash_find_data(details, "max-rate");
2576 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2577 if (!hc->max_rate) {
2580 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2581 hc->max_rate = netdev_features_to_bps(current) / 8;
2583 hc->min_rate = hc->max_rate;
2589 htb_parse_class_details__(struct netdev *netdev,
2590 const struct shash *details, struct htb_class *hc)
2592 const struct htb *htb = htb_get__(netdev);
2593 const char *min_rate_s = shash_find_data(details, "min-rate");
2594 const char *max_rate_s = shash_find_data(details, "max-rate");
2595 const char *burst_s = shash_find_data(details, "burst");
2596 const char *priority_s = shash_find_data(details, "priority");
2599 error = netdev_get_mtu(netdev, &mtu);
2601 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2602 netdev_get_name(netdev));
2606 /* HTB requires at least an mtu sized min-rate to send any traffic even
2607 * on uncongested links. */
2608 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2609 hc->min_rate = MAX(hc->min_rate, mtu);
2610 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2613 hc->max_rate = (max_rate_s
2614 ? strtoull(max_rate_s, NULL, 10) / 8
2616 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2617 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2621 * According to hints in the documentation that I've read, it is important
2622 * that 'burst' be at least as big as the largest frame that might be
2623 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2624 * but having it a bit too small is a problem. Since netdev_get_mtu()
2625 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2626 * the MTU. We actually add 64, instead of 14, as a guard against
2627 * additional headers get tacked on somewhere that we're not aware of. */
2628 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2629 hc->burst = MAX(hc->burst, mtu + 64);
2632 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for the class 'handle' (with parent 'parent') on
 * 'netdev' and parses the reply into 'options' and 'stats', as
 * htb_parse_tcmsg__() does.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
htb_query_class__(const struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error;

    error = tc_query_class(netdev, handle, parent, &reply);
    if (error) {
        return error;
    }

    error = htb_parse_tcmsg__(reply, NULL, options, stats);
    ofpbuf_delete(reply);
    return error;
}
2654 htb_tc_install(struct netdev *netdev, const struct shash *details)
2658 error = htb_setup_qdisc__(netdev);
2660 struct htb_class hc;
2662 htb_parse_qdisc_details__(netdev, details, &hc);
2663 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2664 tc_make_handle(1, 0), &hc);
2666 htb_install__(netdev, hc.max_rate);
2672 static struct htb_class *
2673 htb_class_cast__(const struct tc_queue *queue)
2675 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2679 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2680 const struct htb_class *hc)
2682 struct htb *htb = htb_get__(netdev);
2683 size_t hash = hash_int(queue_id, 0);
2684 struct tc_queue *queue;
2685 struct htb_class *hcp;
2687 queue = tc_find_queue__(netdev, queue_id, hash);
2689 hcp = htb_class_cast__(queue);
2691 hcp = xmalloc(sizeof *hcp);
2692 queue = &hcp->tc_queue;
2693 queue->queue_id = queue_id;
2694 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2697 hcp->min_rate = hc->min_rate;
2698 hcp->max_rate = hc->max_rate;
2699 hcp->burst = hc->burst;
2700 hcp->priority = hc->priority;
2704 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2707 struct nl_dump dump;
2708 struct htb_class hc;
2710 /* Get qdisc options. */
2712 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2713 htb_install__(netdev, hc.max_rate);
2716 if (!start_queue_dump(netdev, &dump)) {
2719 while (nl_dump_next(&dump, &msg)) {
2720 unsigned int queue_id;
2722 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2723 htb_update_queue__(netdev, queue_id, &hc);
2726 nl_dump_done(&dump);
2732 htb_tc_destroy(struct tc *tc)
2734 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2735 struct htb_class *hc, *next;
2737 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2738 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2746 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2748 const struct htb *htb = htb_get__(netdev);
2749 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
2754 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2756 struct htb_class hc;
2759 htb_parse_qdisc_details__(netdev, details, &hc);
2760 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2761 tc_make_handle(1, 0), &hc);
2763 htb_get__(netdev)->max_rate = hc.max_rate;
2769 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2770 const struct tc_queue *queue, struct shash *details)
2772 const struct htb_class *hc = htb_class_cast__(queue);
2774 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2775 if (hc->min_rate != hc->max_rate) {
2776 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2778 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2780 shash_add(details, "priority", xasprintf("%u", hc->priority));
2786 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2787 const struct shash *details)
2789 struct htb_class hc;
2792 error = htb_parse_class_details__(netdev, details, &hc);
2797 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2798 tc_make_handle(1, 0xfffe), &hc);
2803 htb_update_queue__(netdev, queue_id, &hc);
2808 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2810 struct htb_class *hc = htb_class_cast__(queue);
2811 struct htb *htb = htb_get__(netdev);
2814 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2816 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2823 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2824 struct netdev_queue_stats *stats)
2826 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2827 tc_make_handle(1, 0xfffe), NULL, stats);
2831 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2832 const struct ofpbuf *nlmsg,
2833 netdev_dump_queue_stats_cb *cb, void *aux)
2835 struct netdev_queue_stats stats;
2836 unsigned int handle, major, minor;
2839 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2844 major = tc_get_major(handle);
2845 minor = tc_get_minor(handle);
2846 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2847 (*cb)(minor - 1, &stats, aux);
2852 static const struct tc_ops tc_ops_htb = {
2853 "htb", /* linux_name */
2854 "linux-htb", /* ovs_name */
2855 HTB_N_QUEUES, /* n_queues */
2864 htb_class_get_stats,
2865 htb_class_dump_stats
2868 /* "linux-hfsc" traffic control class. */
2870 #define HFSC_N_QUEUES 0xf000
2878 struct tc_queue tc_queue;
2883 static struct hfsc *
2884 hfsc_get__(const struct netdev *netdev)
2886 struct netdev_dev_linux *netdev_dev;
2887 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2888 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
2891 static struct hfsc_class *
2892 hfsc_class_cast__(const struct tc_queue *queue)
2894 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
2898 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2900 struct netdev_dev_linux * netdev_dev;
2903 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2904 hfsc = xmalloc(sizeof *hfsc);
2905 tc_init(&hfsc->tc, &tc_ops_hfsc);
2906 hfsc->max_rate = max_rate;
2907 netdev_dev->tc = &hfsc->tc;
2911 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2912 const struct hfsc_class *hc)
2916 struct hfsc_class *hcp;
2917 struct tc_queue *queue;
2919 hfsc = hfsc_get__(netdev);
2920 hash = hash_int(queue_id, 0);
2922 queue = tc_find_queue__(netdev, queue_id, hash);
2924 hcp = hfsc_class_cast__(queue);
2926 hcp = xmalloc(sizeof *hcp);
2927 queue = &hcp->tc_queue;
2928 queue->queue_id = queue_id;
2929 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
2932 hcp->min_rate = hc->min_rate;
2933 hcp->max_rate = hc->max_rate;
2937 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2939 const struct tc_service_curve *rsc, *fsc, *usc;
2940 static const struct nl_policy tca_hfsc_policy[] = {
2942 .type = NL_A_UNSPEC,
2944 .min_len = sizeof(struct tc_service_curve),
2947 .type = NL_A_UNSPEC,
2949 .min_len = sizeof(struct tc_service_curve),
2952 .type = NL_A_UNSPEC,
2954 .min_len = sizeof(struct tc_service_curve),
2957 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
2959 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
2960 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
2961 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
2965 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
2966 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
2967 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
2969 if (rsc->m1 != 0 || rsc->d != 0 ||
2970 fsc->m1 != 0 || fsc->d != 0 ||
2971 usc->m1 != 0 || usc->d != 0) {
2972 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2973 "Non-linear service curves are not supported.");
2977 if (rsc->m2 != fsc->m2) {
2978 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2979 "Real-time service curves are not supported ");
2983 if (rsc->m2 > usc->m2) {
2984 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2985 "Min-rate service curve is greater than "
2986 "the max-rate service curve.");
2990 class->min_rate = fsc->m2;
2991 class->max_rate = usc->m2;
2996 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2997 struct hfsc_class *options,
2998 struct netdev_queue_stats *stats)
3001 unsigned int handle;
3002 struct nlattr *nl_options;
3004 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3010 unsigned int major, minor;
3012 major = tc_get_major(handle);
3013 minor = tc_get_minor(handle);
3014 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3015 *queue_id = minor - 1;
3022 error = hfsc_parse_tca_options__(nl_options, options);
/* Queries the kernel for the class 'handle' (with parent 'parent') on
 * 'netdev' and parses the reply into 'options' and 'stats', as
 * hfsc_parse_tcmsg__() does.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error;

    error = tc_query_class(netdev, handle, parent, &reply);
    if (!error) {
        error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
        ofpbuf_delete(reply);
    }
    return error;
}
3047 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
3048 struct hfsc_class *class)
3051 const char *max_rate_s;
3053 max_rate_s = shash_find_data(details, "max-rate");
3054 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3059 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
3060 max_rate = netdev_features_to_bps(current) / 8;
3063 class->min_rate = max_rate;
3064 class->max_rate = max_rate;
3068 hfsc_parse_class_details__(struct netdev *netdev,
3069 const struct shash *details,
3070 struct hfsc_class * class)
3072 const struct hfsc *hfsc;
3073 uint32_t min_rate, max_rate;
3074 const char *min_rate_s, *max_rate_s;
3076 hfsc = hfsc_get__(netdev);
3077 min_rate_s = shash_find_data(details, "min-rate");
3078 max_rate_s = shash_find_data(details, "max-rate");
3080 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3081 min_rate = MAX(min_rate, 1);
3082 min_rate = MIN(min_rate, hfsc->max_rate);
3084 max_rate = (max_rate_s
3085 ? strtoull(max_rate_s, NULL, 10) / 8
3087 max_rate = MAX(max_rate, min_rate);
3088 max_rate = MIN(max_rate, hfsc->max_rate);
3090 class->min_rate = min_rate;
3091 class->max_rate = max_rate;
3096 /* Create an HFSC qdisc.
3098 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3100 hfsc_setup_qdisc__(struct netdev * netdev)
3102 struct tcmsg *tcmsg;
3103 struct ofpbuf request;
3104 struct tc_hfsc_qopt opt;
3106 tc_del_qdisc(netdev);
3108 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3109 NLM_F_EXCL | NLM_F_CREATE, &request);
3115 tcmsg->tcm_handle = tc_make_handle(1, 0);
3116 tcmsg->tcm_parent = TC_H_ROOT;
3118 memset(&opt, 0, sizeof opt);
3121 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3122 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3124 return tc_transact(&request, NULL);
3127 /* Create an HFSC class.
3129 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3130 * sc rate <min_rate> ul rate <max_rate>" */
3132 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3133 unsigned int parent, struct hfsc_class *class)
3137 struct tcmsg *tcmsg;
3138 struct ofpbuf request;
3139 struct tc_service_curve min, max;
3141 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3147 tcmsg->tcm_handle = handle;
3148 tcmsg->tcm_parent = parent;
3152 min.m2 = class->min_rate;
3156 max.m2 = class->max_rate;
3158 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3159 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3160 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3161 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3162 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3163 nl_msg_end_nested(&request, opt_offset);
3165 error = tc_transact(&request, NULL);
3167 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3168 "min-rate %ubps, max-rate %ubps (%s)",
3169 netdev_get_name(netdev),
3170 tc_get_major(handle), tc_get_minor(handle),
3171 tc_get_major(parent), tc_get_minor(parent),
3172 class->min_rate, class->max_rate, strerror(error));
3179 hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3182 struct hfsc_class class;
3184 error = hfsc_setup_qdisc__(netdev);
3190 hfsc_parse_qdisc_details__(netdev, details, &class);
3191 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3192 tc_make_handle(1, 0), &class);
3198 hfsc_install__(netdev, class.max_rate);
3203 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3206 struct nl_dump dump;
3207 struct hfsc_class hc;
3210 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3211 hfsc_install__(netdev, hc.max_rate);
3213 if (!start_queue_dump(netdev, &dump)) {
3217 while (nl_dump_next(&dump, &msg)) {
3218 unsigned int queue_id;
3220 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3221 hfsc_update_queue__(netdev, queue_id, &hc);
3225 nl_dump_done(&dump);
3230 hfsc_tc_destroy(struct tc *tc)
3233 struct hfsc_class *hc, *next;
3235 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3237 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3238 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3247 hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3249 const struct hfsc *hfsc;
3250 hfsc = hfsc_get__(netdev);
3251 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
3256 hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3259 struct hfsc_class class;
3261 hfsc_parse_qdisc_details__(netdev, details, &class);
3262 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3263 tc_make_handle(1, 0), &class);
3266 hfsc_get__(netdev)->max_rate = class.max_rate;
3273 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3274 const struct tc_queue *queue, struct shash *details)
3276 const struct hfsc_class *hc;
3278 hc = hfsc_class_cast__(queue);
3279 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3280 if (hc->min_rate != hc->max_rate) {
3281 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
3287 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3288 const struct shash *details)
3291 struct hfsc_class class;
3293 error = hfsc_parse_class_details__(netdev, details, &class);
3298 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3299 tc_make_handle(1, 0xfffe), &class);
3304 hfsc_update_queue__(netdev, queue_id, &class);
3309 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3313 struct hfsc_class *hc;
3315 hc = hfsc_class_cast__(queue);
3316 hfsc = hfsc_get__(netdev);
3318 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3320 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3327 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3328 struct netdev_queue_stats *stats)
3330 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3331 tc_make_handle(1, 0xfffe), NULL, stats);
3335 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3336 const struct ofpbuf *nlmsg,
3337 netdev_dump_queue_stats_cb *cb, void *aux)
3339 struct netdev_queue_stats stats;
3340 unsigned int handle, major, minor;
3343 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3348 major = tc_get_major(handle);
3349 minor = tc_get_minor(handle);
3350 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3351 (*cb)(minor - 1, &stats, aux);
3356 static const struct tc_ops tc_ops_hfsc = {
3357 "hfsc", /* linux_name */
3358 "linux-hfsc", /* ovs_name */
3359 HFSC_N_QUEUES, /* n_queues */
3360 hfsc_tc_install, /* tc_install */
3361 hfsc_tc_load, /* tc_load */
3362 hfsc_tc_destroy, /* tc_destroy */
3363 hfsc_qdisc_get, /* qdisc_get */
3364 hfsc_qdisc_set, /* qdisc_set */
3365 hfsc_class_get, /* class_get */
3366 hfsc_class_set, /* class_set */
3367 hfsc_class_delete, /* class_delete */
3368 hfsc_class_get_stats, /* class_get_stats */
3369 hfsc_class_dump_stats /* class_dump_stats */
3372 /* "linux-default" traffic control class.
3374 * This class represents the default, unnamed Linux qdisc. It corresponds to
3375 * the "" (empty string) QoS type in the OVS database. */
3378 default_install__(struct netdev *netdev)
3380 struct netdev_dev_linux *netdev_dev =
3381 netdev_dev_linux_cast(netdev_get_dev(netdev));
3382 static struct tc *tc;
3385 tc = xmalloc(sizeof *tc);
3386 tc_init(tc, &tc_ops_default);
3388 netdev_dev->tc = tc;
3392 default_tc_install(struct netdev *netdev,
3393 const struct shash *details OVS_UNUSED)
3395 default_install__(netdev);
3400 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3402 default_install__(netdev);
3406 static const struct tc_ops tc_ops_default = {
3407 NULL, /* linux_name */
3412 NULL, /* tc_destroy */
3413 NULL, /* qdisc_get */
3414 NULL, /* qdisc_set */
3415 NULL, /* class_get */
3416 NULL, /* class_set */
3417 NULL, /* class_delete */
3418 NULL, /* class_get_stats */
3419 NULL /* class_dump_stats */
3422 /* "linux-other" traffic control class.
3427 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3429 struct netdev_dev_linux *netdev_dev =
3430 netdev_dev_linux_cast(netdev_get_dev(netdev));
3431 static struct tc *tc;
3434 tc = xmalloc(sizeof *tc);
3435 tc_init(tc, &tc_ops_other);
3437 netdev_dev->tc = tc;
3441 static const struct tc_ops tc_ops_other = {
3442 NULL, /* linux_name */
3443 "linux-other", /* ovs_name */
3445 NULL, /* tc_install */
3447 NULL, /* tc_destroy */
3448 NULL, /* qdisc_get */
3449 NULL, /* qdisc_set */
3450 NULL, /* class_get */
3451 NULL, /* class_set */
3452 NULL, /* class_delete */
3453 NULL, /* class_get_stats */
3454 NULL /* class_dump_stats */
/* Traffic control. */

/* Number of kernel "tc" ticks per second. */
static double ticks_per_s;

/* Number of kernel "jiffies" per second.  This is used for the purpose of
 * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
 * one jiffy's worth of data.
 *
 * There are two possibilities here:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
 *      approximate range of 100 to 1024.  That means that we really need to
 *      make sure that the qdisc can buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
 *      has finely granular timers and there's no need to fudge additional room
 *      for buffers.  (There's no extra effort needed to implement that: the
 *      large 'buffer_hz' is used as a divisor, so practically any number will
 *      come out as 0 in the division.  Small integer results in the case of
 *      really high dividends won't have any real effect anyhow.)
 *
 * A value of 0 means that /proc/net/psched has not been read yet.
 */
static unsigned int buffer_hz;
/* Returns tc handle 'major':'minor', that is, 'major' in the upper 16 bits
 * and 'minor' in the lower 16 bits. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int shifted_major = major << 16;

    return TC_H_MAKE(shifted_major, minor);
}
/* Returns the major number from 'handle', shifted down to the low 16 bits. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
/* Returns the minor number from 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor = TC_H_MIN(handle);

    return minor;
}
3502 static struct tcmsg *
3503 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3504 struct ofpbuf *request)
3506 struct tcmsg *tcmsg;
3510 error = get_ifindex(netdev, &ifindex);
3515 ofpbuf_init(request, 512);
3516 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3517 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3518 tcmsg->tcm_family = AF_UNSPEC;
3519 tcmsg->tcm_ifindex = ifindex;
3520 /* Caller should fill in tcmsg->tcm_handle. */
3521 /* Caller should fill in tcmsg->tcm_parent. */
3527 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3529 int error = nl_sock_transact(rtnl_sock, request, replyp);
3530 ofpbuf_uninit(request);
3534 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3535 * policing configuration.
3537 * This function is equivalent to running the following when 'add' is true:
3538 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3540 * This function is equivalent to running the following when 'add' is false:
3541 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3543 * The configuration and stats may be seen with the following command:
3544 * /sbin/tc -s qdisc show dev <devname>
3546 * Returns 0 if successful, otherwise a positive errno value.
3549 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3551 struct ofpbuf request;
3552 struct tcmsg *tcmsg;
3554 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3555 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3557 tcmsg = tc_make_request(netdev, type, flags, &request);
3561 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3562 tcmsg->tcm_parent = TC_H_INGRESS;
3563 nl_msg_put_string(&request, TCA_KIND, "ingress");
3564 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3566 error = tc_transact(&request, NULL);
3568 /* If we're deleting the qdisc, don't worry about some of the
3569 * error conditions. */
3570 if (!add && (error == ENOENT || error == EINVAL)) {
3579 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3582 * This function is equivalent to running:
3583 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3584 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3587 * The configuration and stats may be seen with the following command:
3588 * /sbin/tc -s filter show <devname> eth0 parent ffff:
3590 * Returns 0 if successful, otherwise a positive errno value.
3593 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3595 struct tc_police tc_police;
3596 struct ofpbuf request;
3597 struct tcmsg *tcmsg;
3598 size_t basic_offset;
3599 size_t police_offset;
3603 memset(&tc_police, 0, sizeof tc_police);
3604 tc_police.action = TC_POLICE_SHOT;
3605 tc_police.mtu = mtu;
3606 tc_fill_rate(&tc_police.rate, kbits_rate/8 * 1000, mtu);
3607 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3608 kbits_burst * 1024);
3610 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3611 NLM_F_EXCL | NLM_F_CREATE, &request);
3615 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3616 tcmsg->tcm_info = tc_make_handle(49,
3617 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3619 nl_msg_put_string(&request, TCA_KIND, "basic");
3620 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3621 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3622 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3623 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3624 nl_msg_end_nested(&request, police_offset);
3625 nl_msg_end_nested(&request, basic_offset);
3627 error = tc_transact(&request, NULL);
3638 /* The values in psched are not individually very meaningful, but they are
3639 * important. The tables below show some values seen in the wild.
3643 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3644 * (Before that, there are hints that it was 1000000000.)
3646 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3650 * -----------------------------------
3651 * [1] 000c8000 000f4240 000f4240 00000064
3652 * [2] 000003e8 00000400 000f4240 3b9aca00
3653 * [3] 000003e8 00000400 000f4240 3b9aca00
3654 * [4] 000003e8 00000400 000f4240 00000064
3655 * [5] 000003e8 00000040 000f4240 3b9aca00
3656 * [6] 000003e8 00000040 000f4240 000000f9
3658 * a b c d ticks_per_s buffer_hz
3659 * ------- --------- ---------- ------------- ----------- -------------
3660 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3661 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3662 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3663 * [4] 1,000 1,024 1,000,000 100 976,562 100
3664 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3665 * [6] 1,000 64 1,000,000 249 15,625,000 249
3667 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3668 * [2] 2.6.26-1-686-bigmem from Debian lenny
3669 * [3] 2.6.26-2-sparc64 from Debian lenny
3670 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3671 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3672 * [6] 2.6.34 from kernel.org on KVM
3674 static const char fn[] = "/proc/net/psched";
3675 unsigned int a, b, c, d;
3681 stream = fopen(fn, "r");
3683 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3687 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3688 VLOG_WARN("%s: read failed", fn);
3692 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3696 VLOG_WARN("%s: invalid scheduler parameters", fn);
3700 ticks_per_s = (double) a * c / b;
3704 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3707 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3710 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3711 * rate of 'rate' bytes per second. */
3713 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3718 return (rate * ticks) / ticks_per_s;
3721 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3722 * rate of 'rate' bytes per second. */
3724 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3729 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3732 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3733 * a transmission rate of 'rate' bytes per second. */
3735 tc_buffer_per_jiffy(unsigned int rate)
3740 return rate / buffer_hz;
3743 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3744 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3745 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3746 * stores NULL into it if it is absent.
3748 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3751 * Returns 0 if successful, otherwise a positive errno value. */
3753 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3754 struct nlattr **options)
3756 static const struct nl_policy tca_policy[] = {
3757 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3758 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3760 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3762 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3763 tca_policy, ta, ARRAY_SIZE(ta))) {
3764 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3769 *kind = nl_attr_get_string(ta[TCA_KIND]);
3773 *options = ta[TCA_OPTIONS];
3788 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3789 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3790 * into '*options', and its queue statistics into '*stats'. Any of the output
3791 * arguments may be null.
3793 * Returns 0 if successful, otherwise a positive errno value. */
3795 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3796 struct nlattr **options, struct netdev_queue_stats *stats)
3798 static const struct nl_policy tca_policy[] = {
3799 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3800 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3802 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3804 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3805 tca_policy, ta, ARRAY_SIZE(ta))) {
3806 VLOG_WARN_RL(&rl, "failed to parse class message");
3811 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3812 *handlep = tc->tcm_handle;
3816 *options = ta[TCA_OPTIONS];
3820 const struct gnet_stats_queue *gsq;
3821 struct gnet_stats_basic gsb;
3823 static const struct nl_policy stats_policy[] = {
3824 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3825 .min_len = sizeof gsb },
3826 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3827 .min_len = sizeof *gsq },
3829 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3831 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3832 sa, ARRAY_SIZE(sa))) {
3833 VLOG_WARN_RL(&rl, "failed to parse class stats");
3837 /* Alignment issues screw up the length of struct gnet_stats_basic on
3838 * some arch/bitsize combinations. Newer versions of Linux have a
3839 * struct gnet_stats_basic_packed, but we can't depend on that. The
3840 * easiest thing to do is just to make a copy. */
3841 memset(&gsb, 0, sizeof gsb);
3842 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3843 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3844 stats->tx_bytes = gsb.bytes;
3845 stats->tx_packets = gsb.packets;
3847 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3848 stats->tx_errors = gsq->drops;
3858 memset(stats, 0, sizeof *stats);
3863 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3866 tc_query_class(const struct netdev *netdev,
3867 unsigned int handle, unsigned int parent,
3868 struct ofpbuf **replyp)
3870 struct ofpbuf request;
3871 struct tcmsg *tcmsg;
3874 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3878 tcmsg->tcm_handle = handle;
3879 tcmsg->tcm_parent = parent;
3881 error = tc_transact(&request, replyp);
3883 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3884 netdev_get_name(netdev),
3885 tc_get_major(handle), tc_get_minor(handle),
3886 tc_get_major(parent), tc_get_minor(parent),
3892 /* Equivalent to "tc class del dev <name> handle <handle>". */
3894 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3896 struct ofpbuf request;
3897 struct tcmsg *tcmsg;
3900 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3904 tcmsg->tcm_handle = handle;
3905 tcmsg->tcm_parent = 0;
3907 error = tc_transact(&request, NULL);
3909 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3910 netdev_get_name(netdev),
3911 tc_get_major(handle), tc_get_minor(handle),
3917 /* Equivalent to "tc qdisc del dev <name> root". */
3919 tc_del_qdisc(struct netdev *netdev)
3921 struct netdev_dev_linux *netdev_dev =
3922 netdev_dev_linux_cast(netdev_get_dev(netdev));
3923 struct ofpbuf request;
3924 struct tcmsg *tcmsg;
3927 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
3931 tcmsg->tcm_handle = tc_make_handle(1, 0);
3932 tcmsg->tcm_parent = TC_H_ROOT;
3934 error = tc_transact(&request, NULL);
3935 if (error == EINVAL) {
3936 /* EINVAL probably means that the default qdisc was in use, in which
3937 * case we've accomplished our purpose. */
3940 if (!error && netdev_dev->tc) {
3941 if (netdev_dev->tc->ops->tc_destroy) {
3942 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3944 netdev_dev->tc = NULL;
3949 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3950 * kernel to determine what they are. Returns 0 if successful, otherwise a
3951 * positive errno value. */
3953 tc_query_qdisc(const struct netdev *netdev)
3955 struct netdev_dev_linux *netdev_dev =
3956 netdev_dev_linux_cast(netdev_get_dev(netdev));
3957 struct ofpbuf request, *qdisc;
3958 const struct tc_ops *ops;
3959 struct tcmsg *tcmsg;
3963 if (netdev_dev->tc) {
3967 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3968 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3969 * 2.6.35 without that fix backported to it.
3971 * To avoid the OOPS, we must not make a request that would attempt to dump
3972 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3973 * few others. There are a few ways that I can see to do this, but most of
3974 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3975 * technique chosen here is to assume that any non-default qdisc that we
3976 * create will have a class with handle 1:0. The built-in qdiscs only have
3977 * a class with handle 0:0.
3979 * We could check for Linux 2.6.35+ and use a more straightforward method
3981 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
3985 tcmsg->tcm_handle = tc_make_handle(1, 0);
3986 tcmsg->tcm_parent = 0;
3988 /* Figure out what tc class to instantiate. */
3989 error = tc_transact(&request, &qdisc);
3993 error = tc_parse_qdisc(qdisc, &kind, NULL);
3995 ops = &tc_ops_other;
3997 ops = tc_lookup_linux_name(kind);
3999 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4000 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4002 ops = &tc_ops_other;
4005 } else if (error == ENOENT) {
4006 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4007 * other entity that doesn't have a handle 1:0. We will assume
4008 * that it's the system default qdisc. */
4009 ops = &tc_ops_default;
4012 /* Who knows? Maybe the device got deleted. */
4013 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4014 netdev_get_name(netdev), strerror(error));
4015 ops = &tc_ops_other;
4018 /* Instantiate it. */
4019 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
4020 assert((load_error == 0) == (netdev_dev->tc != NULL));
4021 ofpbuf_delete(qdisc);
4023 return error ? error : load_error;
4026 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4027 approximate the time to transmit packets of various lengths. For an MTU of
4028 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4029 represents two possible packet lengths; for a MTU of 513 through 1024, four
4030 possible lengths; and so on.
4032 Returns, for the specified 'mtu', the number of bits that packet lengths
4033 need to be shifted right to fit within such a 256-entry table. */
4035 tc_calc_cell_log(unsigned int mtu)
4040 mtu = ETH_PAYLOAD_MAX;
4042 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
4044 for (cell_log = 0; mtu >= 256; cell_log++) {
4051 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4054 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4056 memset(rate, 0, sizeof *rate);
4057 rate->cell_log = tc_calc_cell_log(mtu);
4058 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4059 /* rate->cell_align = 0; */ /* distro headers. */
4060 rate->mpu = ETH_TOTAL_MIN;
4064 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4065 * attribute of the specified "type".
4067 * See tc_calc_cell_log() above for a description of "rtab"s. */
4069 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
4074 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4075 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4076 unsigned packet_size = (i + 1) << rate->cell_log;
4077 if (packet_size < rate->mpu) {
4078 packet_size = rate->mpu;
4080 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.)
 *
 * The burst actually used is never smaller than tc_buffer_per_jiffy(Bps) plus
 * 'mtu'; the result is that burst converted by tc_bytes_to_ticks(). */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int lower_bound = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t effective_burst;

    if (burst_bytes > lower_bound) {
        effective_burst = burst_bytes;
    } else {
        effective_burst = lower_bound;
    }
    return tc_bytes_to_ticks(Bps, effective_burst);
}
4095 /* Linux-only functions declared in netdev-linux.h */
4097 /* Returns a fd for an AF_INET socket or a negative errno value. */
4099 netdev_linux_get_af_inet_sock(void)
4101 int error = netdev_linux_init();
4102 return error ? -error : af_inet_sock;
4105 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4106 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4108 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4109 const char *flag_name, bool enable)
4111 const char *netdev_name = netdev_get_name(netdev);
4112 struct ethtool_value evalue;
4116 memset(&evalue, 0, sizeof evalue);
4117 error = netdev_linux_do_ethtool(netdev_name,
4118 (struct ethtool_cmd *)&evalue,
4119 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4124 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4125 error = netdev_linux_do_ethtool(netdev_name,
4126 (struct ethtool_cmd *)&evalue,
4127 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
4132 memset(&evalue, 0, sizeof evalue);
4133 error = netdev_linux_do_ethtool(netdev_name,
4134 (struct ethtool_cmd *)&evalue,
4135 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4140 if (new_flags != evalue.data) {
4141 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4142 "device %s failed", enable ? "enable" : "disable",
4143 flag_name, netdev_name);
4150 /* Utility functions. */
4152 /* Copies 'src' into 'dst', performing format conversion in the process. */
4154 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4155 const struct rtnl_link_stats *src)
4157 dst->rx_packets = src->rx_packets;
4158 dst->tx_packets = src->tx_packets;
4159 dst->rx_bytes = src->rx_bytes;
4160 dst->tx_bytes = src->tx_bytes;
4161 dst->rx_errors = src->rx_errors;
4162 dst->tx_errors = src->tx_errors;
4163 dst->rx_dropped = src->rx_dropped;
4164 dst->tx_dropped = src->tx_dropped;
4165 dst->multicast = src->multicast;
4166 dst->collisions = src->collisions;
4167 dst->rx_length_errors = src->rx_length_errors;
4168 dst->rx_over_errors = src->rx_over_errors;
4169 dst->rx_crc_errors = src->rx_crc_errors;
4170 dst->rx_frame_errors = src->rx_frame_errors;
4171 dst->rx_fifo_errors = src->rx_fifo_errors;
4172 dst->rx_missed_errors = src->rx_missed_errors;
4173 dst->tx_aborted_errors = src->tx_aborted_errors;
4174 dst->tx_carrier_errors = src->tx_carrier_errors;
4175 dst->tx_fifo_errors = src->tx_fifo_errors;
4176 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4177 dst->tx_window_errors = src->tx_window_errors;
4181 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4183 /* Policy for RTNLGRP_LINK messages.
4185 * There are *many* more fields in these messages, but currently we only
4186 * care about these fields. */
4187 static const struct nl_policy rtnlgrp_link_policy[] = {
4188 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4189 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4190 .min_len = sizeof(struct rtnl_link_stats) },
4193 struct ofpbuf request;
4194 struct ofpbuf *reply;
4195 struct ifinfomsg *ifi;
4196 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
4199 ofpbuf_init(&request, 0);
4200 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4201 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4202 ifi->ifi_family = PF_UNSPEC;
4203 ifi->ifi_index = ifindex;
4204 error = nl_sock_transact(rtnl_sock, &request, &reply);
4205 ofpbuf_uninit(&request);
4210 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4211 rtnlgrp_link_policy,
4212 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4213 ofpbuf_delete(reply);
4217 if (!attrs[IFLA_STATS]) {
4218 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4219 ofpbuf_delete(reply);
4223 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4225 ofpbuf_delete(reply);
4231 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4233 static const char fn[] = "/proc/net/dev";
4238 stream = fopen(fn, "r");
4240 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4245 while (fgets(line, sizeof line, stream)) {
4248 #define X64 "%"SCNu64
4251 X64 X64 X64 X64 X64 X64 X64 "%*u"
4252 X64 X64 X64 X64 X64 X64 X64 "%*u",
4258 &stats->rx_fifo_errors,
4259 &stats->rx_frame_errors,
4265 &stats->tx_fifo_errors,
4267 &stats->tx_carrier_errors) != 15) {
4268 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4269 } else if (!strcmp(devname, netdev_name)) {
4270 stats->rx_length_errors = UINT64_MAX;
4271 stats->rx_over_errors = UINT64_MAX;
4272 stats->rx_crc_errors = UINT64_MAX;
4273 stats->rx_missed_errors = UINT64_MAX;
4274 stats->tx_aborted_errors = UINT64_MAX;
4275 stats->tx_heartbeat_errors = UINT64_MAX;
4276 stats->tx_window_errors = UINT64_MAX;
4282 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
4288 get_flags(const struct netdev_dev *dev, unsigned int *flags)
4294 error = netdev_linux_do_ioctl(dev->name, &ifr, SIOCGIFFLAGS,
4297 *flags = ifr.ifr_flags;
4303 set_flags(struct netdev *netdev, unsigned int flags)
4307 ifr.ifr_flags = flags;
4308 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
4313 do_get_ifindex(const char *netdev_name)
4317 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4318 COVERAGE_INC(netdev_get_ifindex);
4319 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4320 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4321 netdev_name, strerror(errno));
4324 return ifr.ifr_ifindex;
4328 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4330 struct netdev_dev_linux *netdev_dev =
4331 netdev_dev_linux_cast(netdev_get_dev(netdev_));
4333 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4334 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4338 netdev_dev->cache_valid |= VALID_IFINDEX;
4339 netdev_dev->ifindex = ifindex;
4341 *ifindexp = netdev_dev->ifindex;
4346 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4351 memset(&ifr, 0, sizeof ifr);
4352 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4353 COVERAGE_INC(netdev_get_hwaddr);
4354 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4355 /* ENODEV probably means that a vif disappeared asynchronously and
4356 * hasn't been removed from the database yet, so reduce the log level
4357 * to INFO for that case. */
4358 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4359 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4360 netdev_name, strerror(errno));
4363 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4364 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4365 VLOG_WARN("%s device has unknown hardware address family %d",
4366 netdev_name, hwaddr_family);
4368 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4373 set_etheraddr(const char *netdev_name, int hwaddr_family,
4374 const uint8_t mac[ETH_ADDR_LEN])
4378 memset(&ifr, 0, sizeof ifr);
4379 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4380 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4381 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4382 COVERAGE_INC(netdev_set_hwaddr);
4383 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4384 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4385 netdev_name, strerror(errno));
4392 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4393 int cmd, const char *cmd_name)
4397 memset(&ifr, 0, sizeof ifr);
4398 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4399 ifr.ifr_data = (caddr_t) ecmd;
4402 COVERAGE_INC(netdev_ethtool);
4403 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4406 if (errno != EOPNOTSUPP) {
4407 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4408 "failed: %s", cmd_name, name, strerror(errno));
4410 /* The device doesn't support this operation. That's pretty
4411 * common, so there's no point in logging anything. */
4418 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4419 const char *cmd_name)
4421 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4422 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4423 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4431 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4432 int cmd, const char *cmd_name)
4437 ifr.ifr_addr.sa_family = AF_INET;
4438 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4440 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4441 *ip = sin->sin_addr;
4446 /* Returns an AF_PACKET raw socket or a negative errno value. */
4448 af_packet_sock(void)
4450 static int sock = INT_MIN;
4452 if (sock == INT_MIN) {
4453 sock = socket(AF_PACKET, SOCK_RAW, 0);
4455 set_nonblocking(sock);
4458 VLOG_ERR("failed to create packet socket: %s", strerror(errno));