2 * Copyright (c) 2009, 2010, 2011 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
24 #include <arpa/inet.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_sched.h>
33 #include <linux/rtnetlink.h>
34 #include <linux/sockios.h>
35 #include <linux/version.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <netpacket/packet.h>
40 #include <net/ethernet.h>
42 #include <linux/if_tunnel.h>
43 #include <net/if_arp.h>
44 #include <net/if_packet.h>
45 #include <net/route.h>
46 #include <netinet/in.h>
53 #include "dpif-linux.h"
54 #include "dynamic-string.h"
55 #include "fatal-signal.h"
58 #include "netdev-provider.h"
59 #include "netdev-vport.h"
61 #include "netlink-notifier.h"
62 #include "netlink-socket.h"
64 #include "openflow/openflow.h"
66 #include "poll-loop.h"
67 #include "rtnetlink-link.h"
68 #include "socket-util.h"
74 VLOG_DEFINE_THIS_MODULE(netdev_linux);
76 COVERAGE_DEFINE(netdev_get_vlan_vid);
77 COVERAGE_DEFINE(netdev_set_policing);
78 COVERAGE_DEFINE(netdev_arp_lookup);
79 COVERAGE_DEFINE(netdev_get_ifindex);
80 COVERAGE_DEFINE(netdev_get_hwaddr);
81 COVERAGE_DEFINE(netdev_set_hwaddr);
82 COVERAGE_DEFINE(netdev_ethtool);
84 /* These were introduced in Linux 2.6.14, so they might be missing if we have
 * old headers. */
86 #ifndef ADVERTISED_Pause
87 #define ADVERTISED_Pause (1 << 13)
89 #ifndef ADVERTISED_Asym_Pause
90 #define ADVERTISED_Asym_Pause (1 << 14)
93 /* These were introduced in Linux 2.6.24, so they might be missing if we
94 * have old headers. */
95 #ifndef ETHTOOL_GFLAGS
96 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
98 #ifndef ETHTOOL_SFLAGS
99 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
102 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
 * headers. */
105 #define TC_RTAB_SIZE 1024
108 static struct nln_notifier netdev_linux_cache_notifier;
109 static int cache_notifier_refcount;
112 VALID_IFINDEX = 1 << 0,
113 VALID_ETHERADDR = 1 << 1,
117 VALID_CARRIER = 1 << 5,
118 VALID_POLICING = 1 << 6,
119 VALID_HAVE_VPORT_STATS = 1 << 7
127 /* Traffic control. */
129 /* An instance of a traffic control class. Always associated with a particular
132 * Each TC implementation subclasses this with whatever additional data it
135 const struct tc_ops *ops;
136 struct hmap queues; /* Contains "struct tc_queue"s.
137 * Read by generic TC layer.
138 * Written only by TC implementation. */
141 /* One traffic control queue.
143 * Each TC implementation subclasses this with whatever additional data it
146 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
147 unsigned int queue_id; /* OpenFlow queue ID. */
150 /* A particular kind of traffic control. Each implementation generally maps to
151 * one particular Linux qdisc class.
153 * The functions below return 0 if successful or a positive errno value on
154 * failure, except where otherwise noted. All of them must be provided, except
155 * where otherwise noted. */
157 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
158 * This is null for tc_ops_default and tc_ops_other, for which there are no
159 * appropriate values. */
160 const char *linux_name;
162 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
163 const char *ovs_name;
165 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
166 * queues. The queues are numbered 0 through n_queues - 1. */
167 unsigned int n_queues;
169 /* Called to install this TC class on 'netdev'. The implementation should
170 * make the Netlink calls required to set up 'netdev' with the right qdisc
171 * and configure it according to 'details'. The implementation may assume
172 * that the current qdisc is the default; that is, there is no need for it
173 * to delete the current qdisc before installing itself.
175 * The contents of 'details' should be documented as valid for 'ovs_name'
176 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
177 * (which is built as ovs-vswitchd.conf.db(8)).
179 * This function must return 0 if and only if it sets 'netdev->tc' to an
180 * initialized 'struct tc'.
182 * (This function is null for tc_ops_other, which cannot be installed. For
183 * other TC classes it should always be nonnull.) */
184 int (*tc_install)(struct netdev *netdev, const struct shash *details);
186 /* Called when the netdev code determines (through a Netlink query) that
187 * this TC class's qdisc is installed on 'netdev', but we didn't install
188 * it ourselves and so don't know any of the details.
190 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
191 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
192 * implementation should parse the other attributes of 'nlmsg' as
193 * necessary to determine its configuration. If necessary it should also
194 * use Netlink queries to determine the configuration of queues on
197 * This function must return 0 if and only if it sets 'netdev->tc' to an
198 * initialized 'struct tc'. */
199 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
201 /* Destroys the data structures allocated by the implementation as part of
202 * 'tc'. (This includes destroying 'tc->queues' by calling
205 * The implementation should not need to perform any Netlink calls. If
206 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
207 * (But it may not be desirable.)
209 * This function may be null if 'tc' is trivial. */
210 void (*tc_destroy)(struct tc *tc);
212 /* Retrieves details of 'netdev->tc' configuration into 'details'.
214 * The implementation should not need to perform any Netlink calls, because
215 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
216 * cached the configuration.
218 * The contents of 'details' should be documented as valid for 'ovs_name'
219 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
220 * (which is built as ovs-vswitchd.conf.db(8)).
222 * This function may be null if 'tc' is not configurable.
224 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
226 /* Reconfigures 'netdev->tc' according to 'details', performing any
227 * required Netlink calls to complete the reconfiguration.
229 * The contents of 'details' should be documented as valid for 'ovs_name'
230 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
231 * (which is built as ovs-vswitchd.conf.db(8)).
233 * This function may be null if 'tc' is not configurable.
235 int (*qdisc_set)(struct netdev *, const struct shash *details);
237 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
238 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
240 * The contents of 'details' should be documented as valid for 'ovs_name'
241 * in the "other_config" column in the "Queue" table in
242 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
244 * The implementation should not need to perform any Netlink calls, because
245 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
246 * cached the queue configuration.
248 * This function may be null if 'tc' does not have queues ('n_queues' is
250 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
251 struct shash *details);
253 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
254 * 'details', performing any required Netlink calls to complete the
255 * reconfiguration. The caller ensures that 'queue_id' is less than
258 * The contents of 'details' should be documented as valid for 'ovs_name'
259 * in the "other_config" column in the "Queue" table in
260 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
262 * This function may be null if 'tc' does not have queues or its queues are
263 * not configurable. */
264 int (*class_set)(struct netdev *, unsigned int queue_id,
265 const struct shash *details);
267 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
268 * tc_queue's within 'netdev->tc->queues'.
270 * This function may be null if 'tc' does not have queues or its queues
271 * cannot be deleted. */
272 int (*class_delete)(struct netdev *, struct tc_queue *queue);
274 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
275 * 'struct tc_queue's within 'netdev->tc->queues'.
277 * On success, initializes '*stats'.
279 * This function may be null if 'tc' does not have queues or if it cannot
280 * report queue statistics. */
281 int (*class_get_stats)(const struct netdev *netdev,
282 const struct tc_queue *queue,
283 struct netdev_queue_stats *stats);
285 /* Extracts queue stats from 'nlmsg', which is a response to a
286 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
288 * This function may be null if 'tc' does not have queues or if it cannot
289 * report queue statistics. */
290 int (*class_dump_stats)(const struct netdev *netdev,
291 const struct ofpbuf *nlmsg,
292 netdev_dump_queue_stats_cb *cb, void *aux);
296 tc_init(struct tc *tc, const struct tc_ops *ops)
299 hmap_init(&tc->queues);
303 tc_destroy(struct tc *tc)
305 hmap_destroy(&tc->queues);
308 static const struct tc_ops tc_ops_htb;
309 static const struct tc_ops tc_ops_hfsc;
310 static const struct tc_ops tc_ops_default;
311 static const struct tc_ops tc_ops_other;
313 static const struct tc_ops *tcs[] = {
314 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
315 &tc_ops_hfsc, /* Hierarchical fair service curve. */
316 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
317 &tc_ops_other, /* Some other qdisc. */
321 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
322 static unsigned int tc_get_major(unsigned int handle);
323 static unsigned int tc_get_minor(unsigned int handle);
325 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
326 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
327 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
329 static struct tcmsg *tc_make_request(const struct netdev *, int type,
330 unsigned int flags, struct ofpbuf *);
331 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
333 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
334 struct nlattr **options);
335 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
336 struct nlattr **options,
337 struct netdev_queue_stats *);
338 static int tc_query_class(const struct netdev *,
339 unsigned int handle, unsigned int parent,
340 struct ofpbuf **replyp);
341 static int tc_delete_class(const struct netdev *, unsigned int handle);
343 static int tc_del_qdisc(struct netdev *netdev);
344 static int tc_query_qdisc(const struct netdev *netdev);
346 static int tc_calc_cell_log(unsigned int mtu);
347 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
348 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
349 const struct tc_ratespec *rate);
350 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
352 struct netdev_dev_linux {
353 struct netdev_dev netdev_dev;
355 struct shash_node *shash_node;
356 unsigned int cache_valid;
357 unsigned int change_seq;
359 bool miimon; /* Link status of last poll. */
360 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
361 struct timer miimon_timer;
363 /* The following are figured out "on demand" only. They are only valid
364 * when the corresponding VALID_* bit in 'cache_valid' is set. */
366 uint8_t etheraddr[ETH_ADDR_LEN];
367 struct in_addr address, netmask;
371 uint32_t kbits_rate; /* Policing data. */
372 uint32_t kbits_burst;
373 bool have_vport_stats;
377 struct tap_state tap;
381 struct netdev_linux {
382 struct netdev netdev;
386 /* Sockets used for ioctl operations. */
387 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
389 /* A Netlink routing socket that is not subscribed to any multicast groups. */
390 static struct nl_sock *rtnl_sock;
392 /* This is set pretty low because we probably won't learn anything from the
393 * additional log messages. */
394 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
396 static int netdev_linux_init(void);
398 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
399 int cmd, const char *cmd_name);
400 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
401 const char *cmd_name);
402 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
403 int cmd, const char *cmd_name);
404 static int get_flags(const struct netdev *, int *flagsp);
405 static int set_flags(struct netdev *, int flags);
406 static int do_get_ifindex(const char *netdev_name);
407 static int get_ifindex(const struct netdev *, int *ifindexp);
408 static int do_set_addr(struct netdev *netdev,
409 int ioctl_nr, const char *ioctl_name,
410 struct in_addr addr);
411 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
412 static int set_etheraddr(const char *netdev_name, int hwaddr_family,
413 const uint8_t[ETH_ADDR_LEN]);
414 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
415 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
416 static int af_packet_sock(void);
417 static void netdev_linux_miimon_run(void);
418 static void netdev_linux_miimon_wait(void);
/* Returns true if 'netdev_class' uses netdev_linux_init() as its 'init'
 * callback, false otherwise. */
421 is_netdev_linux_class(const struct netdev_class *netdev_class)
423 return netdev_class->init == netdev_linux_init;
/* Converts 'netdev_dev' to the 'struct netdev_dev_linux' that contains it.
 * Asserts that the device's class satisfies is_netdev_linux_class(), so
 * passing a device of any other class is a programming error. */
426 static struct netdev_dev_linux *
427 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
429 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
430 assert(is_netdev_linux_class(netdev_class));
/* 'netdev_dev' is the member of that name inside 'struct netdev_dev_linux'. */
432 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
/* Converts 'netdev' to the 'struct netdev_linux' that contains it.  Asserts
 * that the class of the device underlying 'netdev' satisfies
 * is_netdev_linux_class(). */
435 static struct netdev_linux *
436 netdev_linux_cast(const struct netdev *netdev)
438 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
439 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
440 assert(is_netdev_linux_class(netdev_class));
/* 'netdev' is the member of that name inside 'struct netdev_linux'. */
442 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
/* Sets up the file-scope sockets used by this file: the AF_INET datagram
 * socket 'af_inet_sock' and the rtnetlink socket 'rtnl_sock'.  The outcome is
 * kept in the function-static 'status': 0 if a step succeeded, otherwise the
 * errno from socket() or the value returned by nl_sock_create().  Failures
 * are logged. */
446 netdev_linux_init(void)
/* Static, so the value persists across calls; -1 is the initial value before
 * any socket has been created. */
448 static int status = -1;
450 /* Create AF_INET socket. */
451 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
452 status = af_inet_sock >= 0 ? 0 : errno;
454 VLOG_ERR("failed to create inet socket: %s", strerror(status));
457 /* Create rtnetlink socket. */
459 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
461 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
/* Performs periodic work for this file: runs the rtnetlink link notifier and
 * then the MII link-status poll. */
470 netdev_linux_run(void)
472 rtnetlink_link_notifier_run();
473 netdev_linux_miimon_run();
/* Counterpart of netdev_linux_run(): calls the wait functions of the
 * rtnetlink link notifier and of the MII link-status poll. */
477 netdev_linux_wait(void)
479 rtnetlink_link_notifier_wait();
480 netdev_linux_miimon_wait();
/* Records that 'dev' has changed.  Clears 'dev->cache_valid', so that none of
 * the VALID_* bits remain set and cached values are no longer trusted.
 *
 * NOTE(review): the '!dev->change_seq' test presumably keeps the sequence
 * number from staying at zero -- confirm against the full function. */
484 netdev_dev_linux_changed(struct netdev_dev_linux *dev)
487 if (!dev->change_seq) {
490 dev->cache_valid = 0;
/* rtnetlink link-change callback (registered by netdev_linux_create()) that
 * marks device state as changed.
 *
 * One path looks up the single device named by 'change->ifname' and, if its
 * class satisfies is_netdev_linux_class(), calls netdev_dev_linux_changed()
 * on it.  The other path collects the devices of 'netdev_linux_class' with
 * netdev_dev_get_devices() and calls netdev_dev_linux_changed() while
 * iterating over them.  'aux' is unused.
 *
 * NOTE(review): presumably the second path is taken when 'change' is null --
 * confirm against the full function. */
494 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
495 void *aux OVS_UNUSED)
497 struct netdev_dev_linux *dev;
499 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
501 const struct netdev_class *netdev_class =
502 netdev_dev_get_class(base_dev);
/* Check the class before casting: netdev_dev_linux_cast() asserts on it. */
504 if (is_netdev_linux_class(netdev_class)) {
505 dev = netdev_dev_linux_cast(base_dev);
506 netdev_dev_linux_changed(dev);
510 struct shash device_shash;
511 struct shash_node *node;
513 shash_init(&device_shash);
514 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
515 SHASH_FOR_EACH (node, &device_shash) {
517 netdev_dev_linux_changed(dev);
519 shash_destroy(&device_shash);
523 /* Creates system and internal devices. */
525 netdev_linux_create(const struct netdev_class *class, const char *name,
526 struct netdev_dev **netdev_devp)
528 struct netdev_dev_linux *netdev_dev;
/* The rtnetlink cache notifier is shared by the devices created here and is
 * reference-counted: register it only when the count is currently zero. */
531 if (!cache_notifier_refcount) {
532 error = rtnetlink_link_notifier_register(&netdev_linux_cache_notifier,
533 netdev_linux_cache_cb, NULL);
538 cache_notifier_refcount++;
540 netdev_dev = xzalloc(sizeof *netdev_dev);
/* Start the change sequence number at a nonzero value. */
541 netdev_dev->change_seq = 1;
542 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
/* Hand the embedded generic device back to the caller. */
544 *netdev_devp = &netdev_dev->netdev_dev;
548 /* For most types of netdevs we open the device for each call of
549 * netdev_open(). However, this is not the case with tap devices,
550 * since it is only possible to open the device once. In this
551 * situation we share a single file descriptor, and consequently
552 * buffers, across all readers. Therefore once data is read it will
553 * be unavailable to other reads for tap devices. */
555 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
556 const char *name, struct netdev_dev **netdev_devp)
558 struct netdev_dev_linux *netdev_dev;
559 struct tap_state *state;
560 static const char tap_dev[] = "/dev/net/tun";
564 netdev_dev = xzalloc(sizeof *netdev_dev);
565 state = &netdev_dev->state.tap;
567 /* Open tap device. */
568 state->fd = open(tap_dev, O_RDWR);
571 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
575 /* Create tap device. */
576 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
577 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
578 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
579 VLOG_WARN("%s: creating tap device failed: %s", name,
585 /* Make non-blocking. */
586 error = set_nonblocking(state->fd);
591 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
592 *netdev_devp = &netdev_dev->netdev_dev;
601 destroy_tap(struct netdev_dev_linux *netdev_dev)
603 struct tap_state *state = &netdev_dev->state.tap;
605 if (state->fd >= 0) {
610 /* Destroys the netdev device 'netdev_dev_'. */
612 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
614 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
615 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
617 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
618 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
621 if (class == &netdev_linux_class || class == &netdev_internal_class) {
622 cache_notifier_refcount--;
624 if (!cache_notifier_refcount) {
625 rtnetlink_link_notifier_unregister(&netdev_linux_cache_notifier);
627 } else if (class == &netdev_tap_class) {
628 destroy_tap(netdev_dev);
637 netdev_linux_open(struct netdev_dev *netdev_dev_, struct netdev **netdevp)
639 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
640 struct netdev_linux *netdev;
641 enum netdev_flags flags;
644 /* Allocate network device. */
645 netdev = xzalloc(sizeof *netdev);
647 netdev_init(&netdev->netdev, netdev_dev_);
649 /* Verify that the device really exists, by attempting to read its flags.
650 * (The flags might be cached, in which case this won't actually do an
653 * Don't do this for "internal" netdevs, though, because those have to be
654 * created as netdev objects before they exist in the kernel, because
655 * creating them in the kernel happens by passing a netdev object to
656 * dpif_port_add(). */
657 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
658 error = netdev_get_flags(&netdev->netdev, &flags);
659 if (error == ENODEV) {
664 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
665 !netdev_dev->state.tap.opened) {
667 /* We assume that the first user of the tap device is the primary user
668 * and give them the tap FD. Subsequent users probably just expect
669 * this to be a system device so open it normally to avoid send/receive
670 * directions appearing to be reversed. */
671 netdev->fd = netdev_dev->state.tap.fd;
672 netdev_dev->state.tap.opened = true;
675 *netdevp = &netdev->netdev;
679 netdev_uninit(&netdev->netdev, true);
683 /* Closes and destroys 'netdev'. */
685 netdev_linux_close(struct netdev *netdev_)
687 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* Descriptor 0 is a valid open fd, and the rest of this file uses a negative
 * 'fd' to mean "no descriptor", so test with >= 0 rather than > 0.  The
 * branch is not taken for devices of type "tap". */
689 if (netdev->fd >= 0 && strcmp(netdev_get_type(netdev_), "tap")) {
695 /* Initializes 'sset' with a list of the names of all known network devices. */
697 netdev_linux_enumerate(struct sset *sset)
699 struct if_nameindex *names;
701 names = if_nameindex();
705 for (i = 0; names[i].if_name != NULL; i++) {
706 sset_add(sset, names[i].if_name);
708 if_freenameindex(names);
711 VLOG_WARN("could not obtain list of network device names: %s",
/* Prepares 'netdev_' for receiving packets by creating a raw PF_PACKET
 * socket, putting it in non-blocking mode, and binding it to the device's
 * ifindex for all Ethernet protocols (ETH_P_ALL).  Failures to create or to
 * bind the socket are logged.
 *
 * NOTE(review): the 'netdev->fd >= 0' test at the top presumably returns
 * early when the device already has a descriptor -- confirm against the full
 * function. */
718 netdev_linux_listen(struct netdev *netdev_)
720 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
721 struct sockaddr_ll sll;
726 if (netdev->fd >= 0) {
730 /* Create file descriptor. */
731 fd = socket(PF_PACKET, SOCK_RAW, 0);
734 VLOG_ERR("failed to create raw socket (%s)", strerror(error));
738 /* Set non-blocking mode. */
739 error = set_nonblocking(fd);
744 /* Get ethernet device index. */
745 error = get_ifindex(&netdev->netdev, &ifindex);
750 /* Bind to specific ethernet device. */
/* Zero the whole address first so that fields not assigned below are 0. */
751 memset(&sll, 0, sizeof sll);
752 sll.sll_family = AF_PACKET;
753 sll.sll_ifindex = ifindex;
754 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
755 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
757 VLOG_ERR("%s: failed to bind raw socket (%s)",
758 netdev_get_name(netdev_), strerror(error));
/* Reads a packet from 'netdev_''s file descriptor into 'data', which has room
 * for 'size' bytes, using read().  A device whose fd is negative is not
 * listening.  EINTR is handled separately from other read() errors; errors
 * other than EINTR and EAGAIN are logged (rate-limited) together with the
 * device name. */
773 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
775 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
777 if (netdev->fd < 0) {
778 /* Device is not listening. */
783 ssize_t retval = read(netdev->fd, data, size);
786 } else if (errno != EINTR) {
787 if (errno != EAGAIN) {
/* The format string names the device first and the error text second, so the
 * arguments must be passed in that order. */
788 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
789 netdev_get_name(netdev_), strerror(errno));
796 /* Registers with the poll loop to wake up from the next call to poll_block()
797 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
799 netdev_linux_recv_wait(struct netdev *netdev_)
801 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
802 if (netdev->fd >= 0) {
803 poll_fd_wait(netdev->fd, POLLIN);
807 /* Discards all packets waiting to be received from 'netdev'. */
809 netdev_linux_drain(struct netdev *netdev_)
811 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
812 if (netdev->fd < 0) {
814 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
816 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
817 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
821 drain_fd(netdev->fd, ifr.ifr_qlen);
824 return drain_rcvbuf(netdev->fd);
828 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
829 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
830 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
831 * the packet is too big or too small to transmit on the device.
833 * The caller retains ownership of 'buffer' in all cases.
835 * The kernel maintains a packet transmission queue, so the caller is not
836 * expected to do additional queuing of packets. */
838 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
840 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
844 if (netdev->fd < 0) {
845 /* Use our AF_PACKET socket to send to this device. */
846 struct sockaddr_ll sll;
853 sock = af_packet_sock();
858 error = get_ifindex(netdev_, &ifindex);
863 /* We don't bother setting most fields in sockaddr_ll because the
864 * kernel ignores them for SOCK_RAW. */
865 memset(&sll, 0, sizeof sll);
866 sll.sll_family = AF_PACKET;
867 sll.sll_ifindex = ifindex;
869 iov.iov_base = (void *) data;
873 msg.msg_namelen = sizeof sll;
876 msg.msg_control = NULL;
877 msg.msg_controllen = 0;
880 retval = sendmsg(sock, &msg, 0);
882 /* Use the netdev's own fd to send to this device. This is
883 * essential for tap devices, because packets sent to a tap device
884 * with an AF_PACKET socket will loop back to be *received* again
885 * on the tap device. */
886 retval = write(netdev->fd, data, size);
890 /* The Linux AF_PACKET implementation never blocks waiting for room
891 * for packets, instead returning ENOBUFS. Translate this into
892 * EAGAIN for the caller. */
893 if (errno == ENOBUFS) {
895 } else if (errno == EINTR) {
897 } else if (errno != EAGAIN) {
898 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
899 netdev_get_name(netdev_), strerror(errno));
902 } else if (retval != size) {
903 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
904 "%zu) on %s", retval, size, netdev_get_name(netdev_));
912 /* Registers with the poll loop to wake up from the next call to poll_block()
913 * when the packet transmission queue has sufficient room to transmit a packet
914 * with netdev_send().
916 * The kernel maintains a packet transmission queue, so the client is not
917 * expected to do additional queuing of packets. Thus, this function is
918 * unlikely to ever be used. It is included for completeness. */
920 netdev_linux_send_wait(struct netdev *netdev_)
922 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
923 if (netdev->fd < 0) {
925 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
926 poll_fd_wait(netdev->fd, POLLOUT);
928 /* TAP device always accepts packets.*/
929 poll_immediate_wake();
933 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
934 * otherwise a positive errno value. */
936 netdev_linux_set_etheraddr(struct netdev *netdev_,
937 const uint8_t mac[ETH_ADDR_LEN])
939 struct netdev_dev_linux *netdev_dev =
940 netdev_dev_linux_cast(netdev_get_dev(netdev_));
943 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
944 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
945 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
947 netdev_dev->cache_valid |= VALID_ETHERADDR;
948 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
956 /* Copies 'netdev''s MAC address into 'mac', a buffer of ETH_ADDR_LEN bytes
957 * supplied by the caller. */
959 netdev_linux_get_etheraddr(const struct netdev *netdev_,
960 uint8_t mac[ETH_ADDR_LEN])
962 struct netdev_dev_linux *netdev_dev =
963 netdev_dev_linux_cast(netdev_get_dev(netdev_));
964 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
965 int error = get_etheraddr(netdev_get_name(netdev_),
966 netdev_dev->etheraddr);
970 netdev_dev->cache_valid |= VALID_ETHERADDR;
972 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
976 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
977 * in bytes, not including the hardware header; thus, this is typically 1500
978 * bytes for Ethernet devices. */
980 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
982 struct netdev_dev_linux *netdev_dev =
983 netdev_dev_linux_cast(netdev_get_dev(netdev_));
984 if (!(netdev_dev->cache_valid & VALID_MTU)) {
988 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
989 SIOCGIFMTU, "SIOCGIFMTU");
993 netdev_dev->mtu = ifr.ifr_mtu;
994 netdev_dev->cache_valid |= VALID_MTU;
996 *mtup = netdev_dev->mtu;
1000 /* Sets the maximum transmission unit (MTU) of the given device using the
1001 * Linux networking ioctl interface.
1004 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1006 struct netdev_dev_linux *netdev_dev =
1007 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1012 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1013 SIOCSIFMTU, "SIOCSIFMTU");
1018 netdev_dev->mtu = ifr.ifr_mtu;
1019 netdev_dev->cache_valid |= VALID_MTU;
1023 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1024 * On failure, returns a negative errno value. */
1026 netdev_linux_get_ifindex(const struct netdev *netdev)
1030 error = get_ifindex(netdev, &ifindex);
1031 return error ? -error : ifindex;
/* Stores the carrier (link) state of 'netdev_' in '*carrier'.
 *
 * When MII monitoring is enabled for the device ('miimon_interval' > 0), the
 * result is the link status recorded in 'miimon'.  Otherwise the state is
 * read from /sys/class/net/<name>/carrier, whose first byte must be '0' or
 * '1', and is cached under VALID_CARRIER so that the file is not read again
 * while that bit remains set in 'cache_valid'. */
1035 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1037 struct netdev_dev_linux *netdev_dev =
1038 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1043 if (netdev_dev->miimon_interval > 0) {
1044 *carrier = netdev_dev->miimon;
1048 if (!(netdev_dev->cache_valid & VALID_CARRIER)) {
1052 fn = xasprintf("/sys/class/net/%s/carrier",
1053 netdev_get_name(netdev_));
1054 fd = open(fn, O_RDONLY);
1057 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
1061 retval = read(fd, line, sizeof line);
1064 if (error == EINVAL) {
1065 /* This is the normal return value when we try to check carrier
1066 * if the network device is not up. */
1068 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
1071 } else if (retval == 0) {
1073 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
/* Only the first byte of the file is interpreted. */
1077 if (line[0] != '0' && line[0] != '1') {
1079 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)",
1083 netdev_dev->carrier = line[0] != '0';
1084 netdev_dev->cache_valid |= VALID_CARRIER;
1086 *carrier = netdev_dev->carrier;
/* Issues the MII ioctl 'cmd' on device 'name' through
 * netdev_linux_do_ioctl(), to which 'cmd_name' is passed along.  '*data' is
 * copied into the 'ifr_data' area of a zeroed 'struct ifreq' before the call
 * and copied back out afterward, so it is both input and output. */
1098 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1099 struct mii_ioctl_data *data)
1104 memset(&ifr, 0, sizeof ifr);
/* The MII data is stored in place of the 'ifr_data' member itself, not in
 * memory that 'ifr_data' points to. */
1105 memcpy(&ifr.ifr_data, data, sizeof *data);
1106 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1107 memcpy(data, &ifr.ifr_data, sizeof *data);
/* Determines the link status of device 'name' and stores it in '*miimon'.
 *
 * First tries MII: SIOCGMIIPHY, then SIOCGMIIREG to read the MII_BMSR
 * register, whose BMSR_LSTATUS bit gives the link state.  If MII cannot be
 * queried, falls back to the ETHTOOL_GLINK ethtool command.  Failures are
 * logged (rate-limited). */
1113 netdev_linux_get_miimon(const char *name, bool *miimon)
1115 struct mii_ioctl_data data;
1120 memset(&data, 0, sizeof data);
1121 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1123 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1124 data.reg_num = MII_BMSR;
1125 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1129 *miimon = !!(data.val_out & BMSR_LSTATUS);
1131 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1134 struct ethtool_cmd ecmd;
1136 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1139 memset(&ecmd, 0, sizeof ecmd);
1140 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1143 struct ethtool_value eval;
/* The reply was received into 'ecmd'; reinterpret its leading bytes as a
 * 'struct ethtool_value' to extract the link flag. */
1145 memcpy(&eval, &ecmd, sizeof eval);
1146 *miimon = !!eval.data;
1148 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
/* Sets the MII monitoring interval of 'netdev_' to 'interval'.  A positive
 * 'interval' is raised to at least 100; zero or a negative value becomes 0,
 * which the rest of this file treats as "monitoring disabled".  If the stored
 * interval actually changes, the poll timer is marked expired. */
1156 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1157 long long int interval)
1159 struct netdev_dev_linux *netdev_dev;
1161 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
1163 interval = interval > 0 ? MAX(interval, 100) : 0;
1164 if (netdev_dev->miimon_interval != interval) {
1165 netdev_dev->miimon_interval = interval;
1166 timer_set_expired(&netdev_dev->miimon_timer);
/* Polls link status, via netdev_linux_get_miimon(), for the devices of
 * 'netdev_linux_class'.  The test on 'miimon_interval' and the poll timer
 * singles out devices that have monitoring disabled or whose timer has not
 * yet expired.  If the polled status differs from the recorded one, the new
 * value is stored and netdev_dev_linux_changed() is called.  The timer is
 * then set to the device's 'miimon_interval'. */
1173 netdev_linux_miimon_run(void)
1175 struct shash device_shash;
1176 struct shash_node *node;
1178 shash_init(&device_shash);
1179 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1180 SHASH_FOR_EACH (node, &device_shash) {
1181 struct netdev_dev_linux *dev = node->data;
1184 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1188 netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
1189 if (miimon != dev->miimon) {
1190 dev->miimon = miimon;
1191 netdev_dev_linux_changed(dev);
1194 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1197 shash_destroy(&device_shash);
/* Calls timer_wait() on the MII poll timer of every device of
 * 'netdev_linux_class' that has monitoring enabled ('miimon_interval' > 0). */
1201 netdev_linux_miimon_wait(void)
1203 struct shash device_shash;
1204 struct shash_node *node;
1206 shash_init(&device_shash);
1207 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1208 SHASH_FOR_EACH (node, &device_shash) {
1209 struct netdev_dev_linux *dev = node->data;
1211 if (dev->miimon_interval > 0) {
1212 timer_wait(&dev->miimon_timer);
1215 shash_destroy(&device_shash);
1218 /* Check whether we can use RTM_GETLINK to get network device statistics.
1219 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1222 check_for_working_netlink_stats(void)
1224 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1225 * preferable, so if that works, we'll use it. */
1226 int ifindex = do_get_ifindex("lo");
1228 VLOG_WARN("failed to get ifindex for lo, "
1229 "obtaining netdev stats from proc");
1232 struct netdev_stats stats;
1233 int error = get_stats_via_netlink(ifindex, &stats);
1235 VLOG_DBG("obtaining netdev stats via rtnetlink");
1238 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1239 "via proc (you are probably running a pre-2.6.19 "
1240 "kernel)", strerror(error));
/* Exchanges the 64-bit values stored at 'a' and 'b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    const uint64_t old_b = *b;

    *b = *a;
    *a = old_b;
}
1255 get_stats_via_vport(const struct netdev *netdev_,
1256 struct netdev_stats *stats)
1258 struct netdev_dev_linux *netdev_dev =
1259 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1261 if (netdev_dev->have_vport_stats ||
1262 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1265 error = netdev_vport_get_stats(netdev_, stats);
1267 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed %d",
1268 netdev_get_name(netdev_), error);
1270 netdev_dev->have_vport_stats = !error;
1271 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
1276 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1277 struct netdev_stats *stats)
1279 static int use_netlink_stats = -1;
1282 if (use_netlink_stats < 0) {
1283 use_netlink_stats = check_for_working_netlink_stats();
1286 if (use_netlink_stats) {
1289 error = get_ifindex(netdev_, &ifindex);
1291 error = get_stats_via_netlink(ifindex, stats);
1294 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1298 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1299 netdev_get_name(netdev_), error);
1305 /* Retrieves current device stats for 'netdev-linux'. */
1307 netdev_linux_get_stats(const struct netdev *netdev_,
1308 struct netdev_stats *stats)
1310 struct netdev_dev_linux *netdev_dev =
1311 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1312 struct netdev_stats dev_stats;
1315 get_stats_via_vport(netdev_, stats);
1317 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1320 if (!netdev_dev->have_vport_stats) {
1327 if (!netdev_dev->have_vport_stats) {
1328 /* stats not available from OVS then use ioctl stats. */
1331 stats->rx_errors += dev_stats.rx_errors;
1332 stats->tx_errors += dev_stats.tx_errors;
1333 stats->rx_dropped += dev_stats.rx_dropped;
1334 stats->tx_dropped += dev_stats.tx_dropped;
1335 stats->multicast += dev_stats.multicast;
1336 stats->collisions += dev_stats.collisions;
1337 stats->rx_length_errors += dev_stats.rx_length_errors;
1338 stats->rx_over_errors += dev_stats.rx_over_errors;
1339 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1340 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1341 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1342 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1343 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1344 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1345 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1346 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1347 stats->tx_window_errors += dev_stats.tx_window_errors;
1352 /* Retrieves current device stats for 'netdev-tap' netdev or
1353 * netdev-internal. */
1355 netdev_pseudo_get_stats(const struct netdev *netdev_,
1356 struct netdev_stats *stats)
1358 struct netdev_dev_linux *netdev_dev =
1359 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1360 struct netdev_stats dev_stats;
1363 get_stats_via_vport(netdev_, stats);
1365 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1367 if (!netdev_dev->have_vport_stats) {
1374 /* If this port is an internal port then the transmit and receive stats
1375 * will appear to be swapped relative to the other ports since we are the
1376 * one sending the data, not a remote computer. For consistency, we swap
1377 * them back here. This does not apply if we are getting stats from the
1378 * vport layer because it always tracks stats from the perspective of the
1380 if (!netdev_dev->have_vport_stats) {
1382 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1383 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1384 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1385 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1386 stats->rx_length_errors = 0;
1387 stats->rx_over_errors = 0;
1388 stats->rx_crc_errors = 0;
1389 stats->rx_frame_errors = 0;
1390 stats->rx_fifo_errors = 0;
1391 stats->rx_missed_errors = 0;
1392 stats->tx_aborted_errors = 0;
1393 stats->tx_carrier_errors = 0;
1394 stats->tx_fifo_errors = 0;
1395 stats->tx_heartbeat_errors = 0;
1396 stats->tx_window_errors = 0;
1398 stats->rx_dropped += dev_stats.tx_dropped;
1399 stats->tx_dropped += dev_stats.rx_dropped;
1401 stats->rx_errors += dev_stats.tx_errors;
1402 stats->tx_errors += dev_stats.rx_errors;
1404 stats->multicast += dev_stats.multicast;
1405 stats->collisions += dev_stats.collisions;
1410 /* Stores the features supported by 'netdev' into each of '*current',
1411 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1412 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
1413 * successful, otherwise a positive errno value. */
1415 netdev_linux_get_features(const struct netdev *netdev,
1416 uint32_t *current, uint32_t *advertised,
1417 uint32_t *supported, uint32_t *peer)
1419 struct ethtool_cmd ecmd;
1422 memset(&ecmd, 0, sizeof ecmd);
1423 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1424 ETHTOOL_GSET, "ETHTOOL_GSET");
1429 /* Supported features. */
1431 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1432 *supported |= OFPPF_10MB_HD;
1434 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1435 *supported |= OFPPF_10MB_FD;
1437 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1438 *supported |= OFPPF_100MB_HD;
1440 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1441 *supported |= OFPPF_100MB_FD;
1443 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1444 *supported |= OFPPF_1GB_HD;
1446 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1447 *supported |= OFPPF_1GB_FD;
1449 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1450 *supported |= OFPPF_10GB_FD;
1452 if (ecmd.supported & SUPPORTED_TP) {
1453 *supported |= OFPPF_COPPER;
1455 if (ecmd.supported & SUPPORTED_FIBRE) {
1456 *supported |= OFPPF_FIBER;
1458 if (ecmd.supported & SUPPORTED_Autoneg) {
1459 *supported |= OFPPF_AUTONEG;
1461 if (ecmd.supported & SUPPORTED_Pause) {
1462 *supported |= OFPPF_PAUSE;
1464 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1465 *supported |= OFPPF_PAUSE_ASYM;
1468 /* Advertised features. */
1470 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1471 *advertised |= OFPPF_10MB_HD;
1473 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1474 *advertised |= OFPPF_10MB_FD;
1476 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1477 *advertised |= OFPPF_100MB_HD;
1479 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1480 *advertised |= OFPPF_100MB_FD;
1482 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1483 *advertised |= OFPPF_1GB_HD;
1485 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1486 *advertised |= OFPPF_1GB_FD;
1488 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1489 *advertised |= OFPPF_10GB_FD;
1491 if (ecmd.advertising & ADVERTISED_TP) {
1492 *advertised |= OFPPF_COPPER;
1494 if (ecmd.advertising & ADVERTISED_FIBRE) {
1495 *advertised |= OFPPF_FIBER;
1497 if (ecmd.advertising & ADVERTISED_Autoneg) {
1498 *advertised |= OFPPF_AUTONEG;
1500 if (ecmd.advertising & ADVERTISED_Pause) {
1501 *advertised |= OFPPF_PAUSE;
1503 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1504 *advertised |= OFPPF_PAUSE_ASYM;
1507 /* Current settings. */
1508 if (ecmd.speed == SPEED_10) {
1509 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1510 } else if (ecmd.speed == SPEED_100) {
1511 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1512 } else if (ecmd.speed == SPEED_1000) {
1513 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1514 } else if (ecmd.speed == SPEED_10000) {
1515 *current = OFPPF_10GB_FD;
1520 if (ecmd.port == PORT_TP) {
1521 *current |= OFPPF_COPPER;
1522 } else if (ecmd.port == PORT_FIBRE) {
1523 *current |= OFPPF_FIBER;
1527 *current |= OFPPF_AUTONEG;
1530 /* Peer advertisements. */
1531 *peer = 0; /* XXX */
1536 /* Set the features advertised by 'netdev' to 'advertise'. */
1538 netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1540 struct ethtool_cmd ecmd;
1543 memset(&ecmd, 0, sizeof ecmd);
1544 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1545 ETHTOOL_GSET, "ETHTOOL_GSET");
1550 ecmd.advertising = 0;
1551 if (advertise & OFPPF_10MB_HD) {
1552 ecmd.advertising |= ADVERTISED_10baseT_Half;
1554 if (advertise & OFPPF_10MB_FD) {
1555 ecmd.advertising |= ADVERTISED_10baseT_Full;
1557 if (advertise & OFPPF_100MB_HD) {
1558 ecmd.advertising |= ADVERTISED_100baseT_Half;
1560 if (advertise & OFPPF_100MB_FD) {
1561 ecmd.advertising |= ADVERTISED_100baseT_Full;
1563 if (advertise & OFPPF_1GB_HD) {
1564 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1566 if (advertise & OFPPF_1GB_FD) {
1567 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1569 if (advertise & OFPPF_10GB_FD) {
1570 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1572 if (advertise & OFPPF_COPPER) {
1573 ecmd.advertising |= ADVERTISED_TP;
1575 if (advertise & OFPPF_FIBER) {
1576 ecmd.advertising |= ADVERTISED_FIBRE;
1578 if (advertise & OFPPF_AUTONEG) {
1579 ecmd.advertising |= ADVERTISED_Autoneg;
1581 if (advertise & OFPPF_PAUSE) {
1582 ecmd.advertising |= ADVERTISED_Pause;
1584 if (advertise & OFPPF_PAUSE_ASYM) {
1585 ecmd.advertising |= ADVERTISED_Asym_Pause;
1587 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1588 ETHTOOL_SSET, "ETHTOOL_SSET");
1591 /* If 'netdev_name' is the name of a VLAN network device (e.g. one created with
1592 * vconfig(8)), sets '*vlan_vid' to the VLAN VID associated with that device
1593 * and returns 0. Otherwise returns a errno value (specifically ENOENT if
1594 * 'netdev_name' is the name of a network device that is not a VLAN device) and
1595 * sets '*vlan_vid' to -1. */
1597 netdev_linux_get_vlan_vid(const struct netdev *netdev, int *vlan_vid)
1599 const char *netdev_name = netdev_get_name(netdev);
1600 struct ds line = DS_EMPTY_INITIALIZER;
1601 FILE *stream = NULL;
1605 COVERAGE_INC(netdev_get_vlan_vid);
1606 fn = xasprintf("/proc/net/vlan/%s", netdev_name);
1607 stream = fopen(fn, "r");
1613 if (ds_get_line(&line, stream)) {
1614 if (ferror(stream)) {
1616 VLOG_ERR_RL(&rl, "error reading \"%s\": %s", fn, strerror(errno));
1619 VLOG_ERR_RL(&rl, "unexpected end of file reading \"%s\"", fn);
1624 if (!sscanf(ds_cstr(&line), "%*s VID: %d", vlan_vid)) {
1626 VLOG_ERR_RL(&rl, "parse error reading \"%s\" line 1: \"%s\"",
1627 fn, ds_cstr(&line));
1645 #define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress"
1646 #define POLICE_CONFIG_CMD "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 u32 match ip src 0.0.0.0/0 police rate %dkbit burst %dk mtu 65535 drop flowid :1"
1648 /* Remove ingress policing from 'netdev'. Returns 0 if successful, otherwise a
1649 * positive errno value.
1651 * This function is equivalent to running
1652 * /sbin/tc qdisc del dev %s handle ffff: ingress
1653 * but it is much, much faster.
1656 netdev_linux_remove_policing(struct netdev *netdev)
1658 struct netdev_dev_linux *netdev_dev =
1659 netdev_dev_linux_cast(netdev_get_dev(netdev));
1660 const char *netdev_name = netdev_get_name(netdev);
1662 struct ofpbuf request;
1663 struct tcmsg *tcmsg;
1666 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
1670 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
1671 tcmsg->tcm_parent = TC_H_INGRESS;
1672 nl_msg_put_string(&request, TCA_KIND, "ingress");
1673 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
1675 error = tc_transact(&request, NULL);
1676 if (error && error != ENOENT && error != EINVAL) {
1677 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1678 netdev_name, strerror(error));
1682 netdev_dev->kbits_rate = 0;
1683 netdev_dev->kbits_burst = 0;
1684 netdev_dev->cache_valid |= VALID_POLICING;
1688 /* Attempts to set input rate limiting (policing) policy. */
1690 netdev_linux_set_policing(struct netdev *netdev,
1691 uint32_t kbits_rate, uint32_t kbits_burst)
1693 struct netdev_dev_linux *netdev_dev =
1694 netdev_dev_linux_cast(netdev_get_dev(netdev));
1695 const char *netdev_name = netdev_get_name(netdev);
1698 COVERAGE_INC(netdev_set_policing);
1700 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1701 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1702 : kbits_burst); /* Stick with user-specified value. */
1704 if (netdev_dev->cache_valid & VALID_POLICING
1705 && netdev_dev->kbits_rate == kbits_rate
1706 && netdev_dev->kbits_burst == kbits_burst) {
1707 /* Assume that settings haven't changed since we last set them. */
1711 netdev_linux_remove_policing(netdev);
1713 snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
1714 if (system(command) != 0) {
1715 VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
1719 snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
1720 kbits_rate, kbits_burst);
1721 if (system(command) != 0) {
1722 VLOG_WARN_RL(&rl, "%s: problem configuring policing",
1727 netdev_dev->kbits_rate = kbits_rate;
1728 netdev_dev->kbits_burst = kbits_burst;
1729 netdev_dev->cache_valid |= VALID_POLICING;
1736 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1739 const struct tc_ops **opsp;
1741 for (opsp = tcs; *opsp != NULL; opsp++) {
1742 const struct tc_ops *ops = *opsp;
1743 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1744 sset_add(types, ops->ovs_name);
1750 static const struct tc_ops *
1751 tc_lookup_ovs_name(const char *name)
1753 const struct tc_ops **opsp;
1755 for (opsp = tcs; *opsp != NULL; opsp++) {
1756 const struct tc_ops *ops = *opsp;
1757 if (!strcmp(name, ops->ovs_name)) {
1764 static const struct tc_ops *
1765 tc_lookup_linux_name(const char *name)
1767 const struct tc_ops **opsp;
1769 for (opsp = tcs; *opsp != NULL; opsp++) {
1770 const struct tc_ops *ops = *opsp;
1771 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1778 static struct tc_queue *
1779 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1782 struct netdev_dev_linux *netdev_dev =
1783 netdev_dev_linux_cast(netdev_get_dev(netdev));
1784 struct tc_queue *queue;
1786 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1787 if (queue->queue_id == queue_id) {
/* Returns the queue of 'netdev' with id 'queue_id', or NULL if there is
 * none. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    size_t bucket_hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, bucket_hash);
}
1801 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1803 struct netdev_qos_capabilities *caps)
1805 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1809 caps->n_queues = ops->n_queues;
1814 netdev_linux_get_qos(const struct netdev *netdev,
1815 const char **typep, struct shash *details)
1817 struct netdev_dev_linux *netdev_dev =
1818 netdev_dev_linux_cast(netdev_get_dev(netdev));
1821 error = tc_query_qdisc(netdev);
1826 *typep = netdev_dev->tc->ops->ovs_name;
1827 return (netdev_dev->tc->ops->qdisc_get
1828 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
1833 netdev_linux_set_qos(struct netdev *netdev,
1834 const char *type, const struct shash *details)
1836 struct netdev_dev_linux *netdev_dev =
1837 netdev_dev_linux_cast(netdev_get_dev(netdev));
1838 const struct tc_ops *new_ops;
1841 new_ops = tc_lookup_ovs_name(type);
1842 if (!new_ops || !new_ops->tc_install) {
1846 error = tc_query_qdisc(netdev);
1851 if (new_ops == netdev_dev->tc->ops) {
1852 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1854 /* Delete existing qdisc. */
1855 error = tc_del_qdisc(netdev);
1859 assert(netdev_dev->tc == NULL);
1861 /* Install new qdisc. */
1862 error = new_ops->tc_install(netdev, details);
1863 assert((error == 0) == (netdev_dev->tc != NULL));
1870 netdev_linux_get_queue(const struct netdev *netdev,
1871 unsigned int queue_id, struct shash *details)
1873 struct netdev_dev_linux *netdev_dev =
1874 netdev_dev_linux_cast(netdev_get_dev(netdev));
1877 error = tc_query_qdisc(netdev);
1881 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1883 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
1889 netdev_linux_set_queue(struct netdev *netdev,
1890 unsigned int queue_id, const struct shash *details)
1892 struct netdev_dev_linux *netdev_dev =
1893 netdev_dev_linux_cast(netdev_get_dev(netdev));
1896 error = tc_query_qdisc(netdev);
1899 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1900 || !netdev_dev->tc->ops->class_set) {
1904 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
1908 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1910 struct netdev_dev_linux *netdev_dev =
1911 netdev_dev_linux_cast(netdev_get_dev(netdev));
1914 error = tc_query_qdisc(netdev);
1917 } else if (!netdev_dev->tc->ops->class_delete) {
1920 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1922 ? netdev_dev->tc->ops->class_delete(netdev, queue)
1928 netdev_linux_get_queue_stats(const struct netdev *netdev,
1929 unsigned int queue_id,
1930 struct netdev_queue_stats *stats)
1932 struct netdev_dev_linux *netdev_dev =
1933 netdev_dev_linux_cast(netdev_get_dev(netdev));
1936 error = tc_query_qdisc(netdev);
1939 } else if (!netdev_dev->tc->ops->class_get_stats) {
1942 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1944 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
1950 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1952 struct ofpbuf request;
1953 struct tcmsg *tcmsg;
1955 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1959 tcmsg->tcm_parent = 0;
1960 nl_dump_start(dump, rtnl_sock, &request);
1961 ofpbuf_uninit(&request);
1966 netdev_linux_dump_queues(const struct netdev *netdev,
1967 netdev_dump_queues_cb *cb, void *aux)
1969 struct netdev_dev_linux *netdev_dev =
1970 netdev_dev_linux_cast(netdev_get_dev(netdev));
1971 struct tc_queue *queue;
1972 struct shash details;
1976 error = tc_query_qdisc(netdev);
1979 } else if (!netdev_dev->tc->ops->class_get) {
1984 shash_init(&details);
1985 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
1986 shash_clear(&details);
1988 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
1990 (*cb)(queue->queue_id, &details, aux);
1995 shash_destroy(&details);
2001 netdev_linux_dump_queue_stats(const struct netdev *netdev,
2002 netdev_dump_queue_stats_cb *cb, void *aux)
2004 struct netdev_dev_linux *netdev_dev =
2005 netdev_dev_linux_cast(netdev_get_dev(netdev));
2006 struct nl_dump dump;
2011 error = tc_query_qdisc(netdev);
2014 } else if (!netdev_dev->tc->ops->class_dump_stats) {
2019 if (!start_queue_dump(netdev, &dump)) {
2022 while (nl_dump_next(&dump, &msg)) {
2023 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
2029 error = nl_dump_done(&dump);
2030 return error ? error : last_error;
2034 netdev_linux_get_in4(const struct netdev *netdev_,
2035 struct in_addr *address, struct in_addr *netmask)
2037 struct netdev_dev_linux *netdev_dev =
2038 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2040 if (!(netdev_dev->cache_valid & VALID_IN4)) {
2043 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
2044 SIOCGIFADDR, "SIOCGIFADDR");
2049 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
2050 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2055 netdev_dev->cache_valid |= VALID_IN4;
2057 *address = netdev_dev->address;
2058 *netmask = netdev_dev->netmask;
2059 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
2063 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2064 struct in_addr netmask)
2066 struct netdev_dev_linux *netdev_dev =
2067 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2070 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
2072 netdev_dev->cache_valid |= VALID_IN4;
2073 netdev_dev->address = address;
2074 netdev_dev->netmask = netmask;
2075 if (address.s_addr != INADDR_ANY) {
2076 error = do_set_addr(netdev_, SIOCSIFNETMASK,
2077 "SIOCSIFNETMASK", netmask);
/* Parses 'line', in the format of a /proc/net/if_inet6 entry: 32 hex digits
 * of address, four hex fields, then an interface name of up to 16
 * characters.  On success stores the address in '*in6' and the name in
 * 'ifname' and returns true; otherwise returns false, with the outputs
 * possibly partially written. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *bytes = in6->s6_addr;
    int n_converted;

#define X8 "%2"SCNx8
    n_converted = sscanf(line,
                         " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                         "%*x %*x %*x %*x %16s\n",
                         &bytes[0], &bytes[1], &bytes[2], &bytes[3],
                         &bytes[4], &bytes[5], &bytes[6], &bytes[7],
                         &bytes[8], &bytes[9], &bytes[10], &bytes[11],
                         &bytes[12], &bytes[13], &bytes[14], &bytes[15],
                         ifname);

    /* 16 address bytes plus the interface name. */
    return n_converted == 17;
}
2099 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2100 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2102 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2104 struct netdev_dev_linux *netdev_dev =
2105 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2106 if (!(netdev_dev->cache_valid & VALID_IN6)) {
2110 netdev_dev->in6 = in6addr_any;
2112 file = fopen("/proc/net/if_inet6", "r");
2114 const char *name = netdev_get_name(netdev_);
2115 while (fgets(line, sizeof line, file)) {
2116 struct in6_addr in6_tmp;
2117 char ifname[16 + 1];
2118 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2119 && !strcmp(name, ifname))
2121 netdev_dev->in6 = in6_tmp;
2127 netdev_dev->cache_valid |= VALID_IN6;
2129 *in6 = netdev_dev->in6;
/* Initializes '*sa' as an AF_INET socket address holding 'addr' with port 0.
 * All other bytes of '*sa' are zeroed. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    memset(&in4, 0, sizeof in4);
    in4.sin_family = AF_INET;
    in4.sin_port = 0;
    in4.sin_addr = addr;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &in4, sizeof in4);
}
2147 do_set_addr(struct netdev *netdev,
2148 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2151 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2152 make_in4_sockaddr(&ifr.ifr_addr, addr);
2154 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2158 /* Adds 'router' as a default IP gateway. */
2160 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2162 struct in_addr any = { INADDR_ANY };
2166 memset(&rt, 0, sizeof rt);
2167 make_in4_sockaddr(&rt.rt_dst, any);
2168 make_in4_sockaddr(&rt.rt_gateway, router);
2169 make_in4_sockaddr(&rt.rt_genmask, any);
2170 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2171 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2173 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
2179 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2182 static const char fn[] = "/proc/net/route";
2187 *netdev_name = NULL;
2188 stream = fopen(fn, "r");
2189 if (stream == NULL) {
2190 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2195 while (fgets(line, sizeof line, stream)) {
2198 ovs_be32 dest, gateway, mask;
2199 int refcnt, metric, mtu;
2200 unsigned int flags, use, window, irtt;
2203 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2205 iface, &dest, &gateway, &flags, &refcnt,
2206 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2208 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2212 if (!(flags & RTF_UP)) {
2213 /* Skip routes that aren't up. */
2217 /* The output of 'dest', 'mask', and 'gateway' were given in
2218 * network byte order, so we don't need need any endian
2219 * conversions here. */
2220 if ((dest & mask) == (host->s_addr & mask)) {
2222 /* The host is directly reachable. */
2223 next_hop->s_addr = 0;
2225 /* To reach the host, we must go through a gateway. */
2226 next_hop->s_addr = gateway;
2228 *netdev_name = xstrdup(iface);
2240 netdev_linux_get_status(const struct netdev *netdev, struct shash *sh)
2242 struct ethtool_drvinfo drvinfo;
2245 memset(&drvinfo, 0, sizeof drvinfo);
2246 error = netdev_linux_do_ethtool(netdev_get_name(netdev),
2247 (struct ethtool_cmd *)&drvinfo,
2249 "ETHTOOL_GDRVINFO");
2251 shash_add(sh, "driver_name", xstrdup(drvinfo.driver));
2252 shash_add(sh, "driver_version", xstrdup(drvinfo.version));
2253 shash_add(sh, "firmware_version", xstrdup(drvinfo.fw_version));
2259 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2260 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2261 * returns 0. Otherwise, it returns a positive errno value; in particular,
2262 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2264 netdev_linux_arp_lookup(const struct netdev *netdev,
2265 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2268 struct sockaddr_in sin;
2271 memset(&r, 0, sizeof r);
2272 memset(&sin, 0, sizeof sin);
2273 sin.sin_family = AF_INET;
2274 sin.sin_addr.s_addr = ip;
2276 memcpy(&r.arp_pa, &sin, sizeof sin);
2277 r.arp_ha.sa_family = ARPHRD_ETHER;
2279 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2280 COVERAGE_INC(netdev_arp_lookup);
2281 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2283 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2284 } else if (retval != ENXIO) {
2285 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2286 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
2292 nd_to_iff_flags(enum netdev_flags nd)
2295 if (nd & NETDEV_UP) {
2298 if (nd & NETDEV_PROMISC) {
2305 iff_to_nd_flags(int iff)
2307 enum netdev_flags nd = 0;
2311 if (iff & IFF_PROMISC) {
2312 nd |= NETDEV_PROMISC;
2318 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2319 enum netdev_flags on, enum netdev_flags *old_flagsp)
2321 int old_flags, new_flags;
2324 error = get_flags(netdev, &old_flags);
2326 *old_flagsp = iff_to_nd_flags(old_flags);
2327 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2328 if (new_flags != old_flags) {
2329 error = set_flags(netdev, new_flags);
2336 netdev_linux_change_seq(const struct netdev *netdev)
2338 return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
2341 #define NETDEV_LINUX_CLASS(NAME, CREATE, ENUMERATE, GET_STATS, SET_STATS) \
2345 netdev_linux_init, \
2347 netdev_linux_wait, \
2350 netdev_linux_destroy, \
2351 NULL, /* get_config */ \
2352 NULL, /* set_config */ \
2354 netdev_linux_open, \
2355 netdev_linux_close, \
2359 netdev_linux_listen, \
2360 netdev_linux_recv, \
2361 netdev_linux_recv_wait, \
2362 netdev_linux_drain, \
2364 netdev_linux_send, \
2365 netdev_linux_send_wait, \
2367 netdev_linux_set_etheraddr, \
2368 netdev_linux_get_etheraddr, \
2369 netdev_linux_get_mtu, \
2370 netdev_linux_set_mtu, \
2371 netdev_linux_get_ifindex, \
2372 netdev_linux_get_carrier, \
2373 netdev_linux_set_miimon_interval, \
2377 netdev_linux_get_features, \
2378 netdev_linux_set_advertisements, \
2379 netdev_linux_get_vlan_vid, \
2381 netdev_linux_set_policing, \
2382 netdev_linux_get_qos_types, \
2383 netdev_linux_get_qos_capabilities, \
2384 netdev_linux_get_qos, \
2385 netdev_linux_set_qos, \
2386 netdev_linux_get_queue, \
2387 netdev_linux_set_queue, \
2388 netdev_linux_delete_queue, \
2389 netdev_linux_get_queue_stats, \
2390 netdev_linux_dump_queues, \
2391 netdev_linux_dump_queue_stats, \
2393 netdev_linux_get_in4, \
2394 netdev_linux_set_in4, \
2395 netdev_linux_get_in6, \
2396 netdev_linux_add_router, \
2397 netdev_linux_get_next_hop, \
2398 netdev_linux_get_status, \
2399 netdev_linux_arp_lookup, \
2401 netdev_linux_update_flags, \
2403 netdev_linux_change_seq \
2406 const struct netdev_class netdev_linux_class =
2409 netdev_linux_create,
2410 netdev_linux_enumerate,
2411 netdev_linux_get_stats,
2412 NULL); /* set_stats */
2414 const struct netdev_class netdev_tap_class =
2417 netdev_linux_create_tap,
2418 NULL, /* enumerate */
2419 netdev_pseudo_get_stats,
2420 NULL); /* set_stats */
2422 const struct netdev_class netdev_internal_class =
2425 netdev_linux_create,
2426 NULL, /* enumerate */
2427 netdev_pseudo_get_stats,
2428 netdev_vport_set_stats);
2430 /* HTB traffic control class. */
2432 #define HTB_N_QUEUES 0xf000
2436 unsigned int max_rate; /* In bytes/s. */
2440 struct tc_queue tc_queue;
2441 unsigned int min_rate; /* In bytes/s. */
2442 unsigned int max_rate; /* In bytes/s. */
2443 unsigned int burst; /* In bytes. */
2444 unsigned int priority; /* Lower values are higher priorities. */
2448 htb_get__(const struct netdev *netdev)
2450 struct netdev_dev_linux *netdev_dev =
2451 netdev_dev_linux_cast(netdev_get_dev(netdev));
2452 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
2456 htb_install__(struct netdev *netdev, uint64_t max_rate)
2458 struct netdev_dev_linux *netdev_dev =
2459 netdev_dev_linux_cast(netdev_get_dev(netdev));
2462 htb = xmalloc(sizeof *htb);
2463 tc_init(&htb->tc, &tc_ops_htb);
2464 htb->max_rate = max_rate;
2466 netdev_dev->tc = &htb->tc;
2469 /* Create an HTB qdisc.
2471 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2473 htb_setup_qdisc__(struct netdev *netdev)
2476 struct tc_htb_glob opt;
2477 struct ofpbuf request;
2478 struct tcmsg *tcmsg;
2480 tc_del_qdisc(netdev);
2482 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2483 NLM_F_EXCL | NLM_F_CREATE, &request);
2487 tcmsg->tcm_handle = tc_make_handle(1, 0);
2488 tcmsg->tcm_parent = TC_H_ROOT;
2490 nl_msg_put_string(&request, TCA_KIND, "htb");
2492 memset(&opt, 0, sizeof opt);
2493 opt.rate2quantum = 10;
2497 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2498 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2499 nl_msg_end_nested(&request, opt_offset);
2501 return tc_transact(&request, NULL);
2504 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2505 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2507 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2508 unsigned int parent, struct htb_class *class)
2511 struct tc_htb_opt opt;
2512 struct ofpbuf request;
2513 struct tcmsg *tcmsg;
2517 error = netdev_get_mtu(netdev, &mtu);
2519 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2520 netdev_get_name(netdev));
2524 memset(&opt, 0, sizeof opt);
2525 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2526 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2527 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2528 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2529 opt.prio = class->priority;
2531 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2535 tcmsg->tcm_handle = handle;
2536 tcmsg->tcm_parent = parent;
2538 nl_msg_put_string(&request, TCA_KIND, "htb");
2539 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2540 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2541 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2542 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2543 nl_msg_end_nested(&request, opt_offset);
2545 error = tc_transact(&request, NULL);
2547 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2548 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2549 netdev_get_name(netdev),
2550 tc_get_major(handle), tc_get_minor(handle),
2551 tc_get_major(parent), tc_get_minor(parent),
2552 class->min_rate, class->max_rate,
2553 class->burst, class->priority, strerror(error));
2558 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2559 * description of them into 'details'. The description complies with the
2560 * specification given in the vswitch database documentation for linux-htb
2563 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2565 static const struct nl_policy tca_htb_policy[] = {
2566 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2567 .min_len = sizeof(struct tc_htb_opt) },
2570 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2571 const struct tc_htb_opt *htb;
2573 if (!nl_parse_nested(nl_options, tca_htb_policy,
2574 attrs, ARRAY_SIZE(tca_htb_policy))) {
2575 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2579 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2580 class->min_rate = htb->rate.rate;
2581 class->max_rate = htb->ceil.rate;
2582 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2583 class->priority = htb->prio;
2588 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2589 struct htb_class *options,
2590 struct netdev_queue_stats *stats)
2592 struct nlattr *nl_options;
2593 unsigned int handle;
2596 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2597 if (!error && queue_id) {
2598 unsigned int major = tc_get_major(handle);
2599 unsigned int minor = tc_get_minor(handle);
2600 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2601 *queue_id = minor - 1;
2606 if (!error && options) {
2607 error = htb_parse_tca_options__(nl_options, options);
2613 htb_parse_qdisc_details__(struct netdev *netdev,
2614 const struct shash *details, struct htb_class *hc)
2616 const char *max_rate_s;
2618 max_rate_s = shash_find_data(details, "max-rate");
2619 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2620 if (!hc->max_rate) {
2623 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2624 hc->max_rate = netdev_features_to_bps(current) / 8;
2626 hc->min_rate = hc->max_rate;
2632 htb_parse_class_details__(struct netdev *netdev,
2633 const struct shash *details, struct htb_class *hc)
2635 const struct htb *htb = htb_get__(netdev);
2636 const char *min_rate_s = shash_find_data(details, "min-rate");
2637 const char *max_rate_s = shash_find_data(details, "max-rate");
2638 const char *burst_s = shash_find_data(details, "burst");
2639 const char *priority_s = shash_find_data(details, "priority");
2642 error = netdev_get_mtu(netdev, &mtu);
2644 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2645 netdev_get_name(netdev));
2649 /* HTB requires at least an mtu sized min-rate to send any traffic even
2650 * on uncongested links. */
2651 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2652 hc->min_rate = MAX(hc->min_rate, mtu);
2653 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2656 hc->max_rate = (max_rate_s
2657 ? strtoull(max_rate_s, NULL, 10) / 8
2659 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2660 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2664 * According to hints in the documentation that I've read, it is important
2665 * that 'burst' be at least as big as the largest frame that might be
2666 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2667 * but having it a bit too small is a problem. Since netdev_get_mtu()
2668 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2669 * the MTU. We actually add 64, instead of 14, as a guard against
2670 * additional headers get tacked on somewhere that we're not aware of. */
2671 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2672 hc->burst = MAX(hc->burst, mtu + 64);
2675 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Asks the kernel about the HTB class 'handle' under 'parent' on 'netdev' and
 * parses the reply into 'options' and 'stats' (see htb_parse_tcmsg__()).
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
htb_query_class__(const struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error;

    error = tc_query_class(netdev, handle, parent, &reply);
    if (error) {
        return error;
    }

    error = htb_parse_tcmsg__(reply, NULL, options, stats);
    ofpbuf_delete(reply);
    return error;
}
2697 htb_tc_install(struct netdev *netdev, const struct shash *details)
2701 error = htb_setup_qdisc__(netdev);
2703 struct htb_class hc;
2705 htb_parse_qdisc_details__(netdev, details, &hc);
2706 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2707 tc_make_handle(1, 0), &hc);
2709 htb_install__(netdev, hc.max_rate);
2715 static struct htb_class *
2716 htb_class_cast__(const struct tc_queue *queue)
2718 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2722 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2723 const struct htb_class *hc)
2725 struct htb *htb = htb_get__(netdev);
2726 size_t hash = hash_int(queue_id, 0);
2727 struct tc_queue *queue;
2728 struct htb_class *hcp;
2730 queue = tc_find_queue__(netdev, queue_id, hash);
2732 hcp = htb_class_cast__(queue);
2734 hcp = xmalloc(sizeof *hcp);
2735 queue = &hcp->tc_queue;
2736 queue->queue_id = queue_id;
2737 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2740 hcp->min_rate = hc->min_rate;
2741 hcp->max_rate = hc->max_rate;
2742 hcp->burst = hc->burst;
2743 hcp->priority = hc->priority;
2747 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2750 struct nl_dump dump;
2751 struct htb_class hc;
2753 /* Get qdisc options. */
2755 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2756 htb_install__(netdev, hc.max_rate);
2759 if (!start_queue_dump(netdev, &dump)) {
2762 while (nl_dump_next(&dump, &msg)) {
2763 unsigned int queue_id;
2765 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2766 htb_update_queue__(netdev, queue_id, &hc);
2769 nl_dump_done(&dump);
2775 htb_tc_destroy(struct tc *tc)
2777 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2778 struct htb_class *hc, *next;
2780 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2781 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2789 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2791 const struct htb *htb = htb_get__(netdev);
2792 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
2797 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2799 struct htb_class hc;
2802 htb_parse_qdisc_details__(netdev, details, &hc);
2803 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2804 tc_make_handle(1, 0), &hc);
2806 htb_get__(netdev)->max_rate = hc.max_rate;
2812 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2813 const struct tc_queue *queue, struct shash *details)
2815 const struct htb_class *hc = htb_class_cast__(queue);
2817 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2818 if (hc->min_rate != hc->max_rate) {
2819 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2821 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2823 shash_add(details, "priority", xasprintf("%u", hc->priority));
2829 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2830 const struct shash *details)
2832 struct htb_class hc;
2835 error = htb_parse_class_details__(netdev, details, &hc);
2840 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2841 tc_make_handle(1, 0xfffe), &hc);
2846 htb_update_queue__(netdev, queue_id, &hc);
2851 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2853 struct htb_class *hc = htb_class_cast__(queue);
2854 struct htb *htb = htb_get__(netdev);
2857 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2859 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2866 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2867 struct netdev_queue_stats *stats)
2869 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2870 tc_make_handle(1, 0xfffe), NULL, stats);
2874 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2875 const struct ofpbuf *nlmsg,
2876 netdev_dump_queue_stats_cb *cb, void *aux)
2878 struct netdev_queue_stats stats;
2879 unsigned int handle, major, minor;
2882 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2887 major = tc_get_major(handle);
2888 minor = tc_get_minor(handle);
2889 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2890 (*cb)(minor - 1, &stats, aux);
2895 static const struct tc_ops tc_ops_htb = {
2896 "htb", /* linux_name */
2897 "linux-htb", /* ovs_name */
2898 HTB_N_QUEUES, /* n_queues */
2907 htb_class_get_stats,
2908 htb_class_dump_stats
2911 /* "linux-hfsc" traffic control class. */
2913 #define HFSC_N_QUEUES 0xf000
2921 struct tc_queue tc_queue;
2926 static struct hfsc *
2927 hfsc_get__(const struct netdev *netdev)
2929 struct netdev_dev_linux *netdev_dev;
2930 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2931 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
2934 static struct hfsc_class *
2935 hfsc_class_cast__(const struct tc_queue *queue)
2937 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
2941 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2943 struct netdev_dev_linux * netdev_dev;
2946 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2947 hfsc = xmalloc(sizeof *hfsc);
2948 tc_init(&hfsc->tc, &tc_ops_hfsc);
2949 hfsc->max_rate = max_rate;
2950 netdev_dev->tc = &hfsc->tc;
2954 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2955 const struct hfsc_class *hc)
2959 struct hfsc_class *hcp;
2960 struct tc_queue *queue;
2962 hfsc = hfsc_get__(netdev);
2963 hash = hash_int(queue_id, 0);
2965 queue = tc_find_queue__(netdev, queue_id, hash);
2967 hcp = hfsc_class_cast__(queue);
2969 hcp = xmalloc(sizeof *hcp);
2970 queue = &hcp->tc_queue;
2971 queue->queue_id = queue_id;
2972 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
2975 hcp->min_rate = hc->min_rate;
2976 hcp->max_rate = hc->max_rate;
2980 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2982 const struct tc_service_curve *rsc, *fsc, *usc;
2983 static const struct nl_policy tca_hfsc_policy[] = {
2985 .type = NL_A_UNSPEC,
2987 .min_len = sizeof(struct tc_service_curve),
2990 .type = NL_A_UNSPEC,
2992 .min_len = sizeof(struct tc_service_curve),
2995 .type = NL_A_UNSPEC,
2997 .min_len = sizeof(struct tc_service_curve),
3000 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3002 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3003 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3004 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3008 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3009 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3010 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3012 if (rsc->m1 != 0 || rsc->d != 0 ||
3013 fsc->m1 != 0 || fsc->d != 0 ||
3014 usc->m1 != 0 || usc->d != 0) {
3015 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3016 "Non-linear service curves are not supported.");
3020 if (rsc->m2 != fsc->m2) {
3021 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3022 "Real-time service curves are not supported ");
3026 if (rsc->m2 > usc->m2) {
3027 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3028 "Min-rate service curve is greater than "
3029 "the max-rate service curve.");
3033 class->min_rate = fsc->m2;
3034 class->max_rate = usc->m2;
3039 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3040 struct hfsc_class *options,
3041 struct netdev_queue_stats *stats)
3044 unsigned int handle;
3045 struct nlattr *nl_options;
3047 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3053 unsigned int major, minor;
3055 major = tc_get_major(handle);
3056 minor = tc_get_minor(handle);
3057 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3058 *queue_id = minor - 1;
3065 error = hfsc_parse_tca_options__(nl_options, options);
/* Asks the kernel about the HFSC class 'handle' under 'parent' on 'netdev'
 * and parses the reply into 'options' and 'stats' (see
 * hfsc_parse_tcmsg__()).
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error;

    error = tc_query_class(netdev, handle, parent, &reply);
    if (!error) {
        error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
        ofpbuf_delete(reply);
    }
    return error;
}
3090 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
3091 struct hfsc_class *class)
3094 const char *max_rate_s;
3096 max_rate_s = shash_find_data(details, "max-rate");
3097 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3102 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
3103 max_rate = netdev_features_to_bps(current) / 8;
3106 class->min_rate = max_rate;
3107 class->max_rate = max_rate;
3111 hfsc_parse_class_details__(struct netdev *netdev,
3112 const struct shash *details,
3113 struct hfsc_class * class)
3115 const struct hfsc *hfsc;
3116 uint32_t min_rate, max_rate;
3117 const char *min_rate_s, *max_rate_s;
3119 hfsc = hfsc_get__(netdev);
3120 min_rate_s = shash_find_data(details, "min-rate");
3121 max_rate_s = shash_find_data(details, "max-rate");
3123 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3124 min_rate = MAX(min_rate, 1);
3125 min_rate = MIN(min_rate, hfsc->max_rate);
3127 max_rate = (max_rate_s
3128 ? strtoull(max_rate_s, NULL, 10) / 8
3130 max_rate = MAX(max_rate, min_rate);
3131 max_rate = MIN(max_rate, hfsc->max_rate);
3133 class->min_rate = min_rate;
3134 class->max_rate = max_rate;
3139 /* Create an HFSC qdisc.
3141 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3143 hfsc_setup_qdisc__(struct netdev * netdev)
3145 struct tcmsg *tcmsg;
3146 struct ofpbuf request;
3147 struct tc_hfsc_qopt opt;
3149 tc_del_qdisc(netdev);
3151 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3152 NLM_F_EXCL | NLM_F_CREATE, &request);
3158 tcmsg->tcm_handle = tc_make_handle(1, 0);
3159 tcmsg->tcm_parent = TC_H_ROOT;
3161 memset(&opt, 0, sizeof opt);
3164 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3165 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3167 return tc_transact(&request, NULL);
3170 /* Create an HFSC class.
3172 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3173 * sc rate <min_rate> ul rate <max_rate>" */
3175 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3176 unsigned int parent, struct hfsc_class *class)
3180 struct tcmsg *tcmsg;
3181 struct ofpbuf request;
3182 struct tc_service_curve min, max;
3184 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3190 tcmsg->tcm_handle = handle;
3191 tcmsg->tcm_parent = parent;
3195 min.m2 = class->min_rate;
3199 max.m2 = class->max_rate;
3201 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3202 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3203 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3204 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3205 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3206 nl_msg_end_nested(&request, opt_offset);
3208 error = tc_transact(&request, NULL);
3210 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3211 "min-rate %ubps, max-rate %ubps (%s)",
3212 netdev_get_name(netdev),
3213 tc_get_major(handle), tc_get_minor(handle),
3214 tc_get_major(parent), tc_get_minor(parent),
3215 class->min_rate, class->max_rate, strerror(error));
3222 hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3225 struct hfsc_class class;
3227 error = hfsc_setup_qdisc__(netdev);
3233 hfsc_parse_qdisc_details__(netdev, details, &class);
3234 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3235 tc_make_handle(1, 0), &class);
3241 hfsc_install__(netdev, class.max_rate);
3246 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3249 struct nl_dump dump;
3250 struct hfsc_class hc;
3253 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3254 hfsc_install__(netdev, hc.max_rate);
3256 if (!start_queue_dump(netdev, &dump)) {
3260 while (nl_dump_next(&dump, &msg)) {
3261 unsigned int queue_id;
3263 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3264 hfsc_update_queue__(netdev, queue_id, &hc);
3268 nl_dump_done(&dump);
3273 hfsc_tc_destroy(struct tc *tc)
3276 struct hfsc_class *hc, *next;
3278 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3280 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3281 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3290 hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3292 const struct hfsc *hfsc;
3293 hfsc = hfsc_get__(netdev);
3294 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
3299 hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3302 struct hfsc_class class;
3304 hfsc_parse_qdisc_details__(netdev, details, &class);
3305 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3306 tc_make_handle(1, 0), &class);
3309 hfsc_get__(netdev)->max_rate = class.max_rate;
3316 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3317 const struct tc_queue *queue, struct shash *details)
3319 const struct hfsc_class *hc;
3321 hc = hfsc_class_cast__(queue);
3322 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3323 if (hc->min_rate != hc->max_rate) {
3324 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
3330 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3331 const struct shash *details)
3334 struct hfsc_class class;
3336 error = hfsc_parse_class_details__(netdev, details, &class);
3341 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3342 tc_make_handle(1, 0xfffe), &class);
3347 hfsc_update_queue__(netdev, queue_id, &class);
3352 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3356 struct hfsc_class *hc;
3358 hc = hfsc_class_cast__(queue);
3359 hfsc = hfsc_get__(netdev);
3361 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3363 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3370 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3371 struct netdev_queue_stats *stats)
3373 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3374 tc_make_handle(1, 0xfffe), NULL, stats);
3378 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3379 const struct ofpbuf *nlmsg,
3380 netdev_dump_queue_stats_cb *cb, void *aux)
3382 struct netdev_queue_stats stats;
3383 unsigned int handle, major, minor;
3386 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3391 major = tc_get_major(handle);
3392 minor = tc_get_minor(handle);
3393 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3394 (*cb)(minor - 1, &stats, aux);
3399 static const struct tc_ops tc_ops_hfsc = {
3400 "hfsc", /* linux_name */
3401 "linux-hfsc", /* ovs_name */
3402 HFSC_N_QUEUES, /* n_queues */
3403 hfsc_tc_install, /* tc_install */
3404 hfsc_tc_load, /* tc_load */
3405 hfsc_tc_destroy, /* tc_destroy */
3406 hfsc_qdisc_get, /* qdisc_get */
3407 hfsc_qdisc_set, /* qdisc_set */
3408 hfsc_class_get, /* class_get */
3409 hfsc_class_set, /* class_set */
3410 hfsc_class_delete, /* class_delete */
3411 hfsc_class_get_stats, /* class_get_stats */
3412 hfsc_class_dump_stats /* class_dump_stats */
3415 /* "linux-default" traffic control class.
3417 * This class represents the default, unnamed Linux qdisc. It corresponds to
3418 * the "" (empty string) QoS type in the OVS database. */
3421 default_install__(struct netdev *netdev)
3423 struct netdev_dev_linux *netdev_dev =
3424 netdev_dev_linux_cast(netdev_get_dev(netdev));
3425 static struct tc *tc;
3428 tc = xmalloc(sizeof *tc);
3429 tc_init(tc, &tc_ops_default);
3431 netdev_dev->tc = tc;
3435 default_tc_install(struct netdev *netdev,
3436 const struct shash *details OVS_UNUSED)
3438 default_install__(netdev);
3443 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3445 default_install__(netdev);
3449 static const struct tc_ops tc_ops_default = {
3450 NULL, /* linux_name */
3455 NULL, /* tc_destroy */
3456 NULL, /* qdisc_get */
3457 NULL, /* qdisc_set */
3458 NULL, /* class_get */
3459 NULL, /* class_set */
3460 NULL, /* class_delete */
3461 NULL, /* class_get_stats */
3462 NULL /* class_dump_stats */
3465 /* "linux-other" traffic control class.
3470 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3472 struct netdev_dev_linux *netdev_dev =
3473 netdev_dev_linux_cast(netdev_get_dev(netdev));
3474 static struct tc *tc;
3477 tc = xmalloc(sizeof *tc);
3478 tc_init(tc, &tc_ops_other);
3480 netdev_dev->tc = tc;
3484 static const struct tc_ops tc_ops_other = {
3485 NULL, /* linux_name */
3486 "linux-other", /* ovs_name */
3488 NULL, /* tc_install */
3490 NULL, /* tc_destroy */
3491 NULL, /* qdisc_get */
3492 NULL, /* qdisc_set */
3493 NULL, /* class_get */
3494 NULL, /* class_set */
3495 NULL, /* class_delete */
3496 NULL, /* class_get_stats */
3497 NULL /* class_dump_stats */
/* Traffic control. */

/* Number of kernel "tc" ticks per second. */
static double ticks_per_s;

/* Number of kernel "jiffies" per second.  This is used for the purpose of
 * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
 * one jiffy's worth of data.
 *
 * There are two possibilities here:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
 *      approximate range of 100 to 1024.  That means that we really need to
 *      make sure that the qdisc can buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
 *      has finely granular timers and there's no need to fudge additional room
 *      for buffers.  (There's no extra effort needed to implement that: the
 *      large 'buffer_hz' is used as a divisor, so practically any number will
 *      come out as 0 in the division.  Small integer results in the case of
 *      really high dividends won't have any real effect anyhow.)
 *
 * A value of 0 means that the parameters have not been read yet. */
static unsigned int buffer_hz;
/* Returns tc handle 'major':'minor', that is, 'major' in the upper 16 bits
 * and 'minor' in the lower 16 bits. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int shifted_major = major << 16;

    return TC_H_MAKE(shifted_major, minor);
}
/* Returns the major number from 'handle', i.e. its upper 16 bits. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
/* Returns the minor number from 'handle', i.e. its lower 16 bits. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor_bits = TC_H_MIN(handle);

    return minor_bits;
}
3545 static struct tcmsg *
3546 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3547 struct ofpbuf *request)
3549 struct tcmsg *tcmsg;
3553 error = get_ifindex(netdev, &ifindex);
3558 ofpbuf_init(request, 512);
3559 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3560 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3561 tcmsg->tcm_family = AF_UNSPEC;
3562 tcmsg->tcm_ifindex = ifindex;
3563 /* Caller should fill in tcmsg->tcm_handle. */
3564 /* Caller should fill in tcmsg->tcm_parent. */
3570 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3572 int error = nl_sock_transact(rtnl_sock, request, replyp);
3573 ofpbuf_uninit(request);
3580 /* The values in psched are not individually very meaningful, but they are
3581 * important. The tables below show some values seen in the wild.
3585 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3586 * (Before that, there are hints that it was 1000000000.)
3588 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3592 * -----------------------------------
3593 * [1] 000c8000 000f4240 000f4240 00000064
3594 * [2] 000003e8 00000400 000f4240 3b9aca00
3595 * [3] 000003e8 00000400 000f4240 3b9aca00
3596 * [4] 000003e8 00000400 000f4240 00000064
3597 * [5] 000003e8 00000040 000f4240 3b9aca00
3598 * [6] 000003e8 00000040 000f4240 000000f9
3600 * a b c d ticks_per_s buffer_hz
3601 * ------- --------- ---------- ------------- ----------- -------------
3602 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3603 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3604 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3605 * [4] 1,000 1,024 1,000,000 100 976,562 100
3606 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3607 * [6] 1,000 64 1,000,000 249 15,625,000 249
3609 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3610 * [2] 2.6.26-1-686-bigmem from Debian lenny
3611 * [3] 2.6.26-2-sparc64 from Debian lenny
3612 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3613 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3614 * [6] 2.6.34 from kernel.org on KVM
3616 static const char fn[] = "/proc/net/psched";
3617 unsigned int a, b, c, d;
3623 stream = fopen(fn, "r");
3625 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3629 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3630 VLOG_WARN("%s: read failed", fn);
3634 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3638 VLOG_WARN("%s: invalid scheduler parameters", fn);
3642 ticks_per_s = (double) a * c / b;
3646 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3649 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3652 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3653 * rate of 'rate' bytes per second. */
3655 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3660 return (rate * ticks) / ticks_per_s;
3663 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3664 * rate of 'rate' bytes per second. */
3666 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3671 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3674 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3675 * a transmission rate of 'rate' bytes per second. */
3677 tc_buffer_per_jiffy(unsigned int rate)
3682 return rate / buffer_hz;
3685 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3686 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3687 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3688 * stores NULL into it if it is absent.
3690 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3693 * Returns 0 if successful, otherwise a positive errno value. */
3695 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3696 struct nlattr **options)
3698 static const struct nl_policy tca_policy[] = {
3699 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3700 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3702 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3704 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3705 tca_policy, ta, ARRAY_SIZE(ta))) {
3706 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3711 *kind = nl_attr_get_string(ta[TCA_KIND]);
3715 *options = ta[TCA_OPTIONS];
3730 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3731 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3732 * into '*options', and its queue statistics into '*stats'. Any of the output
3733 * arguments may be null.
3735 * Returns 0 if successful, otherwise a positive errno value. */
3737 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3738 struct nlattr **options, struct netdev_queue_stats *stats)
3740 static const struct nl_policy tca_policy[] = {
3741 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3742 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3744 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3746 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3747 tca_policy, ta, ARRAY_SIZE(ta))) {
3748 VLOG_WARN_RL(&rl, "failed to parse class message");
3753 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3754 *handlep = tc->tcm_handle;
3758 *options = ta[TCA_OPTIONS];
3762 const struct gnet_stats_queue *gsq;
3763 struct gnet_stats_basic gsb;
3765 static const struct nl_policy stats_policy[] = {
3766 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3767 .min_len = sizeof gsb },
3768 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3769 .min_len = sizeof *gsq },
3771 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3773 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3774 sa, ARRAY_SIZE(sa))) {
3775 VLOG_WARN_RL(&rl, "failed to parse class stats");
3779 /* Alignment issues screw up the length of struct gnet_stats_basic on
3780 * some arch/bitsize combinations. Newer versions of Linux have a
3781 * struct gnet_stats_basic_packed, but we can't depend on that. The
3782 * easiest thing to do is just to make a copy. */
3783 memset(&gsb, 0, sizeof gsb);
3784 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3785 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3786 stats->tx_bytes = gsb.bytes;
3787 stats->tx_packets = gsb.packets;
3789 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3790 stats->tx_errors = gsq->drops;
3800 memset(stats, 0, sizeof *stats);
3805 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3808 tc_query_class(const struct netdev *netdev,
3809 unsigned int handle, unsigned int parent,
3810 struct ofpbuf **replyp)
3812 struct ofpbuf request;
3813 struct tcmsg *tcmsg;
3816 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3820 tcmsg->tcm_handle = handle;
3821 tcmsg->tcm_parent = parent;
3823 error = tc_transact(&request, replyp);
3825 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3826 netdev_get_name(netdev),
3827 tc_get_major(handle), tc_get_minor(handle),
3828 tc_get_major(parent), tc_get_minor(parent),
3834 /* Equivalent to "tc class del dev <name> handle <handle>". */
3836 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3838 struct ofpbuf request;
3839 struct tcmsg *tcmsg;
3842 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3846 tcmsg->tcm_handle = handle;
3847 tcmsg->tcm_parent = 0;
3849 error = tc_transact(&request, NULL);
3851 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3852 netdev_get_name(netdev),
3853 tc_get_major(handle), tc_get_minor(handle),
3859 /* Equivalent to "tc qdisc del dev <name> root". */
3861 tc_del_qdisc(struct netdev *netdev)
3863 struct netdev_dev_linux *netdev_dev =
3864 netdev_dev_linux_cast(netdev_get_dev(netdev));
3865 struct ofpbuf request;
3866 struct tcmsg *tcmsg;
3869 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
3873 tcmsg->tcm_handle = tc_make_handle(1, 0);
3874 tcmsg->tcm_parent = TC_H_ROOT;
3876 error = tc_transact(&request, NULL);
3877 if (error == EINVAL) {
3878 /* EINVAL probably means that the default qdisc was in use, in which
3879 * case we've accomplished our purpose. */
3882 if (!error && netdev_dev->tc) {
3883 if (netdev_dev->tc->ops->tc_destroy) {
3884 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3886 netdev_dev->tc = NULL;
3891 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3892 * kernel to determine what they are. Returns 0 if successful, otherwise a
3893 * positive errno value. */
/* Flow, as far as the statements below show: an RTM_GETQDISC request for
 * handle 1:0 is built with tc_make_request() and sent with tc_transact().
 * The reply's kind (from tc_parse_qdisc()) is mapped to a 'struct tc_ops'
 * by tc_lookup_linux_name(); tc_ops_other is the fallback (an unrecognized
 * kind is logged as "unknown qdisc"), and an ENOENT result selects
 * tc_ops_default.  The chosen ops->tc_load() is then called with the reply,
 * and the reply buffer is released with ofpbuf_delete().
 *
 * NOTE(review): the early-exit taken when netdev_dev->tc is already set and
 * the handling of a failed tc_make_request() are not visible here -- confirm
 * them against the full file. */
3895 tc_query_qdisc(const struct netdev *netdev)
3897 struct netdev_dev_linux *netdev_dev =
3898 netdev_dev_linux_cast(netdev_get_dev(netdev));
3899 struct ofpbuf request, *qdisc;
3900 const struct tc_ops *ops;
3901 struct tcmsg *tcmsg;
3905 if (netdev_dev->tc) {
3909 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3910 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3911 * 2.6.35 without that fix backported to it.
3913 * To avoid the OOPS, we must not make a request that would attempt to dump
3914 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3915 * few others. There are a few ways that I can see to do this, but most of
3916 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3917 * technique chosen here is to assume that any non-default qdisc that we
3918 * create will have a class with handle 1:0. The built-in qdiscs only have
3919 * a class with handle 0:0.
3921 * We could check for Linux 2.6.35+ and use a more straightforward method
3923 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
3927 tcmsg->tcm_handle = tc_make_handle(1, 0);
3928 tcmsg->tcm_parent = 0;
3930 /* Figure out what tc class to instantiate. */
3931 error = tc_transact(&request, &qdisc);
3935 error = tc_parse_qdisc(qdisc, &kind, NULL);
3937 ops = &tc_ops_other;
3939 ops = tc_lookup_linux_name(kind);
3941 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3942 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3944 ops = &tc_ops_other;
3947 } else if (error == ENOENT) {
3948 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3949 * other entity that doesn't have a handle 1:0. We will assume
3950 * that it's the system default qdisc. */
3951 ops = &tc_ops_default;
3954 /* Who knows? Maybe the device got deleted. */
3955 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3956 netdev_get_name(netdev), strerror(error));
3957 ops = &tc_ops_other;
3960 /* Instantiate it. */
/* tc_load() is handed a non-const netdev, hence the cast that drops the
 * const qualifier of the parameter. */
3961 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
/* Invariant: netdev_dev->tc is non-null exactly when tc_load() returned 0. */
3962 assert((load_error == 0) == (netdev_dev->tc != NULL));
3963 ofpbuf_delete(qdisc);
/* A nonzero query error is reported in preference to the load error. */
3965 return error ? error : load_error;
3968 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3969 approximate the time to transmit packets of various lengths. For an MTU of
3970 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3971 represents two possible packet lengths; for an MTU of 513 through 1024, four
3972 possible lengths; and so on.
3974 Returns, for the specified 'mtu', the number of bits that packet lengths
3975 need to be shifted right to fit within such a 256-entry table. */
3977 tc_calc_cell_log(unsigned int mtu)
/* Substitutes ETH_PAYLOAD_MAX for 'mtu'.  NOTE(review): the condition that
 * guards this assignment is not visible here -- presumably 'mtu' == 0;
 * confirm. */
3982 mtu = ETH_PAYLOAD_MAX;
/* The shift is computed for 'mtu' plus ETH_HEADER_LEN and VLAN_HEADER_LEN,
 * not for the bare 'mtu'. */
3984 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
/* 'cell_log' counts the passes made while the size is still >= 256.
 * NOTE(review): the loop body is not visible here -- presumably it shifts
 * 'mtu' right by one bit per pass; confirm. */
3986 for (cell_log = 0; mtu >= 256; cell_log++) {
/* As shown below: zeroes '*rate', derives 'cell_log' from 'mtu' with
 * tc_calc_cell_log(), and sets 'mpu' to ETH_TOTAL_MIN.  The 'overhead' and
 * 'cell_align' members are deliberately not assigned by name (see the
 * commented-out statements); they keep the zero written by memset().
 *
 * NOTE(review): no statement that stores 'Bps' is visible here -- confirm
 * in the full file where the rate itself is recorded. */
3993 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
3996 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
3998 memset(rate, 0, sizeof *rate);
3999 rate->cell_log = tc_calc_cell_log(mtu);
4000 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4001 /* rate->cell_align = 0; */ /* distro headers. */
4002 rate->mpu = ETH_TOTAL_MIN;
4006 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4007 * attribute of the specified "type".
4009 * See tc_calc_cell_log() above for a description of "rtab"s. */
4011 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
/* 'rtab' points at the TC_RTAB_SIZE-byte attribute payload returned by
 * nl_msg_put_unspec_uninit(); the loop below writes every entry of it. */
4016 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4017 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
/* Entry 'i' is costed for a packet of (i + 1) << cell_log bytes, but never
 * for fewer than rate->mpu bytes. */
4018 unsigned packet_size = (i + 1) << rate->cell_log;
4019 if (packet_size < rate->mpu) {
4020 packet_size = rate->mpu;
/* Store the tc_bytes_to_ticks() result for that size at rate->rate. */
4022 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* As shown below: the burst actually used is the larger of 'burst_bytes'
 * and a floor of tc_buffer_per_jiffy(Bps) + 'mtu'; the return value is that
 * byte count converted by tc_bytes_to_ticks() at rate 'Bps'. */
4026 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
4027 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
4028 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
4031 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
4033 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
4034 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
4037 /* Copies 'src' into 'dst', performing format conversion in the process. */
/* Each of the 21 assignments below copies one counter to the 'dst' member
 * of the same name; any change of integer width is whatever the assignment
 * performs implicitly.  No other member of 'dst' is written here. */
4039 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4040 const struct rtnl_link_stats *src)
4042 dst->rx_packets = src->rx_packets;
4043 dst->tx_packets = src->tx_packets;
4044 dst->rx_bytes = src->rx_bytes;
4045 dst->tx_bytes = src->tx_bytes;
4046 dst->rx_errors = src->rx_errors;
4047 dst->tx_errors = src->tx_errors;
4048 dst->rx_dropped = src->rx_dropped;
4049 dst->tx_dropped = src->tx_dropped;
4050 dst->multicast = src->multicast;
4051 dst->collisions = src->collisions;
4052 dst->rx_length_errors = src->rx_length_errors;
4053 dst->rx_over_errors = src->rx_over_errors;
4054 dst->rx_crc_errors = src->rx_crc_errors;
4055 dst->rx_frame_errors = src->rx_frame_errors;
4056 dst->rx_fifo_errors = src->rx_fifo_errors;
4057 dst->rx_missed_errors = src->rx_missed_errors;
4058 dst->tx_aborted_errors = src->tx_aborted_errors;
4059 dst->tx_carrier_errors = src->tx_carrier_errors;
4060 dst->tx_fifo_errors = src->tx_fifo_errors;
4061 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4062 dst->tx_window_errors = src->tx_window_errors;
4066 /* Utility functions. */
/* Asks the kernel for link information about interface index 'ifindex' with
 * an RTM_GETLINK transaction on 'rtnl_sock'.  When the reply carries an
 * IFLA_STATS attribute, it is converted into '*stats' by
 * netdev_stats_from_rtnl_link_stats().  The reply buffer is released with
 * ofpbuf_delete() on every path shown below.
 *
 * NOTE(review): the return statements are not visible here -- confirm the
 * error codes used for a failed transaction, a reply that fails the policy
 * and a reply without stats. */
4069 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4071 /* Policy for RTNLGRP_LINK messages.
4073 * There are *many* more fields in these messages, but currently we only
4074 * care about these fields. */
4075 static const struct nl_policy rtnlgrp_link_policy[] = {
4076 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4077 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4078 .min_len = sizeof(struct rtnl_link_stats) },
4081 struct ofpbuf request;
4082 struct ofpbuf *reply;
4083 struct ifinfomsg *ifi;
4084 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
/* Build the request: a netlink header followed by a zeroed ifinfomsg that
 * selects the interface by index. */
4087 ofpbuf_init(&request, 0);
4088 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4089 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4090 ifi->ifi_family = PF_UNSPEC;
4091 ifi->ifi_index = ifindex;
4092 error = nl_sock_transact(rtnl_sock, &request, &reply);
/* The request buffer is released as soon as the transaction has run. */
4093 ofpbuf_uninit(&request);
/* Attribute parsing starts after the netlink header and the ifinfomsg. */
4098 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4099 rtnlgrp_link_policy,
4100 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4101 ofpbuf_delete(reply);
/* IFLA_STATS is optional in the policy, so its absence is checked here. */
4105 if (!attrs[IFLA_STATS]) {
4106 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4107 ofpbuf_delete(reply);
4111 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4113 ofpbuf_delete(reply);
/* Reads /proc/net/dev line by line with fgets() and scans each line with
 * sscanf() directly into the members of '*stats'.  A line on which sscanf()
 * does not convert exactly 15 items is logged as a parse error.  When the
 * scanned device name equals 'netdev_name', seven further members of
 * '*stats' are set to UINT64_MAX.  If the loop ends without the function
 * having left it, "no stats for" is logged.
 *
 * NOTE(review): because every scanned line writes into '*stats', its
 * contents after a failed lookup come from whichever line was scanned
 * last.  The return statements and the fclose() calls are not visible
 * here -- confirm them in the full file. */
4119 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4121 static const char fn[] = "/proc/net/dev";
4126 stream = fopen(fn, "r");
4128 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4133 while (fgets(line, sizeof line, stream)) {
/* X64 is the scanf conversion for one uint64_t counter. */
4136 #define X64 "%"SCNu64
/* Each of the two groups below scans seven 64-bit counters and then skips
 * one more unsigned field (the assignment-suppressed %*u conversion). */
4139 X64 X64 X64 X64 X64 X64 X64 "%*u"
4140 X64 X64 X64 X64 X64 X64 X64 "%*u",
4146 &stats->rx_fifo_errors,
4147 &stats->rx_frame_errors,
4153 &stats->tx_fifo_errors,
4155 &stats->tx_carrier_errors) != 15) {
4156 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4157 } else if (!strcmp(devname, netdev_name)) {
/* These seven members get UINT64_MAX instead of a scanned value.
 * NOTE(review): presumably UINT64_MAX marks a counter as unavailable from
 * this source -- confirm against the struct netdev_stats documentation. */
4158 stats->rx_length_errors = UINT64_MAX;
4159 stats->rx_over_errors = UINT64_MAX;
4160 stats->rx_crc_errors = UINT64_MAX;
4161 stats->rx_missed_errors = UINT64_MAX;
4162 stats->tx_aborted_errors = UINT64_MAX;
4163 stats->tx_heartbeat_errors = UINT64_MAX;
4164 stats->tx_window_errors = UINT64_MAX;
4170 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Queries the interface flags of 'netdev' with the SIOCGIFFLAGS ioctl,
 * issued through netdev_linux_do_ioctl() under the device's name, and
 * stores ifr.ifr_flags in '*flags'.
 *
 * NOTE(review): the return statement is not visible here; 'error' holds the
 * result of netdev_linux_do_ioctl(). */
4176 get_flags(const struct netdev *netdev, int *flags)
4181 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
4183 *flags = ifr.ifr_flags;
/* Sets the interface flags of 'netdev' to 'flags' with the SIOCSIFFLAGS
 * ioctl, issued through netdev_linux_do_ioctl() under the device's name.
 * Returns whatever netdev_linux_do_ioctl() returns. */
4188 set_flags(struct netdev *netdev, int flags)
4192 ifr.ifr_flags = flags;
4193 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
/* Looks up the interface index of the device named 'netdev_name' with
 * ioctl(SIOCGIFINDEX) on 'af_inet_sock'.  On success returns
 * ifr.ifr_ifindex.  A failed ioctl is reported with a rate-limited warning.
 *
 * NOTE(review): the value returned after a failed ioctl is not visible
 * here -- confirm it in the full file. */
4198 do_get_ifindex(const char *netdev_name)
/* ovs_strzcpy() is bounded by the size of ifr.ifr_name. */
4202 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4203 COVERAGE_INC(netdev_get_ifindex);
4204 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4205 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4206 netdev_name, strerror(errno));
4209 return ifr.ifr_ifindex;
/* Stores the interface index of 'netdev_' in '*ifindexp', using the value
 * cached in the underlying netdev_dev_linux.  When the VALID_IFINDEX bit of
 * 'cache_valid' is clear, the index is first obtained with do_get_ifindex()
 * and recorded together with that bit.
 *
 * NOTE(review): the handling of a failed do_get_ifindex() and the return
 * statements are not visible here -- confirm them in the full file. */
4213 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4215 struct netdev_dev_linux *netdev_dev =
4216 netdev_dev_linux_cast(netdev_get_dev(netdev_));
4218 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4219 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
/* Remember the index and mark the cached value as valid. */
4223 netdev_dev->cache_valid |= VALID_IFINDEX;
4224 netdev_dev->ifindex = ifindex;
4226 *ifindexp = netdev_dev->ifindex;
/* Reads the hardware address of the device named 'netdev_name' with
 * ioctl(SIOCGIFHWADDR) on 'af_inet_sock' and copies its first ETH_ADDR_LEN
 * bytes into 'ea'.  An address family other than AF_UNSPEC or ARPHRD_ETHER
 * is reported with a warning.
 *
 * NOTE(review): the return statements are not visible here.  From the
 * statements shown, the copy into 'ea' appears to happen even after the
 * unknown-family warning -- confirm in the full file. */
4231 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
/* Start from an all-zero request, then fill in the device name. */
4236 memset(&ifr, 0, sizeof ifr);
4237 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4238 COVERAGE_INC(netdev_get_hwaddr);
4239 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4240 /* ENODEV probably means that a vif disappeared asynchronously and
4241 * hasn't been removed from the database yet, so reduce the log level
4242 * to INFO for that case. */
4243 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4244 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4245 netdev_name, strerror(errno));
4248 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4249 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4250 VLOG_WARN("%s device has unknown hardware address family %d",
4251 netdev_name, hwaddr_family);
4253 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware address of the device named 'netdev_name' to the
 * ETH_ADDR_LEN bytes at 'mac', tagged with address family 'hwaddr_family',
 * using ioctl(SIOCSIFHWADDR) on 'af_inet_sock'.  A failed ioctl is logged
 * at error level.
 *
 * NOTE(review): the return statements are not visible here -- confirm them
 * in the full file. */
4258 set_etheraddr(const char *netdev_name, int hwaddr_family,
4259 const uint8_t mac[ETH_ADDR_LEN])
/* Start from an all-zero request so that the unused bytes of sa_data stay
 * zero after the ETH_ADDR_LEN-byte copy below. */
4263 memset(&ifr, 0, sizeof ifr);
4264 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4265 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4266 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4267 COVERAGE_INC(netdev_set_hwaddr);
4268 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4269 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4270 netdev_name, strerror(errno));
/* Issues an ethtool request for the device named 'name' with
 * ioctl(SIOCETHTOOL) on 'af_inet_sock', passing 'ecmd' to the kernel through
 * ifr.ifr_data.  'cmd_name' names the command in the warning that is logged
 * when the ioctl fails with anything other than EOPNOTSUPP.
 *
 * NOTE(review): the statement that stores 'cmd' into the request and the
 * return statements are not visible here -- confirm them in the full
 * file. */
4277 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4278 int cmd, const char *cmd_name)
4282 memset(&ifr, 0, sizeof ifr);
4283 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4284 ifr.ifr_data = (caddr_t) ecmd;
4287 COVERAGE_INC(netdev_ethtool);
4288 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4291 if (errno != EOPNOTSUPP) {
4292 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4293 "failed: %s", cmd_name, name, strerror(errno));
4295 /* The device doesn't support this operation. That's pretty
4296 * common, so there's no point in logging anything. */
4302 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4303 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
/* Three ethtool requests are made, each through netdev_linux_do_ethtool():
 * ETHTOOL_GFLAGS to read the current flags, ETHTOOL_SFLAGS to write the
 * updated value, and ETHTOOL_GFLAGS again to read the result back.  A
 * rate-limited warning naming 'flag_name' is logged when the value read back
 * differs from the value written.
 *
 * NOTE(review): the checks on 'error' after each request and the return
 * statements are not visible here -- confirm them in the full file. */
4305 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4306 const char *flag_name, bool enable)
4308 const char *netdev_name = netdev_get_name(netdev);
4309 struct ethtool_value evalue;
4313 memset(&evalue, 0, sizeof evalue);
/* The 'struct ethtool_value' is passed through the 'struct ethtool_cmd *'
 * parameter of netdev_linux_do_ethtool() by way of a cast. */
4314 error = netdev_linux_do_ethtool(netdev_name,
4315 (struct ethtool_cmd *)&evalue,
4316 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* Replace only the 'flag' bit; all other bits keep the value just read. */
4321 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4322 error = netdev_linux_do_ethtool(netdev_name,
4323 (struct ethtool_cmd *)&evalue,
4324 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
/* Read the flags back into a zeroed structure to verify the change. */
4329 memset(&evalue, 0, sizeof evalue);
4330 error = netdev_linux_do_ethtool(netdev_name,
4331 (struct ethtool_cmd *)&evalue,
4332 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4337 if (new_flags != evalue.data) {
4338 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4339 "device %s failed", enable ? "enable" : "disable",
4340 flag_name, netdev_name);
/* Copies 'name' into ifr->ifr_name and issues ioctl 'cmd' on 'af_inet_sock'
 * with 'ifr' as its argument.  The other members of '*ifr' are passed as the
 * caller set them.  A failed ioctl is reported with a rate-limited
 * debug-level message that uses 'cmd_name' to identify the request.
 *
 * NOTE(review): the return statements are not visible here -- confirm them
 * in the full file. */
4348 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4349 const char *cmd_name)
4351 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4352 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4353 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
/* Runs the address ioctl 'cmd' (identified as 'cmd_name' in log output) for
 * 'netdev' through netdev_linux_do_ioctl(), with the request's address
 * family preset to AF_INET.  The resulting ifr.ifr_addr is read as a
 * 'struct sockaddr_in' and its sin_addr is stored in '*ip'.
 *
 * NOTE(review): the condition under which '*ip' is written and the return
 * statement are not visible here -- presumably '*ip' is written only when
 * the ioctl succeeded; confirm. */
4361 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4362 int cmd, const char *cmd_name)
4367 ifr.ifr_addr.sa_family = AF_INET;
4368 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4370 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4371 *ip = sin->sin_addr;
4376 /* Returns an AF_PACKET raw socket or a negative errno value. */
4378 af_packet_sock(void)
4380 static int sock = INT_MIN;
4382 if (sock == INT_MIN) {
4383 sock = socket(AF_PACKET, SOCK_RAW, 0);
4385 set_nonblocking(sock);
4388 VLOG_ERR("failed to create packet socket: %s", strerror(errno));