2 * Copyright (c) 2009, 2010, 2011 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
24 #include <arpa/inet.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
30 #include <linux/types.h>
31 #include <linux/ethtool.h>
32 #include <linux/mii.h>
33 #include <linux/pkt_cls.h>
34 #include <linux/pkt_sched.h>
35 #include <linux/rtnetlink.h>
36 #include <linux/sockios.h>
37 #include <linux/version.h>
38 #include <sys/types.h>
39 #include <sys/ioctl.h>
40 #include <sys/socket.h>
41 #include <netpacket/packet.h>
43 #include <net/if_arp.h>
44 #include <net/if_packet.h>
45 #include <net/route.h>
46 #include <netinet/in.h>
53 #include "dpif-linux.h"
54 #include "dynamic-string.h"
55 #include "fatal-signal.h"
58 #include "netdev-provider.h"
59 #include "netdev-vport.h"
61 #include "netlink-notifier.h"
62 #include "netlink-socket.h"
64 #include "openflow/openflow.h"
66 #include "poll-loop.h"
67 #include "rtnetlink-link.h"
68 #include "socket-util.h"
74 VLOG_DEFINE_THIS_MODULE(netdev_linux);
76 COVERAGE_DEFINE(netdev_set_policing);
77 COVERAGE_DEFINE(netdev_arp_lookup);
78 COVERAGE_DEFINE(netdev_get_ifindex);
79 COVERAGE_DEFINE(netdev_get_hwaddr);
80 COVERAGE_DEFINE(netdev_set_hwaddr);
81 COVERAGE_DEFINE(netdev_ethtool);
83 /* These were introduced in Linux 2.6.14, so they might be missing if we have
85 #ifndef ADVERTISED_Pause
86 #define ADVERTISED_Pause (1 << 13)
88 #ifndef ADVERTISED_Asym_Pause
89 #define ADVERTISED_Asym_Pause (1 << 14)
92 /* These were introduced in Linux 2.6.24, so they might be missing if we
93 * have old headers. */
94 #ifndef ETHTOOL_GFLAGS
95 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
97 #ifndef ETHTOOL_SFLAGS
98 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
101 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
104 #define TC_RTAB_SIZE 1024
107 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
108 static int cache_notifier_refcount;
111 VALID_IFINDEX = 1 << 0,
112 VALID_ETHERADDR = 1 << 1,
116 VALID_POLICING = 1 << 5,
117 VALID_HAVE_VPORT_STATS = 1 << 6
125 /* Traffic control. */
127 /* An instance of a traffic control class. Always associated with a particular
130 * Each TC implementation subclasses this with whatever additional data it
133 const struct tc_ops *ops;
134 struct hmap queues; /* Contains "struct tc_queue"s.
135 * Read by generic TC layer.
136 * Written only by TC implementation. */
139 /* One traffic control queue.
141 * Each TC implementation subclasses this with whatever additional data it
144 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
145 unsigned int queue_id; /* OpenFlow queue ID. */
148 /* A particular kind of traffic control. Each implementation generally maps to
149 * one particular Linux qdisc class.
151 * The functions below return 0 if successful or a positive errno value on
152 * failure, except where otherwise noted. All of them must be provided, except
153 * where otherwise noted. */
155 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
156 * This is null for tc_ops_default and tc_ops_other, for which there are no
157 * appropriate values. */
158 const char *linux_name;
160 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
161 const char *ovs_name;
163 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
164 * queues. The queues are numbered 0 through n_queues - 1. */
165 unsigned int n_queues;
167 /* Called to install this TC class on 'netdev'. The implementation should
168 * make the Netlink calls required to set up 'netdev' with the right qdisc
169 * and configure it according to 'details'. The implementation may assume
170 * that the current qdisc is the default; that is, there is no need for it
171 * to delete the current qdisc before installing itself.
173 * The contents of 'details' should be documented as valid for 'ovs_name'
174 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
175 * (which is built as ovs-vswitchd.conf.db(8)).
177 * This function must return 0 if and only if it sets 'netdev->tc' to an
178 * initialized 'struct tc'.
180 * (This function is null for tc_ops_other, which cannot be installed. For
181 * other TC classes it should always be nonnull.) */
182 int (*tc_install)(struct netdev *netdev, const struct shash *details);
184 /* Called when the netdev code determines (through a Netlink query) that
185 * this TC class's qdisc is installed on 'netdev', but we didn't install
186 * it ourselves and so don't know any of the details.
188 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
189 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
190 * implementation should parse the other attributes of 'nlmsg' as
191 * necessary to determine its configuration. If necessary it should also
192 * use Netlink queries to determine the configuration of queues on
195 * This function must return 0 if and only if it sets 'netdev->tc' to an
196 * initialized 'struct tc'. */
197 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
199 /* Destroys the data structures allocated by the implementation as part of
200 * 'tc'. (This includes destroying 'tc->queues' by calling
203 * The implementation should not need to perform any Netlink calls. If
204 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
205 * (But it may not be desirable.)
207 * This function may be null if 'tc' is trivial. */
208 void (*tc_destroy)(struct tc *tc);
210 /* Retrieves details of 'netdev->tc' configuration into 'details'.
212 * The implementation should not need to perform any Netlink calls, because
213 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
214 * cached the configuration.
216 * The contents of 'details' should be documented as valid for 'ovs_name'
217 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
218 * (which is built as ovs-vswitchd.conf.db(8)).
220 * This function may be null if 'tc' is not configurable.
222 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
224 /* Reconfigures 'netdev->tc' according to 'details', performing any
225 * required Netlink calls to complete the reconfiguration.
227 * The contents of 'details' should be documented as valid for 'ovs_name'
228 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
229 * (which is built as ovs-vswitchd.conf.db(8)).
231 * This function may be null if 'tc' is not configurable.
233 int (*qdisc_set)(struct netdev *, const struct shash *details);
235 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
236 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
238 * The contents of 'details' should be documented as valid for 'ovs_name'
239 * in the "other_config" column in the "Queue" table in
240 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
242 * The implementation should not need to perform any Netlink calls, because
243 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
244 * cached the queue configuration.
246 * This function may be null if 'tc' does not have queues ('n_queues' is
248 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
249 struct shash *details);
251 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
252 * 'details', performing any required Netlink calls to complete the
253 * reconfiguration. The caller ensures that 'queue_id' is less than
256 * The contents of 'details' should be documented as valid for 'ovs_name'
257 * in the "other_config" column in the "Queue" table in
258 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
260 * This function may be null if 'tc' does not have queues or its queues are
261 * not configurable. */
262 int (*class_set)(struct netdev *, unsigned int queue_id,
263 const struct shash *details);
265 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
266 * tc_queue's within 'netdev->tc->queues'.
268 * This function may be null if 'tc' does not have queues or its queues
269 * cannot be deleted. */
270 int (*class_delete)(struct netdev *, struct tc_queue *queue);
272 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
273 * 'struct tc_queue's within 'netdev->tc->queues'.
275 * On success, initializes '*stats'.
277 * This function may be null if 'tc' does not have queues or if it cannot
278 * report queue statistics. */
279 int (*class_get_stats)(const struct netdev *netdev,
280 const struct tc_queue *queue,
281 struct netdev_queue_stats *stats);
283 /* Extracts queue stats from 'nlmsg', which is a response to a
284 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
286 * This function may be null if 'tc' does not have queues or if it cannot
287 * report queue statistics. */
288 int (*class_dump_stats)(const struct netdev *netdev,
289 const struct ofpbuf *nlmsg,
290 netdev_dump_queue_stats_cb *cb, void *aux);
294 tc_init(struct tc *tc, const struct tc_ops *ops)
297 hmap_init(&tc->queues);
301 tc_destroy(struct tc *tc)
303 hmap_destroy(&tc->queues);
306 static const struct tc_ops tc_ops_htb;
307 static const struct tc_ops tc_ops_hfsc;
308 static const struct tc_ops tc_ops_default;
309 static const struct tc_ops tc_ops_other;
311 static const struct tc_ops *tcs[] = {
312 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
313 &tc_ops_hfsc, /* Hierarchical fair service curve. */
314 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
315 &tc_ops_other, /* Some other qdisc. */
319 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
320 static unsigned int tc_get_major(unsigned int handle);
321 static unsigned int tc_get_minor(unsigned int handle);
323 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
324 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
325 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
327 static struct tcmsg *tc_make_request(const struct netdev *, int type,
328 unsigned int flags, struct ofpbuf *);
329 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
330 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
331 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
334 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
335 struct nlattr **options);
336 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
337 struct nlattr **options,
338 struct netdev_queue_stats *);
339 static int tc_query_class(const struct netdev *,
340 unsigned int handle, unsigned int parent,
341 struct ofpbuf **replyp);
342 static int tc_delete_class(const struct netdev *, unsigned int handle);
344 static int tc_del_qdisc(struct netdev *netdev);
345 static int tc_query_qdisc(const struct netdev *netdev);
347 static int tc_calc_cell_log(unsigned int mtu);
348 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
349 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
350 const struct tc_ratespec *rate);
351 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
353 struct netdev_dev_linux {
354 struct netdev_dev netdev_dev;
356 struct shash_node *shash_node;
357 unsigned int cache_valid;
358 unsigned int change_seq;
360 bool miimon; /* Link status of last poll. */
361 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
362 struct timer miimon_timer;
364 /* The following are figured out "on demand" only. They are only valid
365 * when the corresponding VALID_* bit in 'cache_valid' is set. */
367 uint8_t etheraddr[ETH_ADDR_LEN];
368 struct in_addr address, netmask;
372 long long int carrier_resets;
373 uint32_t kbits_rate; /* Policing data. */
374 uint32_t kbits_burst;
375 bool have_vport_stats;
379 struct tap_state tap;
383 struct netdev_linux {
384 struct netdev netdev;
388 /* Sockets used for ioctl operations. */
389 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
391 /* A Netlink routing socket that is not subscribed to any multicast groups. */
392 static struct nl_sock *rtnl_sock;
394 /* This is set pretty low because we probably won't learn anything from the
395 * additional log messages. */
396 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
398 static int netdev_linux_init(void);
400 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
401 int cmd, const char *cmd_name);
402 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
403 const char *cmd_name);
404 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
405 int cmd, const char *cmd_name);
406 static int get_flags(const struct netdev *, int *flagsp);
407 static int set_flags(struct netdev *, int flags);
408 static int do_get_ifindex(const char *netdev_name);
409 static int get_ifindex(const struct netdev *, int *ifindexp);
410 static int do_set_addr(struct netdev *netdev,
411 int ioctl_nr, const char *ioctl_name,
412 struct in_addr addr);
413 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
414 static int set_etheraddr(const char *netdev_name, int hwaddr_family,
415 const uint8_t[ETH_ADDR_LEN]);
416 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
417 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
418 static int get_carrier_via_sysfs(const char *name, bool *carrier);
419 static int af_packet_sock(void);
420 static void netdev_linux_miimon_run(void);
421 static void netdev_linux_miimon_wait(void);
424 is_netdev_linux_class(const struct netdev_class *netdev_class)
426 return netdev_class->init == netdev_linux_init;
429 static struct netdev_dev_linux *
430 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
432 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
433 assert(is_netdev_linux_class(netdev_class));
435 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
438 static struct netdev_linux *
439 netdev_linux_cast(const struct netdev *netdev)
441 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
442 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
443 assert(is_netdev_linux_class(netdev_class));
445 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
449 netdev_linux_init(void)
451 static int status = -1;
453 /* Create AF_INET socket. */
454 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
455 status = af_inet_sock >= 0 ? 0 : errno;
457 VLOG_ERR("failed to create inet socket: %s", strerror(status));
460 /* Create rtnetlink socket. */
462 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
464 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
473 netdev_linux_run(void)
475 rtnetlink_link_run();
476 netdev_linux_miimon_run();
480 netdev_linux_wait(void)
482 rtnetlink_link_wait();
483 netdev_linux_miimon_wait();
487 netdev_dev_linux_changed(struct netdev_dev_linux *dev)
490 if (!dev->change_seq) {
493 dev->cache_valid = 0;
497 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
498 void *aux OVS_UNUSED)
500 struct netdev_dev_linux *dev;
502 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
504 const struct netdev_class *netdev_class =
505 netdev_dev_get_class(base_dev);
507 if (is_netdev_linux_class(netdev_class)) {
508 dev = netdev_dev_linux_cast(base_dev);
510 if (dev->carrier != change->running) {
511 dev->carrier = change->running;
512 dev->carrier_resets++;
515 netdev_dev_linux_changed(dev);
519 struct shash device_shash;
520 struct shash_node *node;
522 shash_init(&device_shash);
523 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
524 SHASH_FOR_EACH (node, &device_shash) {
529 get_carrier_via_sysfs(node->name, &carrier);
530 if (dev->carrier != carrier) {
531 dev->carrier = carrier;
532 dev->carrier_resets++;
535 netdev_dev_linux_changed(dev);
537 shash_destroy(&device_shash);
542 cache_notifier_ref(void)
544 if (!cache_notifier_refcount) {
545 assert(!netdev_linux_cache_notifier);
547 netdev_linux_cache_notifier =
548 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
550 if (!netdev_linux_cache_notifier) {
554 cache_notifier_refcount++;
560 cache_notifier_unref(void)
562 assert(cache_notifier_refcount > 0);
563 if (!--cache_notifier_refcount) {
564 assert(netdev_linux_cache_notifier);
565 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
566 netdev_linux_cache_notifier = NULL;
570 /* Creates system and internal devices. */
572 netdev_linux_create(const struct netdev_class *class, const char *name,
573 struct netdev_dev **netdev_devp)
575 struct netdev_dev_linux *netdev_dev;
578 error = cache_notifier_ref();
583 netdev_dev = xzalloc(sizeof *netdev_dev);
584 netdev_dev->change_seq = 1;
585 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
586 get_carrier_via_sysfs(name, &netdev_dev->carrier);
588 *netdev_devp = &netdev_dev->netdev_dev;
592 /* For most types of netdevs we open the device for each call of
593 * netdev_open(). However, this is not the case with tap devices,
594 * since it is only possible to open the device once. In this
595 * situation we share a single file descriptor, and consequently
596 * buffers, across all readers. Therefore once data is read it will
597 * be unavailable to other reads for tap devices. */
599 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
600 const char *name, struct netdev_dev **netdev_devp)
602 struct netdev_dev_linux *netdev_dev;
603 struct tap_state *state;
604 static const char tap_dev[] = "/dev/net/tun";
608 netdev_dev = xzalloc(sizeof *netdev_dev);
609 state = &netdev_dev->state.tap;
611 error = cache_notifier_ref();
616 /* Open tap device. */
617 state->fd = open(tap_dev, O_RDWR);
620 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
621 goto error_unref_notifier;
624 /* Create tap device. */
625 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
626 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
627 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
628 VLOG_WARN("%s: creating tap device failed: %s", name,
631 goto error_unref_notifier;
634 /* Make non-blocking. */
635 error = set_nonblocking(state->fd);
637 goto error_unref_notifier;
640 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
641 *netdev_devp = &netdev_dev->netdev_dev;
644 error_unref_notifier:
645 cache_notifier_unref();
652 destroy_tap(struct netdev_dev_linux *netdev_dev)
654 struct tap_state *state = &netdev_dev->state.tap;
656 if (state->fd >= 0) {
661 /* Destroys the netdev device 'netdev_dev_'. */
663 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
665 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
666 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
668 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
669 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
672 if (class == &netdev_tap_class) {
673 destroy_tap(netdev_dev);
677 cache_notifier_unref();
681 netdev_linux_open(struct netdev_dev *netdev_dev_, struct netdev **netdevp)
683 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
684 struct netdev_linux *netdev;
685 enum netdev_flags flags;
688 /* Allocate network device. */
689 netdev = xzalloc(sizeof *netdev);
691 netdev_init(&netdev->netdev, netdev_dev_);
693 /* Verify that the device really exists, by attempting to read its flags.
694 * (The flags might be cached, in which case this won't actually do an
697 * Don't do this for "internal" netdevs, though, because those have to be
698 * created as netdev objects before they exist in the kernel, because
699 * creating them in the kernel happens by passing a netdev object to
700 * dpif_port_add(). */
701 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
702 error = netdev_get_flags(&netdev->netdev, &flags);
703 if (error == ENODEV) {
708 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
709 !netdev_dev->state.tap.opened) {
711 /* We assume that the first user of the tap device is the primary user
712 * and give them the tap FD. Subsequent users probably just expect
713 * this to be a system device so open it normally to avoid send/receive
714 * directions appearing to be reversed. */
715 netdev->fd = netdev_dev->state.tap.fd;
716 netdev_dev->state.tap.opened = true;
719 *netdevp = &netdev->netdev;
723 netdev_uninit(&netdev->netdev, true);
727 /* Closes and destroys 'netdev'. */
729 netdev_linux_close(struct netdev *netdev_)
731 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* Descriptor 0 is a valid file descriptor, so compare inclusively, the same
 * way the other 'netdev->fd' checks in this file do ("< 0" means "no fd").
 * The strcmp() excludes "tap" devices, whose fd is taken from the shared
 * 'netdev_dev->state.tap' in netdev_linux_open().
 * NOTE(review): relies on 'fd' being -1 when unset -- confirm in
 * netdev_linux_open(). */
733 if (netdev->fd >= 0 && strcmp(netdev_get_type(netdev_), "tap")) {
740 netdev_linux_listen(struct netdev *netdev_)
742 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
743 struct sockaddr_ll sll;
748 if (netdev->fd >= 0) {
752 /* Create file descriptor. */
753 fd = socket(PF_PACKET, SOCK_RAW, 0);
756 VLOG_ERR("failed to create raw socket (%s)", strerror(error));
760 /* Set non-blocking mode. */
761 error = set_nonblocking(fd);
766 /* Get ethernet device index. */
767 error = get_ifindex(&netdev->netdev, &ifindex);
772 /* Bind to specific ethernet device. */
773 memset(&sll, 0, sizeof sll);
774 sll.sll_family = AF_PACKET;
775 sll.sll_ifindex = ifindex;
776 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
777 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
779 VLOG_ERR("%s: failed to bind raw socket (%s)",
780 netdev_get_name(netdev_), strerror(error));
795 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
797 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
799 if (netdev->fd < 0) {
800 /* Device is not listening. */
/* MSG_TRUNC asks recv() to report the real packet length even when it is
 * larger than 'size', which is what lets the length check below detect an
 * oversized packet and report -EMSGSIZE. */
805 ssize_t retval = recv(netdev->fd, data, size, MSG_TRUNC);
807 return retval <= size ? retval : -EMSGSIZE;
808 } else if (errno != EINTR) {
809 if (errno != EAGAIN) {
/* Pass the device name first and the error text second so that they match
 * the order of the "%s" conversions in the format string. */
810 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
811 netdev_get_name(netdev_), strerror(errno));
818 /* Registers with the poll loop to wake up from the next call to poll_block()
819 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
821 netdev_linux_recv_wait(struct netdev *netdev_)
823 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
824 if (netdev->fd >= 0) {
825 poll_fd_wait(netdev->fd, POLLIN);
829 /* Discards all packets waiting to be received from 'netdev'. */
831 netdev_linux_drain(struct netdev *netdev_)
833 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
834 if (netdev->fd < 0) {
836 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
838 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
839 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
843 drain_fd(netdev->fd, ifr.ifr_qlen);
846 return drain_rcvbuf(netdev->fd);
850 /* Sends the 'size' bytes in 'data' on 'netdev'. Returns 0 if successful,
851 * otherwise a positive errno value. Returns EAGAIN without blocking if the
852 * packet cannot be queued immediately. Returns EMSGSIZE if a partial packet was
853 * transmitted or if the packet is too big or too small to transmit on the device.
855 * The caller retains ownership of 'data' in all cases.
857 * The kernel maintains a packet transmission queue, so the caller is not
858 * expected to do additional queuing of packets. */
860 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
862 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
866 if (netdev->fd < 0) {
867 /* Use our AF_PACKET socket to send to this device. */
868 struct sockaddr_ll sll;
875 sock = af_packet_sock();
880 error = get_ifindex(netdev_, &ifindex);
885 /* We don't bother setting most fields in sockaddr_ll because the
886 * kernel ignores them for SOCK_RAW. */
887 memset(&sll, 0, sizeof sll);
888 sll.sll_family = AF_PACKET;
889 sll.sll_ifindex = ifindex;
891 iov.iov_base = (void *) data;
895 msg.msg_namelen = sizeof sll;
898 msg.msg_control = NULL;
899 msg.msg_controllen = 0;
902 retval = sendmsg(sock, &msg, 0);
904 /* Use the netdev's own fd to send to this device. This is
905 * essential for tap devices, because packets sent to a tap device
906 * with an AF_PACKET socket will loop back to be *received* again
907 * on the tap device. */
908 retval = write(netdev->fd, data, size);
912 /* The Linux AF_PACKET implementation never blocks waiting for room
913 * for packets, instead returning ENOBUFS. Translate this into
914 * EAGAIN for the caller. */
915 if (errno == ENOBUFS) {
917 } else if (errno == EINTR) {
919 } else if (errno != EAGAIN) {
920 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
921 netdev_get_name(netdev_), strerror(errno));
924 } else if (retval != size) {
925 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
926 "%zu) on %s", retval, size, netdev_get_name(netdev_));
934 /* Registers with the poll loop to wake up from the next call to poll_block()
935 * when the packet transmission queue has sufficient room to transmit a packet
936 * with netdev_send().
938 * The kernel maintains a packet transmission queue, so the client is not
939 * expected to do additional queuing of packets. Thus, this function is
940 * unlikely to ever be used. It is included for completeness. */
942 netdev_linux_send_wait(struct netdev *netdev_)
944 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
945 if (netdev->fd < 0) {
947 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
948 poll_fd_wait(netdev->fd, POLLOUT);
950 /* TAP device always accepts packets.*/
951 poll_immediate_wake();
955 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
956 * otherwise a positive errno value. */
958 netdev_linux_set_etheraddr(struct netdev *netdev_,
959 const uint8_t mac[ETH_ADDR_LEN])
961 struct netdev_dev_linux *netdev_dev =
962 netdev_dev_linux_cast(netdev_get_dev(netdev_));
965 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
966 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
967 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
969 netdev_dev->cache_valid |= VALID_ETHERADDR;
970 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
978 /* Copies 'netdev''s MAC address into the caller-provided buffer 'mac',
979 * querying the device only when no valid cached address is available. */
981 netdev_linux_get_etheraddr(const struct netdev *netdev_,
982 uint8_t mac[ETH_ADDR_LEN])
984 struct netdev_dev_linux *netdev_dev =
985 netdev_dev_linux_cast(netdev_get_dev(netdev_));
986 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
987 int error = get_etheraddr(netdev_get_name(netdev_),
988 netdev_dev->etheraddr);
992 netdev_dev->cache_valid |= VALID_ETHERADDR;
994 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
998 /* Stores in '*mtup' the maximum size of transmitted (and received) packets on
999 * 'netdev', in bytes, not including the hardware header; thus, this is
1000 * typically 1500 bytes for Ethernet devices. */
1002 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1004 struct netdev_dev_linux *netdev_dev =
1005 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1006 if (!(netdev_dev->cache_valid & VALID_MTU)) {
1010 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1011 SIOCGIFMTU, "SIOCGIFMTU");
1015 netdev_dev->mtu = ifr.ifr_mtu;
1016 netdev_dev->cache_valid |= VALID_MTU;
1018 *mtup = netdev_dev->mtu;
1022 /* Sets the maximum transmission unit (MTU) of the given device using the Linux
1023 * networking ioctl interface.
1026 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1028 struct netdev_dev_linux *netdev_dev =
1029 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1034 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1035 SIOCSIFMTU, "SIOCSIFMTU");
1040 netdev_dev->mtu = ifr.ifr_mtu;
1041 netdev_dev->cache_valid |= VALID_MTU;
1045 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1046 * On failure, returns a negative errno value. */
1048 netdev_linux_get_ifindex(const struct netdev *netdev)
1052 error = get_ifindex(netdev, &ifindex);
1053 return error ? -error : ifindex;
1057 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1059 struct netdev_dev_linux *netdev_dev =
1060 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1062 if (netdev_dev->miimon_interval > 0) {
1063 *carrier = netdev_dev->miimon;
1065 *carrier = netdev_dev->carrier;
1071 static long long int
1072 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1074 return netdev_dev_linux_cast(netdev_get_dev(netdev))->carrier_resets;
1078 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1079 struct mii_ioctl_data *data)
1084 memset(&ifr, 0, sizeof ifr);
1085 memcpy(&ifr.ifr_data, data, sizeof *data);
1086 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1087 memcpy(data, &ifr.ifr_data, sizeof *data);
1093 netdev_linux_get_miimon(const char *name, bool *miimon)
1095 struct mii_ioctl_data data;
1100 memset(&data, 0, sizeof data);
1101 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1103 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1104 data.reg_num = MII_BMSR;
1105 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1109 *miimon = !!(data.val_out & BMSR_LSTATUS);
1111 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1114 struct ethtool_cmd ecmd;
1116 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1119 memset(&ecmd, 0, sizeof ecmd);
1120 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1123 struct ethtool_value eval;
1125 memcpy(&eval, &ecmd, sizeof eval);
1126 *miimon = !!eval.data;
1128 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1136 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1137 long long int interval)
1139 struct netdev_dev_linux *netdev_dev;
1141 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
1143 interval = interval > 0 ? MAX(interval, 100) : 0;
1144 if (netdev_dev->miimon_interval != interval) {
1145 netdev_dev->miimon_interval = interval;
1146 timer_set_expired(&netdev_dev->miimon_timer);
1153 netdev_linux_miimon_run(void)
1155 struct shash device_shash;
1156 struct shash_node *node;
1158 shash_init(&device_shash);
1159 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1160 SHASH_FOR_EACH (node, &device_shash) {
1161 struct netdev_dev_linux *dev = node->data;
1164 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1168 netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
1169 if (miimon != dev->miimon) {
1170 dev->miimon = miimon;
1171 netdev_dev_linux_changed(dev);
1174 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1177 shash_destroy(&device_shash);
1181 netdev_linux_miimon_wait(void)
1183 struct shash device_shash;
1184 struct shash_node *node;
1186 shash_init(&device_shash);
1187 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1188 SHASH_FOR_EACH (node, &device_shash) {
1189 struct netdev_dev_linux *dev = node->data;
1191 if (dev->miimon_interval > 0) {
1192 timer_wait(&dev->miimon_timer);
1195 shash_destroy(&device_shash);
1198 /* Check whether we can use RTM_GETLINK to get network device statistics.
1199 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1202 check_for_working_netlink_stats(void)
1204 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1205 * preferable, so if that works, we'll use it. */
1206 int ifindex = do_get_ifindex("lo");
1208 VLOG_WARN("failed to get ifindex for lo, "
1209 "obtaining netdev stats from proc");
1212 struct netdev_stats stats;
1213 int error = get_stats_via_netlink(ifindex, &stats);
1215 VLOG_DBG("obtaining netdev stats via rtnetlink");
1218 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1219 "via proc (you are probably running a pre-2.6.19 "
1220 "kernel)", strerror(error));
/* NOTE(review): only the signature is visible in this listing.  Judging by
 * the name and by its callers in netdev_pseudo_get_stats(), which pass pairs
 * of rx/tx counters, this presumably exchanges '*a' and '*b' -- confirm
 * against the body. */
1227 swap_uint64(uint64_t *a, uint64_t *b)
/* Tries to fill 'stats' for 'netdev_' from the vport layer.
 *
 * The query is made when vport stats are already known to work
 * ('have_vport_stats') or when that has not been determined yet
 * (VALID_HAVE_VPORT_STATS is clear in 'cache_valid').  Afterwards the result
 * is cached: 'have_vport_stats' is true exactly when
 * netdev_vport_get_stats() returned 0, and VALID_HAVE_VPORT_STATS is set.
 * NOTE(review): the condition guarding the warning is not visible here;
 * presumably it is logged only on error. */
1235 get_stats_via_vport(const struct netdev *netdev_,
1236 struct netdev_stats *stats)
1238 struct netdev_dev_linux *netdev_dev =
1239 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1241 if (netdev_dev->have_vport_stats ||
1242 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1245 error = netdev_vport_get_stats(netdev_, stats);
1247 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1248 "(%s)", netdev_get_name(netdev_), strerror(error));
1250 netdev_dev->have_vport_stats = !error;
1251 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
/* Reads kernel statistics for 'netdev_' into 'stats'.
 *
 * The first call decides, via check_for_working_netlink_stats(), whether
 * rtnetlink can be used; the decision is kept in a function-local static.
 * With netlink the device's ifindex is looked up and
 * get_stats_via_netlink() is used; get_stats_via_proc() is the other path.
 * NOTE(review): the 'else' and 'if (error)' lines are not visible in this
 * listing; the warning presumably fires only on failure. */
1256 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1257 struct netdev_stats *stats)
/* -1 means "not probed yet". */
1259 static int use_netlink_stats = -1;
1262 if (use_netlink_stats < 0) {
1263 use_netlink_stats = check_for_working_netlink_stats();
1266 if (use_netlink_stats) {
1269 error = get_ifindex(netdev_, &ifindex);
1271 error = get_stats_via_netlink(ifindex, stats);
1274 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1278 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1279 netdev_get_name(netdev_), error);
1285 /* Retrieves current device stats for 'netdev-linux'. */
/* Vport statistics are requested into 'stats' first, then kernel statistics
 * are read into the local 'dev_stats'.  When vport statistics are in use,
 * the error, drop, multicast and collision counters from the kernel are
 * added on top of them (the '+=' lines below).
 * NOTE(review): the error handling after the kernel query and the statement
 * under the "use ioctl stats" comment are not visible in this listing. */
1287 netdev_linux_get_stats(const struct netdev *netdev_,
1288 struct netdev_stats *stats)
1290 struct netdev_dev_linux *netdev_dev =
1291 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1292 struct netdev_stats dev_stats;
1295 get_stats_via_vport(netdev_, stats);
1297 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1300 if (!netdev_dev->have_vport_stats) {
1307 if (!netdev_dev->have_vport_stats) {
1308 /* stats not available from OVS then use ioctl stats. */
1311 stats->rx_errors += dev_stats.rx_errors;
1312 stats->tx_errors += dev_stats.tx_errors;
1313 stats->rx_dropped += dev_stats.rx_dropped;
1314 stats->tx_dropped += dev_stats.tx_dropped;
1315 stats->multicast += dev_stats.multicast;
1316 stats->collisions += dev_stats.collisions;
1317 stats->rx_length_errors += dev_stats.rx_length_errors;
1318 stats->rx_over_errors += dev_stats.rx_over_errors;
1319 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1320 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1321 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1322 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1323 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1324 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1325 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1326 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1327 stats->tx_window_errors += dev_stats.tx_window_errors;
1332 /* Retrieves current device stats for 'netdev-tap' netdev or
1333 * netdev-internal. */
/* Same two-source scheme as netdev_linux_get_stats(): vport statistics go
 * into 'stats', kernel statistics into the local 'dev_stats'.  The
 * difference is the direction handling: without vport statistics the rx/tx
 * packet, byte, error and drop counters are swapped and the detailed error
 * counters are zeroed; with vport statistics the kernel's tx drops/errors
 * are added to the rx side and vice versa.
 * NOTE(review): the error handling and the 'else' separating those two
 * cases are not visible in this listing. */
1335 netdev_pseudo_get_stats(const struct netdev *netdev_,
1336 struct netdev_stats *stats)
1338 struct netdev_dev_linux *netdev_dev =
1339 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1340 struct netdev_stats dev_stats;
1343 get_stats_via_vport(netdev_, stats);
1345 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1347 if (!netdev_dev->have_vport_stats) {
1354 /* If this port is an internal port then the transmit and receive stats
1355 * will appear to be swapped relative to the other ports since we are the
1356 * one sending the data, not a remote computer. For consistency, we swap
1357 * them back here. This does not apply if we are getting stats from the
1358 * vport layer because it always tracks stats from the perspective of the
1360 if (!netdev_dev->have_vport_stats) {
1362 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1363 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1364 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1365 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1366 stats->rx_length_errors = 0;
1367 stats->rx_over_errors = 0;
1368 stats->rx_crc_errors = 0;
1369 stats->rx_frame_errors = 0;
1370 stats->rx_fifo_errors = 0;
1371 stats->rx_missed_errors = 0;
1372 stats->tx_aborted_errors = 0;
1373 stats->tx_carrier_errors = 0;
1374 stats->tx_fifo_errors = 0;
1375 stats->tx_heartbeat_errors = 0;
1376 stats->tx_window_errors = 0;
1378 stats->rx_dropped += dev_stats.tx_dropped;
1379 stats->tx_dropped += dev_stats.rx_dropped;
1381 stats->rx_errors += dev_stats.tx_errors;
1382 stats->tx_errors += dev_stats.rx_errors;
1384 stats->multicast += dev_stats.multicast;
1385 stats->collisions += dev_stats.collisions;
1390 /* Stores the features supported by 'netdev' into each of '*current',
1391 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1392 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
1393 * successful, otherwise a positive errno value. */
/* The data comes from one ETHTOOL_GSET query; the ethtool SUPPORTED_* and
 * ADVERTISED_* bits are translated one by one to OFPPF_* bits, and the
 * current speed/duplex/port fields are mapped likewise.  '*peer' is always
 * set to 0.
 * NOTE(review): the visible lines write through 'supported', 'advertised',
 * 'current' and 'peer' with no null check in sight, although the contract
 * above allows null -- confirm that callers always pass valid pointers (or
 * that a wrapper substitutes dummies). */
1395 netdev_linux_get_features(const struct netdev *netdev,
1396 uint32_t *current, uint32_t *advertised,
1397 uint32_t *supported, uint32_t *peer)
1399 struct ethtool_cmd ecmd;
1402 memset(&ecmd, 0, sizeof ecmd);
1403 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1404 ETHTOOL_GSET, "ETHTOOL_GSET");
1409 /* Supported features. */
1411 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1412 *supported |= OFPPF_10MB_HD;
1414 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1415 *supported |= OFPPF_10MB_FD;
1417 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1418 *supported |= OFPPF_100MB_HD;
1420 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1421 *supported |= OFPPF_100MB_FD;
1423 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1424 *supported |= OFPPF_1GB_HD;
1426 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1427 *supported |= OFPPF_1GB_FD;
1429 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1430 *supported |= OFPPF_10GB_FD;
1432 if (ecmd.supported & SUPPORTED_TP) {
1433 *supported |= OFPPF_COPPER;
1435 if (ecmd.supported & SUPPORTED_FIBRE) {
1436 *supported |= OFPPF_FIBER;
1438 if (ecmd.supported & SUPPORTED_Autoneg) {
1439 *supported |= OFPPF_AUTONEG;
1441 if (ecmd.supported & SUPPORTED_Pause) {
1442 *supported |= OFPPF_PAUSE;
1444 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1445 *supported |= OFPPF_PAUSE_ASYM;
1448 /* Advertised features. */
1450 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1451 *advertised |= OFPPF_10MB_HD;
1453 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1454 *advertised |= OFPPF_10MB_FD;
1456 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1457 *advertised |= OFPPF_100MB_HD;
1459 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1460 *advertised |= OFPPF_100MB_FD;
1462 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1463 *advertised |= OFPPF_1GB_HD;
1465 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1466 *advertised |= OFPPF_1GB_FD;
1468 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1469 *advertised |= OFPPF_10GB_FD;
1471 if (ecmd.advertising & ADVERTISED_TP) {
1472 *advertised |= OFPPF_COPPER;
1474 if (ecmd.advertising & ADVERTISED_FIBRE) {
1475 *advertised |= OFPPF_FIBER;
1477 if (ecmd.advertising & ADVERTISED_Autoneg) {
1478 *advertised |= OFPPF_AUTONEG;
1480 if (ecmd.advertising & ADVERTISED_Pause) {
1481 *advertised |= OFPPF_PAUSE;
1483 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1484 *advertised |= OFPPF_PAUSE_ASYM;
1487 /* Current settings. */
/* 10/100/1000 Mb/s pick the full- or half-duplex bit from 'ecmd.duplex';
 * 10 Gb/s is always reported as full duplex. */
1488 if (ecmd.speed == SPEED_10) {
1489 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1490 } else if (ecmd.speed == SPEED_100) {
1491 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1492 } else if (ecmd.speed == SPEED_1000) {
1493 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1494 } else if (ecmd.speed == SPEED_10000) {
1495 *current = OFPPF_10GB_FD;
1500 if (ecmd.port == PORT_TP) {
1501 *current |= OFPPF_COPPER;
1502 } else if (ecmd.port == PORT_FIBRE) {
1503 *current |= OFPPF_FIBER;
1507 *current |= OFPPF_AUTONEG;
1510 /* Peer advertisements. */
1511 *peer = 0; /* XXX */
1516 /* Set the features advertised by 'netdev' to 'advertise'. */
/* 'advertise' is a bitmap of OFPPF_* bits.  The current settings are first
 * read with ETHTOOL_GSET, the 'advertising' field is rebuilt from scratch
 * by translating each OFPPF_* bit to its ADVERTISED_* counterpart, and the
 * result is written back with ETHTOOL_SSET, whose status is returned.
 * NOTE(review): the handling of a failed ETHTOOL_GSET is not visible in this
 * listing. */
1518 netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1520 struct ethtool_cmd ecmd;
1523 memset(&ecmd, 0, sizeof ecmd);
1524 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1525 ETHTOOL_GSET, "ETHTOOL_GSET");
1530 ecmd.advertising = 0;
1531 if (advertise & OFPPF_10MB_HD) {
1532 ecmd.advertising |= ADVERTISED_10baseT_Half;
1534 if (advertise & OFPPF_10MB_FD) {
1535 ecmd.advertising |= ADVERTISED_10baseT_Full;
1537 if (advertise & OFPPF_100MB_HD) {
1538 ecmd.advertising |= ADVERTISED_100baseT_Half;
1540 if (advertise & OFPPF_100MB_FD) {
1541 ecmd.advertising |= ADVERTISED_100baseT_Full;
1543 if (advertise & OFPPF_1GB_HD) {
1544 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1546 if (advertise & OFPPF_1GB_FD) {
1547 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1549 if (advertise & OFPPF_10GB_FD) {
1550 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1552 if (advertise & OFPPF_COPPER) {
1553 ecmd.advertising |= ADVERTISED_TP;
1555 if (advertise & OFPPF_FIBER) {
1556 ecmd.advertising |= ADVERTISED_FIBRE;
1558 if (advertise & OFPPF_AUTONEG) {
1559 ecmd.advertising |= ADVERTISED_Autoneg;
1561 if (advertise & OFPPF_PAUSE) {
1562 ecmd.advertising |= ADVERTISED_Pause;
1564 if (advertise & OFPPF_PAUSE_ASYM) {
1565 ecmd.advertising |= ADVERTISED_Asym_Pause;
1567 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1568 ETHTOOL_SSET, "ETHTOOL_SSET");
1571 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1572 * successful, otherwise a positive errno value. */
/* Visible steps: normalise 'kbits_burst', return early (presumably) when
 * the cached rate and burst already match, remove any existing ingress
 * qdisc, add a fresh ingress qdisc and a policer, then record the new
 * values in the cache and set VALID_POLICING.
 * NOTE(review): the conditions around the add steps and the error returns
 * are not visible in this listing. */
1574 netdev_linux_set_policing(struct netdev *netdev,
1575 uint32_t kbits_rate, uint32_t kbits_burst)
1577 struct netdev_dev_linux *netdev_dev =
1578 netdev_dev_linux_cast(netdev_get_dev(netdev));
1579 const char *netdev_name = netdev_get_name(netdev);
1582 COVERAGE_INC(netdev_set_policing);
1584 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1585 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1586 : kbits_burst); /* Stick with user-specified value. */
1588 if (netdev_dev->cache_valid & VALID_POLICING
1589 && netdev_dev->kbits_rate == kbits_rate
1590 && netdev_dev->kbits_burst == kbits_burst) {
1591 /* Assume that settings haven't changed since we last set them. */
1595 /* Remove any existing ingress qdisc. */
1596 error = tc_add_del_ingress_qdisc(netdev, false);
1598 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1599 netdev_name, strerror(error));
1604 error = tc_add_del_ingress_qdisc(netdev, true);
1606 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1607 netdev_name, strerror(error));
1611 error = tc_add_policer(netdev, kbits_rate, kbits_burst);
1613 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1614 netdev_name, strerror(error));
1619 netdev_dev->kbits_rate = kbits_rate;
1620 netdev_dev->kbits_burst = kbits_burst;
1621 netdev_dev->cache_valid |= VALID_POLICING;
/* Adds to 'types' the OVS name of every traffic-control implementation in
 * the 'tcs' table that has a 'tc_install' hook and a non-empty 'ovs_name'.
 * 'netdev' is unused.
 * NOTE(review): the declaration of the 'types' parameter is not visible in
 * this listing. */
1627 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1630 const struct tc_ops **opsp;
1632 for (opsp = tcs; *opsp != NULL; opsp++) {
1633 const struct tc_ops *ops = *opsp;
1634 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1635 sset_add(types, ops->ovs_name);
/* Searches the NULL-terminated 'tcs' table for an entry whose 'ovs_name'
 * equals 'name'.
 * NOTE(review): the return statements are not visible in this listing;
 * presumably the matching entry is returned, or NULL when none matches. */
1641 static const struct tc_ops *
1642 tc_lookup_ovs_name(const char *name)
1644 const struct tc_ops **opsp;
1646 for (opsp = tcs; *opsp != NULL; opsp++) {
1647 const struct tc_ops *ops = *opsp;
1648 if (!strcmp(name, ops->ovs_name)) {
/* Searches the NULL-terminated 'tcs' table for an entry whose 'linux_name'
 * is set and equals 'name'; entries with a null 'linux_name' never match.
 * NOTE(review): the return statements are not visible in this listing;
 * presumably the matching entry is returned, or NULL when none matches. */
1655 static const struct tc_ops *
1656 tc_lookup_linux_name(const char *name)
1658 const struct tc_ops **opsp;
1660 for (opsp = tcs; *opsp != NULL; opsp++) {
1661 const struct tc_ops *ops = *opsp;
1662 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Looks for the queue numbered 'queue_id' among the entries of bucket
 * 'hash' in the queue map of the device's current traffic-control state
 * ('netdev_dev->tc->queues').
 * NOTE(review): the declaration of 'hash' and the return statements are not
 * visible in this listing; presumably the match is returned, else NULL. */
1669 static struct tc_queue *
1670 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1673 struct netdev_dev_linux *netdev_dev =
1674 netdev_dev_linux_cast(netdev_get_dev(netdev));
1675 struct tc_queue *queue;
1677 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1678 if (queue->queue_id == queue_id) {
/* Convenience wrapper around tc_find_queue__() that derives the hash from
 * 'queue_id' with hash_int(queue_id, 0). */
1685 static struct tc_queue *
1686 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1688 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Reports the capabilities of QoS type 'type' in 'caps': the implementation
 * is looked up by OVS name and its 'n_queues' is copied to 'caps->n_queues'.
 * 'netdev' is unused.
 * NOTE(review): the 'type' parameter declaration and the handling of an
 * unknown type are not visible in this listing. */
1692 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1694 struct netdev_qos_capabilities *caps)
1696 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1700 caps->n_queues = ops->n_queues;
/* Queries the qdisc on 'netdev', stores the OVS name of its implementation
 * in '*typep', and, when the implementation has a 'qdisc_get' hook, returns
 * that hook's result for 'details'.
 * NOTE(review): the error path after tc_query_qdisc() and the value returned
 * when 'qdisc_get' is null are not visible in this listing. */
1705 netdev_linux_get_qos(const struct netdev *netdev,
1706 const char **typep, struct shash *details)
1708 struct netdev_dev_linux *netdev_dev =
1709 netdev_dev_linux_cast(netdev_get_dev(netdev));
1712 error = tc_query_qdisc(netdev);
1717 *typep = netdev_dev->tc->ops->ovs_name;
1718 return (netdev_dev->tc->ops->qdisc_get
1719 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
/* Configures QoS of kind 'type' with 'details' on 'netdev'.
 *
 * 'type' must name an implementation that has a 'tc_install' hook.  If it is
 * the implementation already in use, only its 'qdisc_set' hook (if any) is
 * invoked; a missing hook yields 0.  Otherwise the existing qdisc is deleted
 * and the new one installed.  The asserts document the invariants:
 * deletion leaves 'netdev_dev->tc' null, and after installation it is
 * non-null exactly when installation succeeded.
 * NOTE(review): the error returns are not visible in this listing. */
1724 netdev_linux_set_qos(struct netdev *netdev,
1725 const char *type, const struct shash *details)
1727 struct netdev_dev_linux *netdev_dev =
1728 netdev_dev_linux_cast(netdev_get_dev(netdev));
1729 const struct tc_ops *new_ops;
1732 new_ops = tc_lookup_ovs_name(type);
1733 if (!new_ops || !new_ops->tc_install) {
1737 error = tc_query_qdisc(netdev);
1742 if (new_ops == netdev_dev->tc->ops) {
1743 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1745 /* Delete existing qdisc. */
1746 error = tc_del_qdisc(netdev);
1750 assert(netdev_dev->tc == NULL);
1752 /* Install new qdisc. */
1753 error = new_ops->tc_install(netdev, details);
1754 assert((error == 0) == (netdev_dev->tc != NULL));
/* Retrieves the configuration of queue 'queue_id' on 'netdev' into
 * 'details' by looking the queue up with tc_find_queue() and calling the
 * implementation's 'class_get' hook.
 * NOTE(review): the condition of the conditional expression and the error
 * values are not visible in this listing. */
1761 netdev_linux_get_queue(const struct netdev *netdev,
1762 unsigned int queue_id, struct shash *details)
1764 struct netdev_dev_linux *netdev_dev =
1765 netdev_dev_linux_cast(netdev_get_dev(netdev));
1768 error = tc_query_qdisc(netdev);
1772 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1774 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
/* Configures queue 'queue_id' on 'netdev' from 'details' through the
 * implementation's 'class_set' hook.  The request takes the rejection branch
 * when 'queue_id' is not below the implementation's 'n_queues' or when no
 * 'class_set' hook exists.
 * NOTE(review): the error values returned on those paths are not visible in
 * this listing. */
1780 netdev_linux_set_queue(struct netdev *netdev,
1781 unsigned int queue_id, const struct shash *details)
1783 struct netdev_dev_linux *netdev_dev =
1784 netdev_dev_linux_cast(netdev_get_dev(netdev));
1787 error = tc_query_qdisc(netdev);
1790 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1791 || !netdev_dev->tc->ops->class_set) {
1795 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
/* Deletes queue 'queue_id' from 'netdev' through the implementation's
 * 'class_delete' hook, after locating the queue with tc_find_queue().
 * NOTE(review): the condition of the conditional expression and the error
 * values (query failure, missing hook, unknown queue) are not visible in
 * this listing. */
1799 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1801 struct netdev_dev_linux *netdev_dev =
1802 netdev_dev_linux_cast(netdev_get_dev(netdev));
1805 error = tc_query_qdisc(netdev);
1808 } else if (!netdev_dev->tc->ops->class_delete) {
1811 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1813 ? netdev_dev->tc->ops->class_delete(netdev, queue)
/* Fetches statistics for queue 'queue_id' on 'netdev' into 'stats' through
 * the implementation's 'class_get_stats' hook, after locating the queue
 * with tc_find_queue().
 * NOTE(review): the condition of the conditional expression and the error
 * values are not visible in this listing. */
1819 netdev_linux_get_queue_stats(const struct netdev *netdev,
1820 unsigned int queue_id,
1821 struct netdev_queue_stats *stats)
1823 struct netdev_dev_linux *netdev_dev =
1824 netdev_dev_linux_cast(netdev_get_dev(netdev));
1827 error = tc_query_qdisc(netdev);
1830 } else if (!netdev_dev->tc->ops->class_get_stats) {
1833 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1835 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
/* Starts a Netlink dump of the traffic classes on 'netdev': builds an
 * RTM_GETTCLASS request with 'tcm_parent' 0, hands it to nl_dump_start() on
 * 'rtnl_sock', and releases the request buffer.
 * NOTE(review): the return values are not visible in this listing;
 * netdev_linux_dump_queue_stats() treats a false result as failure. */
1841 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1843 struct ofpbuf request;
1844 struct tcmsg *tcmsg;
1846 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1850 tcmsg->tcm_parent = 0;
1851 nl_dump_start(dump, rtnl_sock, &request);
1852 ofpbuf_uninit(&request);
/* Calls 'cb' for the queues of 'netdev' known to the cached
 * traffic-control state.
 *
 * For each queue in 'netdev_dev->tc->queues' the implementation's
 * 'class_get' hook fills a scratch 'details' table, which is then passed to
 * 'cb' together with the queue id and 'aux'.  The scratch table is cleared
 * before each queue and destroyed at the end.
 * NOTE(review): the conditions deciding whether 'cb' runs for a queue and
 * the returned error value are not visible in this listing. */
1857 netdev_linux_dump_queues(const struct netdev *netdev,
1858 netdev_dump_queues_cb *cb, void *aux)
1860 struct netdev_dev_linux *netdev_dev =
1861 netdev_dev_linux_cast(netdev_get_dev(netdev));
1862 struct tc_queue *queue;
1863 struct shash details;
1867 error = tc_query_qdisc(netdev);
1870 } else if (!netdev_dev->tc->ops->class_get) {
1875 shash_init(&details);
1876 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
1877 shash_clear(&details);
1879 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
1881 (*cb)(queue->queue_id, &details, aux);
1886 shash_destroy(&details);
/* Dumps per-queue statistics for 'netdev' by walking a kernel traffic-class
 * dump and passing every message to the implementation's 'class_dump_stats'
 * hook along with 'cb' and 'aux'.
 *
 * The final result prefers the status of nl_dump_done(); when that is 0 the
 * value of 'last_error' is returned instead.
 * NOTE(review): how 'last_error' is updated inside the loop is not visible
 * in this listing. */
1892 netdev_linux_dump_queue_stats(const struct netdev *netdev,
1893 netdev_dump_queue_stats_cb *cb, void *aux)
1895 struct netdev_dev_linux *netdev_dev =
1896 netdev_dev_linux_cast(netdev_get_dev(netdev));
1897 struct nl_dump dump;
1902 error = tc_query_qdisc(netdev);
1905 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1910 if (!start_queue_dump(netdev, &dump)) {
1913 while (nl_dump_next(&dump, &msg)) {
1914 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1920 error = nl_dump_done(&dump);
1921 return error ? error : last_error;
/* Stores the IPv4 address and netmask of 'netdev_' in '*address' and
 * '*netmask'.
 *
 * The values are cached: when VALID_IN4 is clear they are fetched with the
 * SIOCGIFADDR and SIOCGIFNETMASK ioctls and the flag is then set.  Returns
 * EADDRNOTAVAIL when the address is INADDR_ANY, otherwise 0.
 * NOTE(review): the error returns after each ioctl are not visible in this
 * listing. */
1925 netdev_linux_get_in4(const struct netdev *netdev_,
1926 struct in_addr *address, struct in_addr *netmask)
1928 struct netdev_dev_linux *netdev_dev =
1929 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1931 if (!(netdev_dev->cache_valid & VALID_IN4)) {
1934 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
1935 SIOCGIFADDR, "SIOCGIFADDR");
1940 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
1941 SIOCGIFNETMASK, "SIOCGIFNETMASK");
1946 netdev_dev->cache_valid |= VALID_IN4;
1948 *address = netdev_dev->address;
1949 *netmask = netdev_dev->netmask;
1950 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
/* Assigns IPv4 'address' and 'netmask' to 'netdev_'.
 *
 * The address is set with SIOCSIFADDR; the cache is then updated with both
 * values and VALID_IN4 is set.  The netmask ioctl (SIOCSIFNETMASK) is issued
 * only when 'address' is not INADDR_ANY.
 * NOTE(review): the cache is written before the netmask ioctl is attempted,
 * so a failing SIOCSIFNETMASK leaves the requested netmask cached -- confirm
 * this is intended.  The condition guarding the cache update and the return
 * statement are not visible in this listing. */
1954 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
1955 struct in_addr netmask)
1957 struct netdev_dev_linux *netdev_dev =
1958 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1961 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
1963 netdev_dev->cache_valid |= VALID_IN4;
1964 netdev_dev->address = address;
1965 netdev_dev->netmask = netmask;
1966 if (address.s_addr != INADDR_ANY) {
1967 error = do_set_addr(netdev_, SIOCSIFNETMASK,
1968 "SIOCSIFNETMASK", netmask);
/* Parses one text 'line' in the format used by /proc/net/if_inet6 (see the
 * caller, netdev_linux_get_in6()).
 *
 * The format string expects 16 two-digit hexadecimal bytes, which are stored
 * into 'in6->s6_addr', then four hexadecimal fields that are skipped, then
 * an interface name of at most 16 characters stored into 'ifname'.
 * NOTE(review): the scanning call itself and the return expression are not
 * visible in this listing. */
1975 parse_if_inet6_line(const char *line,
1976 struct in6_addr *in6, char ifname[16 + 1])
1978 uint8_t *s6 = in6->s6_addr;
1979 #define X8 "%2"SCNx8
1981 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
1982 "%*x %*x %*x %*x %16s\n",
1983 &s6[0], &s6[1], &s6[2], &s6[3],
1984 &s6[4], &s6[5], &s6[6], &s6[7],
1985 &s6[8], &s6[9], &s6[10], &s6[11],
1986 &s6[12], &s6[13], &s6[14], &s6[15],
1990 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
1991 * 'in6' is non-null) and returns true. Otherwise, returns false. */
/* The address is cached under VALID_IN6.  On a cache miss the cached value
 * is reset to 'in6addr_any' and /proc/net/if_inet6 is scanned line by line
 * for an entry whose interface name equals the device name; a match
 * replaces the cached address.
 * NOTE(review): closing the file, leaving the loop after a match, and the
 * return value are on lines not visible in this listing. */
1993 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
1995 struct netdev_dev_linux *netdev_dev =
1996 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1997 if (!(netdev_dev->cache_valid & VALID_IN6)) {
2001 netdev_dev->in6 = in6addr_any;
2003 file = fopen("/proc/net/if_inet6", "r");
2005 const char *name = netdev_get_name(netdev_);
2006 while (fgets(line, sizeof line, file)) {
2007 struct in6_addr in6_tmp;
2008 char ifname[16 + 1];
2009 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2010 && !strcmp(name, ifname))
2012 netdev_dev->in6 = in6_tmp;
2018 netdev_dev->cache_valid |= VALID_IN6;
2020 *in6 = netdev_dev->in6;
/* Fills the generic socket address '*sa' with an AF_INET address holding
 * 'addr'.
 *
 * A zeroed 'struct sockaddr_in' is built locally, then '*sa' is cleared and
 * the local structure copied over its start, so any bytes of '*sa' beyond
 * the copied part are zero. */
2025 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2027 struct sockaddr_in sin;
2028 memset(&sin, 0, sizeof sin);
2029 sin.sin_family = AF_INET;
2030 sin.sin_addr = addr;
2033 memset(sa, 0, sizeof *sa);
2034 memcpy(sa, &sin, sizeof sin);
/* Issues address-setting ioctl 'ioctl_nr' for 'netdev' with IPv4 address
 * 'addr' and returns the status of netdev_linux_do_ioctl().
 *
 * The request carries the device name in 'ifr_name' and 'addr' as an
 * AF_INET socket address in 'ifr_addr'.
 * NOTE(review): the last argument of the netdev_linux_do_ioctl() call is on
 * a line not visible in this listing; presumably it is 'ioctl_name'. */
2038 do_set_addr(struct netdev *netdev,
2039 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2042 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2043 make_in4_sockaddr(&ifr.ifr_addr, addr);
2045 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2049 /* Adds 'router' as a default IP gateway. */
/* The route entry uses destination and netmask INADDR_ANY, gateway
 * 'router', and flags RTF_UP | RTF_GATEWAY, and is installed with the
 * SIOCADDRT ioctl on 'af_inet_sock'.  'netdev' is unused.
 * NOTE(review): the condition guarding the warning and the return statement
 * are not visible in this listing. */
2051 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2053 struct in_addr any = { INADDR_ANY };
2057 memset(&rt, 0, sizeof rt);
2058 make_in4_sockaddr(&rt.rt_dst, any);
2059 make_in4_sockaddr(&rt.rt_gateway, router);
2060 make_in4_sockaddr(&rt.rt_genmask, any);
2061 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2062 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2064 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
/* Looks up how to reach 'host' by scanning the kernel routing table in
 * /proc/net/route.
 *
 * '*netdev_name' starts out NULL.  Each line is parsed into its 11 fields;
 * routes without RTF_UP are skipped.  For a route whose masked destination
 * equals the masked 'host', '*next_hop' is set to 0 (directly reachable) or
 * to the route's gateway, and '*netdev_name' receives a newly allocated
 * copy of the interface name.
 * NOTE(review): the third parameter's declaration, the test choosing between
 * the two next-hop cases, closing the stream, and the return values are on
 * lines not visible in this listing. */
2070 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2073 static const char fn[] = "/proc/net/route";
2078 *netdev_name = NULL;
2079 stream = fopen(fn, "r");
2080 if (stream == NULL) {
2081 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2086 while (fgets(line, sizeof line, stream)) {
2089 ovs_be32 dest, gateway, mask;
2090 int refcnt, metric, mtu;
2091 unsigned int flags, use, window, irtt;
2094 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2096 iface, &dest, &gateway, &flags, &refcnt,
2097 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2099 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2103 if (!(flags & RTF_UP)) {
2104 /* Skip routes that aren't up. */
2108 /* The output of 'dest', 'mask', and 'gateway' were given in
2109 * network byte order, so we don't need any endian
2110 * conversions here. */
2111 if ((dest & mask) == (host->s_addr & mask)) {
2113 /* The host is directly reachable. */
2114 next_hop->s_addr = 0;
2116 /* To reach the host, we must go through a gateway. */
2117 next_hop->s_addr = gateway;
2119 *netdev_name = xstrdup(iface);
/* Adds driver information about 'netdev' to 'sh' under the keys
 * "driver_name", "driver_version" and "firmware_version".
 *
 * The data comes from an ethtool driver-info query; the values stored in
 * 'sh' are newly allocated copies of the strings in 'drvinfo'.
 * NOTE(review): the ethtool command argument, the success check before the
 * additions, and the return statement are not visible in this listing. */
2131 netdev_linux_get_status(const struct netdev *netdev, struct shash *sh)
2133 struct ethtool_drvinfo drvinfo;
2136 memset(&drvinfo, 0, sizeof drvinfo);
2137 error = netdev_linux_do_ethtool(netdev_get_name(netdev),
2138 (struct ethtool_cmd *)&drvinfo,
2140 "ETHTOOL_GDRVINFO");
2142 shash_add(sh, "driver_name", xstrdup(drvinfo.driver));
2143 shash_add(sh, "driver_version", xstrdup(drvinfo.version));
2144 shash_add(sh, "firmware_version", xstrdup(drvinfo.fw_version));
2150 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2151 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2152 * returns 0. Otherwise, it returns a positive errno value; in particular,
2153 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
/* The lookup is a SIOCGARP ioctl on 'af_inet_sock' with the protocol
 * address set to 'ip' (AF_INET), the hardware family set to ARPHRD_ETHER
 * and the device name filled in.  Failures other than ENXIO are logged
 * (rate-limited). */
2155 netdev_linux_arp_lookup(const struct netdev *netdev,
2156 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2159 struct sockaddr_in sin;
2162 memset(&r, 0, sizeof r);
2163 memset(&sin, 0, sizeof sin);
2164 sin.sin_family = AF_INET;
2165 sin.sin_addr.s_addr = ip;
2167 memcpy(&r.arp_pa, &sin, sizeof sin);
2168 r.arp_ha.sa_family = ARPHRD_ETHER;
2170 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2171 COVERAGE_INC(netdev_arp_lookup);
2172 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2174 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2175 } else if (retval != ENXIO) {
2176 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2177 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
/* Translates the netdev flag set 'nd' towards kernel interface flags; the
 * visible code tests NETDEV_UP and NETDEV_PROMISC.
 * NOTE(review): the statements inside the two tests and the return are not
 * visible in this listing; presumably they set the IFF_UP and IFF_PROMISC
 * bits, mirroring iff_to_nd_flags(). */
2183 nd_to_iff_flags(enum netdev_flags nd)
2186 if (nd & NETDEV_UP) {
2189 if (nd & NETDEV_PROMISC) {
/* Translates kernel interface flags 'iff' to a netdev flag set, starting
 * from an empty set; IFF_PROMISC maps to NETDEV_PROMISC.
 * NOTE(review): further mappings (presumably IFF_UP) and the return are on
 * lines not visible in this listing. */
2196 iff_to_nd_flags(int iff)
2198 enum netdev_flags nd = 0;
2202 if (iff & IFF_PROMISC) {
2203 nd |= NETDEV_PROMISC;
/* Clears the flags in 'off' and sets the flags in 'on' on 'netdev', storing
 * the flags that were in effect beforehand in '*old_flagsp'.
 *
 * The kernel flags are read, translated for the caller, and the new value
 * is computed as (old & ~off) | on in kernel flag terms.  The kernel is
 * only written to when that value differs from the old one.
 * NOTE(review): the success check after get_flags() and the return
 * statement are not visible in this listing. */
2209 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2210 enum netdev_flags on, enum netdev_flags *old_flagsp)
2212 int old_flags, new_flags;
2215 error = get_flags(netdev, &old_flags);
2217 *old_flagsp = iff_to_nd_flags(old_flags);
2218 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2219 if (new_flags != old_flags) {
2220 error = set_flags(netdev, new_flags);
/* Returns the 'change_seq' value stored in the device that backs
 * 'netdev'. */
2227 netdev_linux_change_seq(const struct netdev *netdev)
2229 return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
/* Template for the netdev class definitions that follow.  The expansion
 * lists the shared netdev_linux_* callbacks in positional order; get_config
 * and set_config are NULL.
 * NOTE(review): the entries that use the NAME, CREATE, GET_STATS and
 * SET_STATS parameters, as well as a few other slots, are on lines not
 * visible in this listing.  The order must match 'struct netdev_class',
 * which is declared elsewhere. */
2232 #define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS) \
2236 netdev_linux_init, \
2238 netdev_linux_wait, \
2241 netdev_linux_destroy, \
2242 NULL, /* get_config */ \
2243 NULL, /* set_config */ \
2245 netdev_linux_open, \
2246 netdev_linux_close, \
2248 netdev_linux_listen, \
2249 netdev_linux_recv, \
2250 netdev_linux_recv_wait, \
2251 netdev_linux_drain, \
2253 netdev_linux_send, \
2254 netdev_linux_send_wait, \
2256 netdev_linux_set_etheraddr, \
2257 netdev_linux_get_etheraddr, \
2258 netdev_linux_get_mtu, \
2259 netdev_linux_set_mtu, \
2260 netdev_linux_get_ifindex, \
2261 netdev_linux_get_carrier, \
2262 netdev_linux_get_carrier_resets, \
2263 netdev_linux_set_miimon_interval, \
2267 netdev_linux_get_features, \
2268 netdev_linux_set_advertisements, \
2270 netdev_linux_set_policing, \
2271 netdev_linux_get_qos_types, \
2272 netdev_linux_get_qos_capabilities, \
2273 netdev_linux_get_qos, \
2274 netdev_linux_set_qos, \
2275 netdev_linux_get_queue, \
2276 netdev_linux_set_queue, \
2277 netdev_linux_delete_queue, \
2278 netdev_linux_get_queue_stats, \
2279 netdev_linux_dump_queues, \
2280 netdev_linux_dump_queue_stats, \
2282 netdev_linux_get_in4, \
2283 netdev_linux_set_in4, \
2284 netdev_linux_get_in6, \
2285 netdev_linux_add_router, \
2286 netdev_linux_get_next_hop, \
2287 netdev_linux_get_status, \
2288 netdev_linux_arp_lookup, \
2290 netdev_linux_update_flags, \
2292 netdev_linux_change_seq \
/* Netdev class created with netdev_linux_create(), reporting statistics
 * through netdev_linux_get_stats() and providing no set_stats hook.
 * NOTE(review): the macro invocation line and the class name string are not
 * visible in this listing. */
2295 const struct netdev_class netdev_linux_class =
2298 netdev_linux_create,
2299 netdev_linux_get_stats,
2300 NULL); /* set_stats */
/* Netdev class created with netdev_linux_create_tap(), reporting statistics
 * through netdev_pseudo_get_stats() and providing no set_stats hook.
 * NOTE(review): the macro invocation line and the class name string are not
 * visible in this listing. */
2302 const struct netdev_class netdev_tap_class =
2305 netdev_linux_create_tap,
2306 netdev_pseudo_get_stats,
2307 NULL); /* set_stats */
/* Netdev class created with netdev_linux_create(), reporting statistics
 * through netdev_pseudo_get_stats() and accepting statistics through
 * netdev_vport_set_stats().
 * NOTE(review): the macro invocation line and the class name string are not
 * visible in this listing. */
2309 const struct netdev_class netdev_internal_class =
2312 netdev_linux_create,
2313 netdev_pseudo_get_stats,
2314 netdev_vport_set_stats);
2316 /* HTB traffic control class. */
/* Upper bound on HTB queues: htb_parse_tcmsg__() accepts class minor
 * numbers 1 through HTB_N_QUEUES and maps minor N to queue id N - 1. */
2318 #define HTB_N_QUEUES 0xf000
/* Member of 'struct htb' (the struct header is not visible in this
 * listing).  Set by htb_install__() and used by
 * htb_parse_class_details__() as the ceiling for per-class rates. */
2322 unsigned int max_rate; /* In bytes/s. */
/* Members of 'struct htb_class' (the struct header is not visible in this
 * listing): the generic queue record that htb_class_cast__() converts back
 * from, followed by the HTB parameters of the class. */
2326 struct tc_queue tc_queue;
2327 unsigned int min_rate; /* In bytes/s. */
2328 unsigned int max_rate; /* In bytes/s. */
2329 unsigned int burst; /* In bytes. */
2330 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the 'struct htb' that contains the traffic-control state
 * currently attached to 'netdev' (its 'tc' member is 'netdev_dev->tc').
 * NOTE(review): this is only meaningful while the device's 'tc' really is
 * the one embedded in a 'struct htb'; no check is made here. */
2334 htb_get__(const struct netdev *netdev)
2336 struct netdev_dev_linux *netdev_dev =
2337 netdev_dev_linux_cast(netdev_get_dev(netdev));
2338 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
/* Allocates a 'struct htb', initialises its 'tc' member for 'tc_ops_htb',
 * records 'max_rate', and attaches it to 'netdev' as the device's
 * traffic-control state.
 * NOTE(review): 'max_rate' is a uint64_t but is stored in an
 * 'unsigned int' member, so larger values would be truncated. */
2342 htb_install__(struct netdev *netdev, uint64_t max_rate)
2344 struct netdev_dev_linux *netdev_dev =
2345 netdev_dev_linux_cast(netdev_get_dev(netdev));
2348 htb = xmalloc(sizeof *htb);
2349 tc_init(&htb->tc, &tc_ops_htb);
2350 htb->max_rate = max_rate;
2352 netdev_dev->tc = &htb->tc;
2355 /* Create an HTB qdisc.
2357 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
/* Any existing qdisc is deleted first (the result of that deletion is not
 * checked).  The RTM_NEWQDISC request uses handle 1:0 under the root, kind
 * "htb", and a nested TCA_OPTIONS attribute carrying a 'struct tc_htb_glob'
 * with 'rate2quantum' 10.  Returns the status of tc_transact().
 * NOTE(review): other 'opt' fields set on lines not visible in this listing
 * (for instance the default class) cannot be confirmed here. */
2359 htb_setup_qdisc__(struct netdev *netdev)
2362 struct tc_htb_glob opt;
2363 struct ofpbuf request;
2364 struct tcmsg *tcmsg;
2366 tc_del_qdisc(netdev);
2368 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2369 NLM_F_EXCL | NLM_F_CREATE, &request);
2373 tcmsg->tcm_handle = tc_make_handle(1, 0);
2374 tcmsg->tcm_parent = TC_H_ROOT;
2376 nl_msg_put_string(&request, TCA_KIND, "htb");
2378 memset(&opt, 0, sizeof opt);
2379 opt.rate2quantum = 10;
2383 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2384 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2385 nl_msg_end_nested(&request, opt_offset);
2387 return tc_transact(&request, NULL);
2390 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2391 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
/* The device MTU is needed to build the rate tables; without it the class
 * cannot be set up.  The request is an RTM_NEWTCLASS message with
 * NLM_F_CREATE, kind "htb", and nested options holding the
 * 'struct tc_htb_opt' plus rate and ceil tables.  A failed transaction is
 * logged with all class parameters.
 * NOTE(review): the error returns are not visible in this listing. */
2393 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2394 unsigned int parent, struct htb_class *class)
2397 struct tc_htb_opt opt;
2398 struct ofpbuf request;
2399 struct tcmsg *tcmsg;
2403 error = netdev_get_mtu(netdev, &mtu);
2405 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2406 netdev_get_name(netdev));
2410 memset(&opt, 0, sizeof opt);
2411 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2412 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2413 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2414 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2415 opt.prio = class->priority;
2417 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2421 tcmsg->tcm_handle = handle;
2422 tcmsg->tcm_parent = parent;
2424 nl_msg_put_string(&request, TCA_KIND, "htb");
2425 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2426 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2427 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2428 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2429 nl_msg_end_nested(&request, opt_offset);
2431 error = tc_transact(&request, NULL);
2433 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2434 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2435 netdev_get_name(netdev),
2436 tc_get_major(handle), tc_get_minor(handle),
2437 tc_get_major(parent), tc_get_minor(parent),
2438 class->min_rate, class->max_rate,
2439 class->burst, class->priority, strerror(error));
/* Extracts the HTB class parameters from the nested attributes in
 * 'nl_options' into '*class': 'min_rate' and 'max_rate' come from the rate
 * and ceil of the mandatory TCA_HTB_PARMS attribute, 'burst' is the buffer
 * converted from ticks to bytes at that rate, and 'priority' is its prio.
 * NOTE(review): the original comment below speaks of 'details', but the
 * visible signature stores into 'class'.  The return values are not visible
 * in this listing. */
2444 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2445 * description of them into 'details'. The description complies with the
2446 * specification given in the vswitch database documentation for linux-htb
2449 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2451 static const struct nl_policy tca_htb_policy[] = {
2452 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2453 .min_len = sizeof(struct tc_htb_opt) },
2456 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2457 const struct tc_htb_opt *htb;
2459 if (!nl_parse_nested(nl_options, tca_htb_policy,
2460 attrs, ARRAY_SIZE(tca_htb_policy))) {
2461 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2465 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2466 class->min_rate = htb->rate.rate;
2467 class->max_rate = htb->ceil.rate;
2468 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2469 class->priority = htb->prio;
/* Parses the traffic-class message 'tcmsg'.
 *
 * tc_parse_class() extracts the class handle, its nested options and, via
 * 'stats', its statistics.  When 'queue_id' is non-null, a handle with
 * major 1 and a minor in 1..HTB_N_QUEUES is converted to queue id
 * minor - 1.  When 'options' is non-null, the nested options are decoded
 * into it with htb_parse_tca_options__().
 * NOTE(review): what happens for handles outside that range, and the return
 * statement, are on lines not visible in this listing. */
2474 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2475 struct htb_class *options,
2476 struct netdev_queue_stats *stats)
2478 struct nlattr *nl_options;
2479 unsigned int handle;
2482 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2483 if (!error && queue_id) {
2484 unsigned int major = tc_get_major(handle);
2485 unsigned int minor = tc_get_minor(handle);
2486 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2487 *queue_id = minor - 1;
2492 if (!error && options) {
2493 error = htb_parse_tca_options__(nl_options, options);
/* Derives the qdisc-level HTB parameters from 'details' into '*hc'.
 *
 * "max-rate" is read as a decimal number and divided by 8 (the struct
 * members are in bytes/s).  When it is absent or works out to 0, the rate is
 * taken from the device's current link features instead.  'min_rate' is set
 * equal to 'max_rate'.
 * NOTE(review): the declaration of 'current' and any statements after the
 * last visible line are not shown in this listing. */
2499 htb_parse_qdisc_details__(struct netdev *netdev,
2500 const struct shash *details, struct htb_class *hc)
2502 const char *max_rate_s;
2504 max_rate_s = shash_find_data(details, "max-rate");
2505 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2506 if (!hc->max_rate) {
/* Only the 'current' feature bitmap is requested; it is converted to a
 * bit rate on the next line. */
2509 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2510 hc->max_rate = netdev_features_to_bps(current) / 8;
2512 hc->min_rate = hc->max_rate;
/* Derives per-class HTB parameters from 'details' into '*hc'.
 *
 * Keys read: "min-rate", "max-rate", "burst" and "priority".  The three
 * rate/size values are parsed as decimal numbers and divided by 8.
 * 'min_rate' is forced into the range [mtu, htb->max_rate], 'max_rate' into
 * [min_rate, htb->max_rate], and 'burst' is at least mtu + 64.  'priority'
 * defaults to 0.  The device MTU is required.
 * NOTE(review): the default used when "max-rate" is absent, the error
 * returns, and the opening of the long comment about 'burst' are on lines
 * not visible in this listing. */
2518 htb_parse_class_details__(struct netdev *netdev,
2519 const struct shash *details, struct htb_class *hc)
2521 const struct htb *htb = htb_get__(netdev);
2522 const char *min_rate_s = shash_find_data(details, "min-rate");
2523 const char *max_rate_s = shash_find_data(details, "max-rate");
2524 const char *burst_s = shash_find_data(details, "burst");
2525 const char *priority_s = shash_find_data(details, "priority");
2528 error = netdev_get_mtu(netdev, &mtu);
2530 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2531 netdev_get_name(netdev));
2535 /* HTB requires at least an mtu sized min-rate to send any traffic even
2536 * on uncongested links. */
2537 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2538 hc->min_rate = MAX(hc->min_rate, mtu);
2539 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2542 hc->max_rate = (max_rate_s
2543 ? strtoull(max_rate_s, NULL, 10) / 8
2545 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2546 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2550 * According to hints in the documentation that I've read, it is important
2551 * that 'burst' be at least as big as the largest frame that might be
2552 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2553 * but having it a bit too small is a problem. Since netdev_get_mtu()
2554 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2555 * the MTU. We actually add 64, instead of 14, as a guard against
2556 * additional headers get tacked on somewhere that we're not aware of. */
2557 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2558 hc->burst = MAX(hc->burst, mtu + 64);
2561 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
2567 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2568 unsigned int parent, struct htb_class *options,
2569 struct netdev_queue_stats *stats)
2571 struct ofpbuf *reply;
2574 error = tc_query_class(netdev, handle, parent, &reply);
2576 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2577 ofpbuf_delete(reply);
2583 htb_tc_install(struct netdev *netdev, const struct shash *details)
2587 error = htb_setup_qdisc__(netdev);
2589 struct htb_class hc;
2591 htb_parse_qdisc_details__(netdev, details, &hc);
2592 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2593 tc_make_handle(1, 0), &hc);
2595 htb_install__(netdev, hc.max_rate);
2601 static struct htb_class *
2602 htb_class_cast__(const struct tc_queue *queue)
2604 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2608 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2609 const struct htb_class *hc)
2611 struct htb *htb = htb_get__(netdev);
2612 size_t hash = hash_int(queue_id, 0);
2613 struct tc_queue *queue;
2614 struct htb_class *hcp;
2616 queue = tc_find_queue__(netdev, queue_id, hash);
2618 hcp = htb_class_cast__(queue);
2620 hcp = xmalloc(sizeof *hcp);
2621 queue = &hcp->tc_queue;
2622 queue->queue_id = queue_id;
2623 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2626 hcp->min_rate = hc->min_rate;
2627 hcp->max_rate = hc->max_rate;
2628 hcp->burst = hc->burst;
2629 hcp->priority = hc->priority;
2633 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2636 struct nl_dump dump;
2637 struct htb_class hc;
2639 /* Get qdisc options. */
2641 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2642 htb_install__(netdev, hc.max_rate);
2645 if (!start_queue_dump(netdev, &dump)) {
2648 while (nl_dump_next(&dump, &msg)) {
2649 unsigned int queue_id;
2651 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2652 htb_update_queue__(netdev, queue_id, &hc);
2655 nl_dump_done(&dump);
2661 htb_tc_destroy(struct tc *tc)
2663 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2664 struct htb_class *hc, *next;
2666 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2667 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2675 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2677 const struct htb *htb = htb_get__(netdev);
2678 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
2683 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2685 struct htb_class hc;
2688 htb_parse_qdisc_details__(netdev, details, &hc);
2689 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2690 tc_make_handle(1, 0), &hc);
2692 htb_get__(netdev)->max_rate = hc.max_rate;
2698 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2699 const struct tc_queue *queue, struct shash *details)
2701 const struct htb_class *hc = htb_class_cast__(queue);
2703 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2704 if (hc->min_rate != hc->max_rate) {
2705 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2707 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2709 shash_add(details, "priority", xasprintf("%u", hc->priority));
2715 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2716 const struct shash *details)
2718 struct htb_class hc;
2721 error = htb_parse_class_details__(netdev, details, &hc);
2726 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2727 tc_make_handle(1, 0xfffe), &hc);
2732 htb_update_queue__(netdev, queue_id, &hc);
2737 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2739 struct htb_class *hc = htb_class_cast__(queue);
2740 struct htb *htb = htb_get__(netdev);
2743 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2745 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2752 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2753 struct netdev_queue_stats *stats)
2755 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2756 tc_make_handle(1, 0xfffe), NULL, stats);
2760 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2761 const struct ofpbuf *nlmsg,
2762 netdev_dump_queue_stats_cb *cb, void *aux)
2764 struct netdev_queue_stats stats;
2765 unsigned int handle, major, minor;
2768 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2773 major = tc_get_major(handle);
2774 minor = tc_get_minor(handle);
2775 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2776 (*cb)(minor - 1, &stats, aux);
/* Operations table for the HTB traffic-control class.  It is known to Linux
 * as qdisc "htb" and to OVS as QoS type "linux-htb", and supports
 * HTB_N_QUEUES queues.  The remaining slots are the htb_*() callbacks, in
 * the order the fields are declared in 'struct tc_ops'. */
2781 static const struct tc_ops tc_ops_htb = {
2782 "htb", /* linux_name */
2783 "linux-htb", /* ovs_name */
2784 HTB_N_QUEUES, /* n_queues */
2793 htb_class_get_stats,
2794 htb_class_dump_stats
2797 /* "linux-hfsc" traffic control class. */
/* Largest class minor number that the HFSC code treats as one of its queues:
 * a class 1:N with 1 <= N <= HFSC_N_QUEUES is reported as queue ID N - 1. */
2799 #define HFSC_N_QUEUES 0xf000
2807 struct tc_queue tc_queue;
2812 static struct hfsc *
2813 hfsc_get__(const struct netdev *netdev)
2815 struct netdev_dev_linux *netdev_dev;
2816 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2817 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
2820 static struct hfsc_class *
2821 hfsc_class_cast__(const struct tc_queue *queue)
2823 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
2827 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2829 struct netdev_dev_linux * netdev_dev;
2832 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2833 hfsc = xmalloc(sizeof *hfsc);
2834 tc_init(&hfsc->tc, &tc_ops_hfsc);
2835 hfsc->max_rate = max_rate;
2836 netdev_dev->tc = &hfsc->tc;
2840 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2841 const struct hfsc_class *hc)
2845 struct hfsc_class *hcp;
2846 struct tc_queue *queue;
2848 hfsc = hfsc_get__(netdev);
2849 hash = hash_int(queue_id, 0);
2851 queue = tc_find_queue__(netdev, queue_id, hash);
2853 hcp = hfsc_class_cast__(queue);
2855 hcp = xmalloc(sizeof *hcp);
2856 queue = &hcp->tc_queue;
2857 queue->queue_id = queue_id;
2858 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
2861 hcp->min_rate = hc->min_rate;
2862 hcp->max_rate = hc->max_rate;
/* Extracts the rates of an HFSC class from its nested TCA_OPTIONS attribute
 * 'nl_options' into '*class'.
 *
 * The attribute must carry three service curves (TCA_HFSC_RSC, TCA_HFSC_FSC
 * and TCA_HFSC_USC), each at least sizeof(struct tc_service_curve) bytes.
 * Only the configuration that this file itself installs is understood; a
 * warning is logged when:
 *
 *   - any curve is non-linear (nonzero 'm1' or 'd'),
 *   - the real-time and fair-share curves have different 'm2' rates, or
 *   - the real-time rate exceeds the upper-limit rate.
 *
 * Otherwise 'class->min_rate' is taken from the fair-share curve's 'm2' and
 * 'class->max_rate' from the upper-limit curve's 'm2'.
 *
 * NOTE(review): the return statements are not visible in this listing; the
 * caller stores the result as an error code. */
2866 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2868 const struct tc_service_curve *rsc, *fsc, *usc;
2869 static const struct nl_policy tca_hfsc_policy[] = {
2871 .type = NL_A_UNSPEC,
2873 .min_len = sizeof(struct tc_service_curve),
2876 .type = NL_A_UNSPEC,
2878 .min_len = sizeof(struct tc_service_curve),
2881 .type = NL_A_UNSPEC,
2883 .min_len = sizeof(struct tc_service_curve),
2886 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
2888 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
2889 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
2890 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
2894 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
2895 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
2896 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
2898 if (rsc->m1 != 0 || rsc->d != 0 ||
2899 fsc->m1 != 0 || fsc->d != 0 ||
2900 usc->m1 != 0 || usc->d != 0) {
2901 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2902 "Non-linear service curves are not supported.");
2906 if (rsc->m2 != fsc->m2) {
2907 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2908 "Real-time service curves are not supported ");
2912 if (rsc->m2 > usc->m2) {
2913 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2914 "Min-rate service curve is greater than "
2915 "the max-rate service curve.");
2919 class->min_rate = fsc->m2;
2920 class->max_rate = usc->m2;
/* Parses the Netlink class message 'tcmsg' for an HFSC class.
 *
 * tc_parse_class() extracts the class handle, the TCA_OPTIONS attribute and
 * the statistics (into '*stats').  When the handle is 1:N with
 * 1 <= N <= HFSC_N_QUEUES, '*queue_id' is set to N - 1.  The class rates are
 * decoded from the options into '*options' by hfsc_parse_tca_options__().
 *
 * NOTE(review): the null checks on the output arguments and the handling of
 * an out-of-range handle are not visible in this listing; callers in this
 * file pass NULL for 'queue_id' or 'options' when they do not need them. */
2925 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2926 struct hfsc_class *options,
2927 struct netdev_queue_stats *stats)
2930 unsigned int handle;
2931 struct nlattr *nl_options;
2933 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2939 unsigned int major, minor;
2941 major = tc_get_major(handle);
2942 minor = tc_get_minor(handle);
2943 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
2944 *queue_id = minor - 1;
2951 error = hfsc_parse_tca_options__(nl_options, options);
2958 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
2959 unsigned int parent, struct hfsc_class *options,
2960 struct netdev_queue_stats *stats)
2963 struct ofpbuf *reply;
2965 error = tc_query_class(netdev, handle, parent, &reply);
2970 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
2971 ofpbuf_delete(reply);
2976 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
2977 struct hfsc_class *class)
2980 const char *max_rate_s;
2982 max_rate_s = shash_find_data(details, "max-rate");
2983 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2988 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2989 max_rate = netdev_features_to_bps(current) / 8;
2992 class->min_rate = max_rate;
2993 class->max_rate = max_rate;
2997 hfsc_parse_class_details__(struct netdev *netdev,
2998 const struct shash *details,
2999 struct hfsc_class * class)
3001 const struct hfsc *hfsc;
3002 uint32_t min_rate, max_rate;
3003 const char *min_rate_s, *max_rate_s;
3005 hfsc = hfsc_get__(netdev);
3006 min_rate_s = shash_find_data(details, "min-rate");
3007 max_rate_s = shash_find_data(details, "max-rate");
3009 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3010 min_rate = MAX(min_rate, 1);
3011 min_rate = MIN(min_rate, hfsc->max_rate);
3013 max_rate = (max_rate_s
3014 ? strtoull(max_rate_s, NULL, 10) / 8
3016 max_rate = MAX(max_rate, min_rate);
3017 max_rate = MIN(max_rate, hfsc->max_rate);
3019 class->min_rate = min_rate;
3020 class->max_rate = max_rate;
3025 /* Create an HFSC qdisc.
3027 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3029 hfsc_setup_qdisc__(struct netdev * netdev)
3031 struct tcmsg *tcmsg;
3032 struct ofpbuf request;
3033 struct tc_hfsc_qopt opt;
3035 tc_del_qdisc(netdev);
3037 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3038 NLM_F_EXCL | NLM_F_CREATE, &request);
3044 tcmsg->tcm_handle = tc_make_handle(1, 0);
3045 tcmsg->tcm_parent = TC_H_ROOT;
3047 memset(&opt, 0, sizeof opt);
3050 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3051 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3053 return tc_transact(&request, NULL);
3056 /* Create an HFSC class.
3058 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3059 * sc rate <min_rate> ul rate <max_rate>" */
3061 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3062 unsigned int parent, struct hfsc_class *class)
3066 struct tcmsg *tcmsg;
3067 struct ofpbuf request;
3068 struct tc_service_curve min, max;
3070 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3076 tcmsg->tcm_handle = handle;
3077 tcmsg->tcm_parent = parent;
3081 min.m2 = class->min_rate;
3085 max.m2 = class->max_rate;
3087 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3088 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3089 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3090 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3091 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3092 nl_msg_end_nested(&request, opt_offset);
3094 error = tc_transact(&request, NULL);
3096 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3097 "min-rate %ubps, max-rate %ubps (%s)",
3098 netdev_get_name(netdev),
3099 tc_get_major(handle), tc_get_minor(handle),
3100 tc_get_major(parent), tc_get_minor(parent),
3101 class->min_rate, class->max_rate, strerror(error));
3108 hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3111 struct hfsc_class class;
3113 error = hfsc_setup_qdisc__(netdev);
3119 hfsc_parse_qdisc_details__(netdev, details, &class);
3120 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3121 tc_make_handle(1, 0), &class);
3127 hfsc_install__(netdev, class.max_rate);
3132 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3135 struct nl_dump dump;
3136 struct hfsc_class hc;
3139 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3140 hfsc_install__(netdev, hc.max_rate);
3142 if (!start_queue_dump(netdev, &dump)) {
3146 while (nl_dump_next(&dump, &msg)) {
3147 unsigned int queue_id;
3149 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3150 hfsc_update_queue__(netdev, queue_id, &hc);
3154 nl_dump_done(&dump);
3159 hfsc_tc_destroy(struct tc *tc)
3162 struct hfsc_class *hc, *next;
3164 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3166 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3167 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3176 hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3178 const struct hfsc *hfsc;
3179 hfsc = hfsc_get__(netdev);
3180 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
3185 hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3188 struct hfsc_class class;
3190 hfsc_parse_qdisc_details__(netdev, details, &class);
3191 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3192 tc_make_handle(1, 0), &class);
3195 hfsc_get__(netdev)->max_rate = class.max_rate;
3202 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3203 const struct tc_queue *queue, struct shash *details)
3205 const struct hfsc_class *hc;
3207 hc = hfsc_class_cast__(queue);
3208 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3209 if (hc->min_rate != hc->max_rate) {
3210 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
3216 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3217 const struct shash *details)
3220 struct hfsc_class class;
3222 error = hfsc_parse_class_details__(netdev, details, &class);
3227 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3228 tc_make_handle(1, 0xfffe), &class);
3233 hfsc_update_queue__(netdev, queue_id, &class);
3238 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3242 struct hfsc_class *hc;
3244 hc = hfsc_class_cast__(queue);
3245 hfsc = hfsc_get__(netdev);
3247 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3249 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3256 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3257 struct netdev_queue_stats *stats)
3259 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3260 tc_make_handle(1, 0xfffe), NULL, stats);
3264 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3265 const struct ofpbuf *nlmsg,
3266 netdev_dump_queue_stats_cb *cb, void *aux)
3268 struct netdev_queue_stats stats;
3269 unsigned int handle, major, minor;
3272 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3277 major = tc_get_major(handle);
3278 minor = tc_get_minor(handle);
3279 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3280 (*cb)(minor - 1, &stats, aux);
/* Operations table for the HFSC traffic-control class.  It is known to Linux
 * as qdisc "hfsc" and to OVS as QoS type "linux-hfsc", supports
 * HFSC_N_QUEUES queues, and routes every operation to the corresponding
 * hfsc_*() function defined in this file. */
3285 static const struct tc_ops tc_ops_hfsc = {
3286 "hfsc", /* linux_name */
3287 "linux-hfsc", /* ovs_name */
3288 HFSC_N_QUEUES, /* n_queues */
3289 hfsc_tc_install, /* tc_install */
3290 hfsc_tc_load, /* tc_load */
3291 hfsc_tc_destroy, /* tc_destroy */
3292 hfsc_qdisc_get, /* qdisc_get */
3293 hfsc_qdisc_set, /* qdisc_set */
3294 hfsc_class_get, /* class_get */
3295 hfsc_class_set, /* class_set */
3296 hfsc_class_delete, /* class_delete */
3297 hfsc_class_get_stats, /* class_get_stats */
3298 hfsc_class_dump_stats /* class_dump_stats */
3301 /* "linux-default" traffic control class.
3303 * This class represents the default, unnamed Linux qdisc. It corresponds to
3304 * the "" (empty string) QoS type in the OVS database. */
3307 default_install__(struct netdev *netdev)
3309 struct netdev_dev_linux *netdev_dev =
3310 netdev_dev_linux_cast(netdev_get_dev(netdev));
3311 static struct tc *tc;
3314 tc = xmalloc(sizeof *tc);
3315 tc_init(tc, &tc_ops_default);
3317 netdev_dev->tc = tc;
3321 default_tc_install(struct netdev *netdev,
3322 const struct shash *details OVS_UNUSED)
3324 default_install__(netdev);
3329 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3331 default_install__(netdev);
/* Operations table for the default Linux qdisc.  It has no Linux qdisc name,
 * and the destroy, qdisc get/set, class get/set/delete and statistics slots
 * are all NULL: there is nothing to tear down and nothing to configure or
 * report for it. */
3335 static const struct tc_ops tc_ops_default = {
3336 NULL, /* linux_name */
3341 NULL, /* tc_destroy */
3342 NULL, /* qdisc_get */
3343 NULL, /* qdisc_set */
3344 NULL, /* class_get */
3345 NULL, /* class_set */
3346 NULL, /* class_delete */
3347 NULL, /* class_get_stats */
3348 NULL /* class_dump_stats */
3351 /* "linux-other" traffic control class.
3356 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3358 struct netdev_dev_linux *netdev_dev =
3359 netdev_dev_linux_cast(netdev_get_dev(netdev));
3360 static struct tc *tc;
3363 tc = xmalloc(sizeof *tc);
3364 tc_init(tc, &tc_ops_other);
3366 netdev_dev->tc = tc;
/* Operations table for the "linux-other" traffic-control class.  It has no
 * Linux qdisc name and cannot be installed ('tc_install' is NULL); the
 * destroy, qdisc get/set, class get/set/delete and statistics slots are all
 * NULL as well. */
3370 static const struct tc_ops tc_ops_other = {
3371 NULL, /* linux_name */
3372 "linux-other", /* ovs_name */
3374 NULL, /* tc_install */
3376 NULL, /* tc_destroy */
3377 NULL, /* qdisc_get */
3378 NULL, /* qdisc_set */
3379 NULL, /* class_get */
3380 NULL, /* class_set */
3381 NULL, /* class_delete */
3382 NULL, /* class_get_stats */
3383 NULL /* class_dump_stats */
3386 /* Traffic control. */
3388 /* Number of kernel "tc" ticks per second. */
3389 static double ticks_per_s;
3391 /* Number of kernel "jiffies" per second. This is used for the purpose of
3392 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3393 * one jiffy's worth of data.
3395 * There are two possibilities here:
3397 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3398 * approximate range of 100 to 1024. That means that we really need to
3399 * make sure that the qdisc can buffer that much data.
3401 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3402 * has finely granular timers and there's no need to fudge additional room
3403 * for buffers. (There's no extra effort needed to implement that: the
3404 * large 'buffer_hz' is used as a divisor, so practically any number will
3405 * come out as 0 in the division. Small integer results in the case of
3406 * really high dividends won't have any real effect anyhow.)
3408 static unsigned int buffer_hz;
/* Combines 'major' and 'minor' into the tc handle 'major':'minor'. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int major_bits = major << 16;

    return TC_H_MAKE(major_bits, minor);
}
/* Extracts the major number from tc handle 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
/* Extracts the minor number from tc handle 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor = TC_H_MIN(handle);

    return minor;
}
3431 static struct tcmsg *
3432 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3433 struct ofpbuf *request)
3435 struct tcmsg *tcmsg;
3439 error = get_ifindex(netdev, &ifindex);
3444 ofpbuf_init(request, 512);
3445 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3446 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3447 tcmsg->tcm_family = AF_UNSPEC;
3448 tcmsg->tcm_ifindex = ifindex;
3449 /* Caller should fill in tcmsg->tcm_handle. */
3450 /* Caller should fill in tcmsg->tcm_parent. */
3456 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3458 int error = nl_sock_transact(rtnl_sock, request, replyp);
3459 ofpbuf_uninit(request);
3463 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3464 * policing configuration.
3466 * This function is equivalent to running the following when 'add' is true:
3467 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3469 * This function is equivalent to running the following when 'add' is false:
3470 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3472 * The configuration and stats may be seen with the following command:
3473 * /sbin/tc -s qdisc show dev <devname>
3475 * Returns 0 if successful, otherwise a positive errno value.
3478 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3480 struct ofpbuf request;
3481 struct tcmsg *tcmsg;
3483 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3484 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3486 tcmsg = tc_make_request(netdev, type, flags, &request);
3490 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3491 tcmsg->tcm_parent = TC_H_INGRESS;
3492 nl_msg_put_string(&request, TCA_KIND, "ingress");
3493 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3495 error = tc_transact(&request, NULL);
3497 /* If we're deleting the qdisc, don't worry about some of the
3498 * error conditions. */
3499 if (!add && (error == ENOENT || error == EINVAL)) {
3508 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3511 * This function is equivalent to running:
3512 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3513 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3516 * The configuration and stats may be seen with the following command:
3517 * /sbin/tc -s filter show <devname> eth0 parent ffff:
3519 * Returns 0 if successful, otherwise a positive errno value.
3522 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3524 struct tc_police tc_police;
3525 struct ofpbuf request;
3526 struct tcmsg *tcmsg;
3527 size_t basic_offset;
3528 size_t police_offset;
3532 memset(&tc_police, 0, sizeof tc_police);
3533 tc_police.action = TC_POLICE_SHOT;
3534 tc_police.mtu = mtu;
3535 tc_fill_rate(&tc_police.rate, kbits_rate/8 * 1000, mtu);
3536 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3537 kbits_burst * 1024);
3539 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3540 NLM_F_EXCL | NLM_F_CREATE, &request);
3544 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3545 tcmsg->tcm_info = tc_make_handle(49,
3546 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3548 nl_msg_put_string(&request, TCA_KIND, "basic");
3549 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3550 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3551 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3552 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3553 nl_msg_end_nested(&request, police_offset);
3554 nl_msg_end_nested(&request, basic_offset);
3556 error = tc_transact(&request, NULL);
/* Body of the routine that reads the kernel packet-scheduler clock
 * parameters.
 *
 * It opens /proc/net/psched, reads four hexadecimal words 'a', 'b', 'c' and
 * 'd' from it, and computes 'ticks_per_s' as a * c / b.  A warning is logged
 * if the file cannot be opened or read, or if the values are invalid or
 * unexpected; the values read and the results are logged at debug level.
 *
 * NOTE(review): the function header, the validity checks and the assignment
 * of 'buffer_hz' are not visible in this listing; the tables below suggest
 * that 'buffer_hz' is taken from 'd' -- confirm against the full source. */
3567 /* The values in psched are not individually very meaningful, but they are
3568 * important. The tables below show some values seen in the wild.
3572 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3573 * (Before that, there are hints that it was 1000000000.)
3575 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3579 * -----------------------------------
3580 * [1] 000c8000 000f4240 000f4240 00000064
3581 * [2] 000003e8 00000400 000f4240 3b9aca00
3582 * [3] 000003e8 00000400 000f4240 3b9aca00
3583 * [4] 000003e8 00000400 000f4240 00000064
3584 * [5] 000003e8 00000040 000f4240 3b9aca00
3585 * [6] 000003e8 00000040 000f4240 000000f9
3587 * a b c d ticks_per_s buffer_hz
3588 * ------- --------- ---------- ------------- ----------- -------------
3589 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3590 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3591 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3592 * [4] 1,000 1,024 1,000,000 100 976,562 100
3593 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3594 * [6] 1,000 64 1,000,000 249 15,625,000 249
3596 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3597 * [2] 2.6.26-1-686-bigmem from Debian lenny
3598 * [3] 2.6.26-2-sparc64 from Debian lenny
3599 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3600 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3601 * [6] 2.6.34 from kernel.org on KVM
3603 static const char fn[] = "/proc/net/psched";
3604 unsigned int a, b, c, d;
3610 stream = fopen(fn, "r");
3612 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3616 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3617 VLOG_WARN("%s: read failed", fn);
3621 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3625 VLOG_WARN("%s: invalid scheduler parameters", fn);
3629 ticks_per_s = (double) a * c / b;
3633 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3636 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3639 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3640 * rate of 'rate' bytes per second. */
3642 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3647 return (rate * ticks) / ticks_per_s;
3650 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3651 * rate of 'rate' bytes per second. */
3653 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3658 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3661 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3662 * a transmission rate of 'rate' bytes per second. */
3664 tc_buffer_per_jiffy(unsigned int rate)
3669 return rate / buffer_hz;
3672 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3673 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3674 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3675 * stores NULL into it if it is absent.
3677 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3680 * Returns 0 if successful, otherwise a positive errno value. */
/* Implementation notes: the attributes that follow the Netlink header and
 * the 'struct tcmsg' in 'msg' are validated against a policy in which
 * TCA_KIND is a mandatory string and TCA_OPTIONS an optional nested
 * attribute.  A rate-limited warning is logged when the message does not
 * satisfy that policy.  Because TCA_OPTIONS is optional, the value stored in
 * '*options' is whatever the parser recorded for it. */
3682 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3683 struct nlattr **options)
3685 static const struct nl_policy tca_policy[] = {
3686 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3687 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3689 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3691 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3692 tca_policy, ta, ARRAY_SIZE(ta))) {
3693 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3698 *kind = nl_attr_get_string(ta[TCA_KIND]);
3702 *options = ta[TCA_OPTIONS];
3717 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3718 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3719 * into '*options', and its queue statistics into '*stats'. Any of the output
3720 * arguments may be null.
3722 * Returns 0 if successful, otherwise a positive errno value. */
/* Implementation notes:
 *
 *   - Both TCA_OPTIONS and TCA_STATS2 are mandatory nested attributes; a
 *     message lacking either one fails to parse and a rate-limited warning
 *     is logged.
 *
 *   - The class handle stored in '*handlep' is the 'tcm_handle' field of the
 *     'struct tcmsg' that follows the Netlink header.
 *
 *   - Statistics come from the TCA_STATS_BASIC and TCA_STATS_QUEUE
 *     attributes nested inside TCA_STATS2: 'tx_bytes' and 'tx_packets' are
 *     the basic byte and packet counters, and 'tx_errors' is the queue's
 *     drop counter.
 *
 *   - '*stats' is zeroed by the memset near the end.  NOTE(review): the
 *     control flow leading to it is not visible in this listing; it appears
 *     to be the failure path. */
3724 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3725 struct nlattr **options, struct netdev_queue_stats *stats)
3727 static const struct nl_policy tca_policy[] = {
3728 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3729 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3731 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3733 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3734 tca_policy, ta, ARRAY_SIZE(ta))) {
3735 VLOG_WARN_RL(&rl, "failed to parse class message");
3740 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3741 *handlep = tc->tcm_handle;
3745 *options = ta[TCA_OPTIONS];
3749 const struct gnet_stats_queue *gsq;
3750 struct gnet_stats_basic gsb;
3752 static const struct nl_policy stats_policy[] = {
3753 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3754 .min_len = sizeof gsb },
3755 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3756 .min_len = sizeof *gsq },
3758 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3760 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3761 sa, ARRAY_SIZE(sa))) {
3762 VLOG_WARN_RL(&rl, "failed to parse class stats");
3766 /* Alignment issues screw up the length of struct gnet_stats_basic on
3767 * some arch/bitsize combinations. Newer versions of Linux have a
3768 * struct gnet_stats_basic_packed, but we can't depend on that. The
3769 * easiest thing to do is just to make a copy. */
3770 memset(&gsb, 0, sizeof gsb);
3771 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3772 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3773 stats->tx_bytes = gsb.bytes;
3774 stats->tx_packets = gsb.packets;
3776 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3777 stats->tx_errors = gsq->drops;
3787 memset(stats, 0, sizeof *stats);
3792 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3795 tc_query_class(const struct netdev *netdev,
3796 unsigned int handle, unsigned int parent,
3797 struct ofpbuf **replyp)
3799 struct ofpbuf request;
3800 struct tcmsg *tcmsg;
3803 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3807 tcmsg->tcm_handle = handle;
3808 tcmsg->tcm_parent = parent;
3810 error = tc_transact(&request, replyp);
3812 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3813 netdev_get_name(netdev),
3814 tc_get_major(handle), tc_get_minor(handle),
3815 tc_get_major(parent), tc_get_minor(parent),
3821 /* Equivalent to "tc class del dev <name> handle <handle>". */
3823 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3825 struct ofpbuf request;
3826 struct tcmsg *tcmsg;
3829 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3833 tcmsg->tcm_handle = handle;
3834 tcmsg->tcm_parent = 0;
3836 error = tc_transact(&request, NULL);
3838 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3839 netdev_get_name(netdev),
3840 tc_get_major(handle), tc_get_minor(handle),
3846 /* Equivalent to "tc qdisc del dev <name> root". */
3848 tc_del_qdisc(struct netdev *netdev)
3850 struct netdev_dev_linux *netdev_dev =
3851 netdev_dev_linux_cast(netdev_get_dev(netdev));
3852 struct ofpbuf request;
3853 struct tcmsg *tcmsg;
3856 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
3860 tcmsg->tcm_handle = tc_make_handle(1, 0);
3861 tcmsg->tcm_parent = TC_H_ROOT;
3863 error = tc_transact(&request, NULL);
3864 if (error == EINVAL) {
3865 /* EINVAL probably means that the default qdisc was in use, in which
3866 * case we've accomplished our purpose. */
3869 if (!error && netdev_dev->tc) {
3870 if (netdev_dev->tc->ops->tc_destroy) {
3871 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3873 netdev_dev->tc = NULL;
3878 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3879 * kernel to determine what they are. Returns 0 if successful, otherwise a
3880 * positive errno value. */
/* Implementation notes:
 *
 *   - The query is skipped when the device already has a tc attached
 *     ('netdev_dev->tc' is nonnull).
 *
 *   - The kernel is asked for the qdisc with handle 1:0 only; the long
 *     comment in the body explains why a broader query is unsafe on older
 *     kernels.
 *
 *   - The reply selects the operations table: a qdisc kind found by
 *     tc_lookup_linux_name() uses that table; an unknown kind is logged and
 *     falls back to 'tc_ops_other'; ENOENT is taken to mean the system
 *     default qdisc and uses 'tc_ops_default'; any other failure is logged
 *     and uses 'tc_ops_other'.
 *
 *   - The chosen table's 'tc_load' callback then attaches a tc to the
 *     device.  The assertion checks that the callback attached one exactly
 *     when it reported success.  The query error, if any, takes precedence
 *     over the load error in the return value. */
3882 tc_query_qdisc(const struct netdev *netdev)
3884 struct netdev_dev_linux *netdev_dev =
3885 netdev_dev_linux_cast(netdev_get_dev(netdev));
3886 struct ofpbuf request, *qdisc;
3887 const struct tc_ops *ops;
3888 struct tcmsg *tcmsg;
3892 if (netdev_dev->tc) {
3896 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3897 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3898 * 2.6.35 without that fix backported to it.
3900 * To avoid the OOPS, we must not make a request that would attempt to dump
3901 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3902 * few others. There are a few ways that I can see to do this, but most of
3903 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3904 * technique chosen here is to assume that any non-default qdisc that we
3905 * create will have a class with handle 1:0. The built-in qdiscs only have
3906 * a class with handle 0:0.
3908 * We could check for Linux 2.6.35+ and use a more straightforward method
3910 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
3914 tcmsg->tcm_handle = tc_make_handle(1, 0);
3915 tcmsg->tcm_parent = 0;
3917 /* Figure out what tc class to instantiate. */
3918 error = tc_transact(&request, &qdisc);
3922 error = tc_parse_qdisc(qdisc, &kind, NULL);
3924 ops = &tc_ops_other;
3926 ops = tc_lookup_linux_name(kind);
3928 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3929 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3931 ops = &tc_ops_other;
3934 } else if (error == ENOENT) {
3935 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3936 * other entity that doesn't have a handle 1:0. We will assume
3937 * that it's the system default qdisc. */
3938 ops = &tc_ops_default;
3941 /* Who knows? Maybe the device got deleted. */
3942 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3943 netdev_get_name(netdev), strerror(error));
3944 ops = &tc_ops_other;
3947 /* Instantiate it. */
3948 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
3949 assert((load_error == 0) == (netdev_dev->tc != NULL));
3950 ofpbuf_delete(qdisc);
3952 return error ? error : load_error;
3955 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3956 approximate the time to transmit packets of various lengths. For an MTU of
3957 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3958 represents two possible packet lengths; for a MTU of 513 through 1024, four
3959 possible lengths; and so on.
3961 Returns, for the specified 'mtu', the number of bits that packet lengths
3962 need to be shifted right to fit within such a 256-entry table. */
3964 tc_calc_cell_log(unsigned int mtu)
/* Fallback MTU.  NOTE(review): the condition guarding this assignment is not
 * visible in this excerpt; presumably it covers an unspecified (zero) MTU. */
3969 mtu = ETH_PAYLOAD_MAX;
/* 'mtu' counts payload only; add the Ethernet and VLAN headers to get the
 * largest frame size the table has to cover. */
3971 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
/* 'cell_log' counts the iterations needed until 'mtu' drops below 256.  The
 * loop body that shrinks 'mtu' is not visible in this excerpt. */
3973 for (cell_log = 0; mtu >= 256; cell_log++) {
3980 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
3983 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
3985 memset(rate, 0, sizeof *rate);
3986 rate->cell_log = tc_calc_cell_log(mtu);
3987 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3988 /* rate->cell_align = 0; */ /* distro headers. */
/* Minimum packet unit: tc_put_rtab() treats any packet smaller than 'mpu' as
 * being 'mpu' bytes long when it builds the rate table. */
3989 rate->mpu = ETH_TOTAL_MIN;
3993 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
3994 * attribute of the specified "type".
3996 * See tc_calc_cell_log() above for a description of "rtab"s. */
3998 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
/* Reserve TC_RTAB_SIZE bytes of attribute payload without initializing them;
 * the loop below writes every entry. */
4003 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4004 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
/* Entry 'i' stands for packets of (i + 1) << cell_log bytes, clamped from
 * below to the minimum packet unit 'mpu'. */
4005 unsigned packet_size = (i + 1) << rate->cell_log;
4006 if (packet_size < rate->mpu) {
4007 packet_size = rate->mpu;
/* Store the cost of such a packet at 'rate->rate', converted by
 * tc_bytes_to_ticks(). */
4009 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* tc_calc_buffer(): the burst handed to tc_bytes_to_ticks() is the larger of
 * the caller's 'burst_bytes' and a floor of tc_buffer_per_jiffy(Bps) + mtu
 * bytes, so a 'burst_bytes' below that floor is silently raised to it. */
4013 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
4014 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
4015 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
4018 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
4020 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
4021 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
4024 /* Linux-only functions declared in netdev-linux.h */
4026 /* Returns a fd for an AF_INET socket or a negative errno value. */
4028 netdev_linux_get_af_inet_sock(void)
/* netdev_linux_init() reports failure as a nonzero value, which is negated
 * for the caller; on success the shared 'af_inet_sock' is returned.
 * NOTE(review): presumably netdev_linux_init() is what opens 'af_inet_sock';
 * confirm at its definition. */
4030 int error = netdev_linux_init();
4031 return error ? -error : af_inet_sock;
4034 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4035 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4037 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4038 const char *flag_name, bool enable)
4040 const char *netdev_name = netdev_get_name(netdev);
4041 struct ethtool_value evalue;
/* Step 1: read the current flags word.  'evalue' is a struct ethtool_value
 * passed through the struct ethtool_cmd * parameter of
 * netdev_linux_do_ethtool(), hence the casts below. */
4045 memset(&evalue, 0, sizeof evalue);
4046 error = netdev_linux_do_ethtool(netdev_name,
4047 (struct ethtool_cmd *)&evalue,
4048 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* Step 2: write the flags back with only the 'flag' bit changed, remembering
 * the value we asked for in 'new_flags'. */
4053 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4054 error = netdev_linux_do_ethtool(netdev_name,
4055 (struct ethtool_cmd *)&evalue,
4056 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
/* Step 3: read the flags again so the value actually in effect can be
 * compared with the value requested. */
4061 memset(&evalue, 0, sizeof evalue);
4062 error = netdev_linux_do_ethtool(netdev_name,
4063 (struct ethtool_cmd *)&evalue,
4064 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
/* The set call did not take effect if the flags read back differ from what
 * was written. */
4069 if (new_flags != evalue.data) {
4070 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4071 "device %s failed", enable ? "enable" : "disable",
4072 flag_name, netdev_name);
4079 /* Utility functions. */
4081 /* Copies 'src' into 'dst', performing format conversion in the process. */
4083 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4084 const struct rtnl_link_stats *src)
/* Field-by-field copy: every counter assigned below has the same name in
 * both structures.  Fields of 'dst' that are not listed are left untouched
 * by the lines visible here. */
4086 dst->rx_packets = src->rx_packets;
4087 dst->tx_packets = src->tx_packets;
4088 dst->rx_bytes = src->rx_bytes;
4089 dst->tx_bytes = src->tx_bytes;
4090 dst->rx_errors = src->rx_errors;
4091 dst->tx_errors = src->tx_errors;
4092 dst->rx_dropped = src->rx_dropped;
4093 dst->tx_dropped = src->tx_dropped;
4094 dst->multicast = src->multicast;
4095 dst->collisions = src->collisions;
4096 dst->rx_length_errors = src->rx_length_errors;
4097 dst->rx_over_errors = src->rx_over_errors;
4098 dst->rx_crc_errors = src->rx_crc_errors;
4099 dst->rx_frame_errors = src->rx_frame_errors;
4100 dst->rx_fifo_errors = src->rx_fifo_errors;
4101 dst->rx_missed_errors = src->rx_missed_errors;
4102 dst->tx_aborted_errors = src->tx_aborted_errors;
4103 dst->tx_carrier_errors = src->tx_carrier_errors;
4104 dst->tx_fifo_errors = src->tx_fifo_errors;
4105 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4106 dst->tx_window_errors = src->tx_window_errors;
/* Sends an RTM_GETLINK request for the interface with index 'ifindex' over
 * 'rtnl_sock' and, if the reply carries an IFLA_STATS attribute, converts it
 * into '*stats' with netdev_stats_from_rtnl_link_stats().
 *
 * The reply buffer is deleted on every path visible here.
 * NOTE(review): the return statements are not visible in this excerpt;
 * presumably 0 on success and a positive errno value otherwise -- confirm. */
4110 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4112 /* Policy for RTNLGRP_LINK messages.
4114 * There are *many* more fields in these messages, but currently we only
4115 * care about these fields. */
4116 static const struct nl_policy rtnlgrp_link_policy[] = {
4117 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
/* IFLA_STATS is optional in the policy so that its absence can be reported
 * separately below; when present it must hold at least a full
 * struct rtnl_link_stats. */
4118 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4119 .min_len = sizeof(struct rtnl_link_stats) },
4122 struct ofpbuf request;
4123 struct ofpbuf *reply;
4124 struct ifinfomsg *ifi;
4125 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
/* Build the request: a netlink header followed by a zeroed ifinfomsg that
 * names the interface by index. */
4128 ofpbuf_init(&request, 0);
4129 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4130 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4131 ifi->ifi_family = PF_UNSPEC;
4132 ifi->ifi_index = ifindex;
4133 error = nl_sock_transact(rtnl_sock, &request, &reply);
/* The request is no longer needed once the transaction has completed. */
4134 ofpbuf_uninit(&request);
/* Attributes start after the netlink header and the ifinfomsg. */
4139 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4140 rtnlgrp_link_policy,
4141 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4142 ofpbuf_delete(reply);
4146 if (!attrs[IFLA_STATS]) {
4147 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4148 ofpbuf_delete(reply);
4152 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4154 ofpbuf_delete(reply);
/* Scans /proc/net/dev line by line for the entry whose device name equals
 * 'netdev_name' and fills in '*stats' from it.
 *
 * NOTE(review): several lines of this function (local declarations, the
 * start of the sscanf() call, the return statements) are not visible in this
 * excerpt; presumably it returns 0 on success and a positive errno value
 * otherwise -- confirm. */
4160 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4162 static const char fn[] = "/proc/net/dev";
4167 stream = fopen(fn, "r");
4169 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4174 while (fgets(line, sizeof line, stream)) {
/* X64 converts one unsigned 64-bit column; "%*u" consumes a column without
 * storing it.  The conversion count checked below is 15: the 14 X64 columns
 * plus, presumably, the device name (its conversion is not visible here). */
4177 #define X64 "%"SCNu64
4180 X64 X64 X64 X64 X64 X64 X64 "%*u"
4181 X64 X64 X64 X64 X64 X64 X64 "%*u",
4187 &stats->rx_fifo_errors,
4188 &stats->rx_frame_errors,
4194 &stats->tx_fifo_errors,
4196 &stats->tx_carrier_errors) != 15) {
4197 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4198 } else if (!strcmp(devname, netdev_name)) {
/* These counters are set to UINT64_MAX rather than parsed; presumably
 * /proc/net/dev has no column for them and UINT64_MAX marks "unknown" --
 * confirm against the struct netdev_stats documentation. */
4199 stats->rx_length_errors = UINT64_MAX;
4200 stats->rx_over_errors = UINT64_MAX;
4201 stats->rx_crc_errors = UINT64_MAX;
4202 stats->rx_missed_errors = UINT64_MAX;
4203 stats->tx_aborted_errors = UINT64_MAX;
4204 stats->tx_heartbeat_errors = UINT64_MAX;
4205 stats->tx_window_errors = UINT64_MAX;
4211 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Reads /sys/class/net/<name>/carrier and stores in '*carrier' whether its
 * first character is something other than '0'.  Only '0' and '1' are
 * accepted as a first character; anything else is logged as unexpected.
 *
 * NOTE(review): the error assignments, cleanup and return statement are not
 * visible in this excerpt; presumably 0 on success and a positive errno value
 * otherwise -- confirm. */
4217 get_carrier_via_sysfs(const char *name, bool *carrier)
/* 'fn' comes from xasprintf().  NOTE(review): the code releasing it is not
 * visible in this excerpt -- confirm it is freed on every path. */
4228 fn = xasprintf("/sys/class/net/%s/carrier", name);
4229 fd = open(fn, O_RDONLY);
4232 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
4236 retval = read(fd, line, sizeof line);
4239 if (error == EINVAL) {
4240 /* This is the normal return value when we try to check carrier if
4241 * the network device is not up. */
4243 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
4246 } else if (retval == 0) {
4248 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
/* Only the first byte read is examined. */
4252 if (line[0] != '0' && line[0] != '1') {
4254 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)", fn, line[0]);
4257 *carrier = line[0] != '0';
/* Queries the interface flags of 'netdev' with the SIOCGIFFLAGS ioctl and
 * stores the resulting 'ifr_flags' value in '*flags'.
 * NOTE(review): whether '*flags' is written when the ioctl fails depends on
 * lines not visible in this excerpt. */
4269 get_flags(const struct netdev *netdev, int *flags)
4274 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
4276 *flags = ifr.ifr_flags;
/* Sets the interface flags of 'netdev' to 'flags' with the SIOCSIFFLAGS
 * ioctl.  Returns whatever netdev_linux_do_ioctl() returns. */
4281 set_flags(struct netdev *netdev, int flags)
4285 ifr.ifr_flags = flags;
4286 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
/* Looks up the interface index of the device named 'netdev_name' with the
 * SIOCGIFINDEX ioctl on 'af_inet_sock'.  Returns the index on success.  On
 * failure a rate-limited warning is logged.
 * NOTE(review): the failure return value is not visible in this excerpt;
 * presumably a negative errno value -- confirm. */
4291 do_get_ifindex(const char *netdev_name)
4295 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4296 COVERAGE_INC(netdev_get_ifindex);
4297 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4298 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4299 netdev_name, strerror(errno));
4302 return ifr.ifr_ifindex;
/* Caching front end for do_get_ifindex(): stores the interface index of
 * 'netdev_' in '*ifindexp'.  The kernel is queried only while the
 * VALID_IFINDEX bit is clear in the device's 'cache_valid'; the result is
 * then remembered in 'netdev_dev->ifindex'.
 * NOTE(review): handling of a failed lookup and the return value are not
 * visible in this excerpt. */
4306 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4308 struct netdev_dev_linux *netdev_dev =
4309 netdev_dev_linux_cast(netdev_get_dev(netdev_));
4311 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4312 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
/* Mark the cached index valid so later calls skip the ioctl. */
4316 netdev_dev->cache_valid |= VALID_IFINDEX;
4317 netdev_dev->ifindex = ifindex;
4319 *ifindexp = netdev_dev->ifindex;
/* Retrieves the hardware address of the device named 'netdev_name' with the
 * SIOCGIFHWADDR ioctl on 'af_inet_sock' and copies its first ETH_ADDR_LEN
 * bytes into 'ea'.
 * NOTE(review): return statements are not visible in this excerpt;
 * presumably 0 on success and a positive errno value otherwise -- confirm. */
4324 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4329 memset(&ifr, 0, sizeof ifr);
4330 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4331 COVERAGE_INC(netdev_get_hwaddr);
4332 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4333 /* ENODEV probably means that a vif disappeared asynchronously and
4334 * hasn't been removed from the database yet, so reduce the log level
4335 * to INFO for that case. */
4336 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4337 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4338 netdev_name, strerror(errno));
/* Any address family other than AF_UNSPEC or ARPHRD_ETHER is reported as
 * unknown. */
4341 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4342 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4343 VLOG_WARN("%s device has unknown hardware address family %d",
4344 netdev_name, hwaddr_family);
4346 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware address of the device named 'netdev_name' to the
 * ETH_ADDR_LEN bytes in 'mac', tagged with address family 'hwaddr_family',
 * using the SIOCSIFHWADDR ioctl on 'af_inet_sock'.  A failure is logged at
 * error level.
 * NOTE(review): return statements are not visible in this excerpt;
 * presumably 0 on success and a positive errno value otherwise -- confirm. */
4351 set_etheraddr(const char *netdev_name, int hwaddr_family,
4352 const uint8_t mac[ETH_ADDR_LEN])
4356 memset(&ifr, 0, sizeof ifr);
4357 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4358 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4359 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4360 COVERAGE_INC(netdev_set_hwaddr);
4361 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4362 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4363 netdev_name, strerror(errno));
/* Issues a SIOCETHTOOL ioctl on 'af_inet_sock' for the device named 'name',
 * with 'ecmd' as the request payload.  'cmd_name' is used only in the log
 * message.  Failures other than EOPNOTSUPP are logged as rate-limited
 * warnings.
 * NOTE(review): the line storing 'cmd' into the request and the return
 * statements are not visible in this excerpt -- confirm. */
4370 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4371 int cmd, const char *cmd_name)
4375 memset(&ifr, 0, sizeof ifr);
4376 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
/* The ioctl reaches the ethtool request through 'ifr_data'. */
4377 ifr.ifr_data = (caddr_t) ecmd;
4380 COVERAGE_INC(netdev_ethtool);
4381 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4384 if (errno != EOPNOTSUPP) {
4385 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4386 "failed: %s", cmd_name, name, strerror(errno));
4388 /* The device doesn't support this operation. That's pretty
4389 * common, so there's no point in logging anything. */
/* Copies 'name' into 'ifr->ifr_name' and issues ioctl 'cmd' on
 * 'af_inet_sock' with 'ifr' as its argument.  The caller fills in any other
 * request fields beforehand.  'cmd_name' is used only in the debug-level log
 * message emitted on failure.
 * NOTE(review): return statements are not visible in this excerpt;
 * presumably 0 on success and a positive errno value otherwise -- confirm. */
4396 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4397 const char *cmd_name)
4399 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4400 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4401 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
/* Runs ioctl 'cmd' for 'netdev' through netdev_linux_do_ioctl(), with the
 * request's address family preset to AF_INET, and copies the IPv4 address
 * from the reply into '*ip'.  'cmd_name' is passed through for logging.
 * NOTE(review): the guard around the copy and the return statement are not
 * visible in this excerpt; presumably '*ip' is written only on success. */
4409 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4410 int cmd, const char *cmd_name)
4415 ifr.ifr_addr.sa_family = AF_INET;
4416 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
/* 'ifr_addr' is a generic sockaddr; view it as a sockaddr_in to reach the
 * IPv4 address. */
4418 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4419 *ip = sin->sin_addr;
4424 /* Returns an AF_PACKET raw socket or a negative errno value. */
4426 af_packet_sock(void)
4428 static int sock = INT_MIN;
4430 if (sock == INT_MIN) {
4431 sock = socket(AF_PACKET, SOCK_RAW, 0);
4433 set_nonblocking(sock);
4436 VLOG_ERR("failed to create packet socket: %s", strerror(errno));