2 * Copyright (c) 2009, 2010, 2011, 2012 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
24 #include <arpa/inet.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
30 #include <linux/types.h>
31 #include <linux/ethtool.h>
32 #include <linux/mii.h>
33 #include <linux/pkt_cls.h>
34 #include <linux/pkt_sched.h>
35 #include <linux/rtnetlink.h>
36 #include <linux/sockios.h>
37 #include <linux/version.h>
38 #include <sys/types.h>
39 #include <sys/ioctl.h>
40 #include <sys/socket.h>
41 #include <netpacket/packet.h>
43 #include <net/if_arp.h>
44 #include <net/if_packet.h>
45 #include <net/route.h>
46 #include <netinet/in.h>
53 #include "dpif-linux.h"
54 #include "dynamic-string.h"
55 #include "fatal-signal.h"
58 #include "netdev-provider.h"
59 #include "netdev-vport.h"
61 #include "netlink-notifier.h"
62 #include "netlink-socket.h"
64 #include "openflow/openflow.h"
66 #include "poll-loop.h"
67 #include "rtnetlink-link.h"
68 #include "socket-util.h"
74 VLOG_DEFINE_THIS_MODULE(netdev_linux);
76 COVERAGE_DEFINE(netdev_set_policing);
77 COVERAGE_DEFINE(netdev_arp_lookup);
78 COVERAGE_DEFINE(netdev_get_ifindex);
79 COVERAGE_DEFINE(netdev_get_hwaddr);
80 COVERAGE_DEFINE(netdev_set_hwaddr);
81 COVERAGE_DEFINE(netdev_ethtool);
83 /* These were introduced in Linux 2.6.14, so they might be missing if we have
85 #ifndef ADVERTISED_Pause
86 #define ADVERTISED_Pause (1 << 13)
88 #ifndef ADVERTISED_Asym_Pause
89 #define ADVERTISED_Asym_Pause (1 << 14)
92 /* These were introduced in Linux 2.6.24, so they might be missing if we
93 * have old headers. */
94 #ifndef ETHTOOL_GFLAGS
95 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
97 #ifndef ETHTOOL_SFLAGS
98 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
101 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
104 #define TC_RTAB_SIZE 1024
107 static struct nln_notifier *netdev_linux_cache_notifier = NULL;
108 static int cache_notifier_refcount;
111 VALID_IFINDEX = 1 << 0,
112 VALID_ETHERADDR = 1 << 1,
116 VALID_POLICING = 1 << 5,
117 VALID_HAVE_VPORT_STATS = 1 << 6
125 /* Traffic control. */
127 /* An instance of a traffic control class. Always associated with a particular
130 * Each TC implementation subclasses this with whatever additional data it
133 const struct tc_ops *ops;
134 struct hmap queues; /* Contains "struct tc_queue"s.
135 * Read by generic TC layer.
136 * Written only by TC implementation. */
139 /* One traffic control queue.
141 * Each TC implementation subclasses this with whatever additional data it
144 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
145 unsigned int queue_id; /* OpenFlow queue ID. */
148 /* A particular kind of traffic control. Each implementation generally maps to
149 * one particular Linux qdisc class.
151 * The functions below return 0 if successful or a positive errno value on
152 * failure, except where otherwise noted. All of them must be provided, except
153 * where otherwise noted. */
155 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
156 * This is null for tc_ops_default and tc_ops_other, for which there are no
157 * appropriate values. */
158 const char *linux_name;
160 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
161 const char *ovs_name;
163 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
164 * queues. The queues are numbered 0 through n_queues - 1. */
165 unsigned int n_queues;
167 /* Called to install this TC class on 'netdev'. The implementation should
168 * make the Netlink calls required to set up 'netdev' with the right qdisc
169 * and configure it according to 'details'. The implementation may assume
170 * that the current qdisc is the default; that is, there is no need for it
171 * to delete the current qdisc before installing itself.
173 * The contents of 'details' should be documented as valid for 'ovs_name'
174 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
175 * (which is built as ovs-vswitchd.conf.db(8)).
177 * This function must return 0 if and only if it sets 'netdev->tc' to an
178 * initialized 'struct tc'.
180 * (This function is null for tc_ops_other, which cannot be installed. For
181 * other TC classes it should always be nonnull.) */
182 int (*tc_install)(struct netdev *netdev, const struct shash *details);
184 /* Called when the netdev code determines (through a Netlink query) that
185 * this TC class's qdisc is installed on 'netdev', but we didn't install
186 * it ourselves and so don't know any of the details.
188 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
189 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
190 * implementation should parse the other attributes of 'nlmsg' as
191 * necessary to determine its configuration. If necessary it should also
192 * use Netlink queries to determine the configuration of queues on
195 * This function must return 0 if and only if it sets 'netdev->tc' to an
196 * initialized 'struct tc'. */
197 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
199 /* Destroys the data structures allocated by the implementation as part of
200 * 'tc'. (This includes destroying 'tc->queues' by calling
203 * The implementation should not need to perform any Netlink calls. If
204 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
205 * (But it may not be desirable.)
207 * This function may be null if 'tc' is trivial. */
208 void (*tc_destroy)(struct tc *tc);
210 /* Retrieves details of 'netdev->tc' configuration into 'details'.
212 * The implementation should not need to perform any Netlink calls, because
213 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
214 * cached the configuration.
216 * The contents of 'details' should be documented as valid for 'ovs_name'
217 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
218 * (which is built as ovs-vswitchd.conf.db(8)).
220 * This function may be null if 'tc' is not configurable.
222 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
224 /* Reconfigures 'netdev->tc' according to 'details', performing any
225 * required Netlink calls to complete the reconfiguration.
227 * The contents of 'details' should be documented as valid for 'ovs_name'
228 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
229 * (which is built as ovs-vswitchd.conf.db(8)).
231 * This function may be null if 'tc' is not configurable.
233 int (*qdisc_set)(struct netdev *, const struct shash *details);
235 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
236 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
238 * The contents of 'details' should be documented as valid for 'ovs_name'
239 * in the "other_config" column in the "Queue" table in
240 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
242 * The implementation should not need to perform any Netlink calls, because
243 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
244 * cached the queue configuration.
246 * This function may be null if 'tc' does not have queues ('n_queues' is
248 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
249 struct shash *details);
251 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
252 * 'details', performing any required Netlink calls to complete the
253 * reconfiguration. The caller ensures that 'queue_id' is less than
256 * The contents of 'details' should be documented as valid for 'ovs_name'
257 * in the "other_config" column in the "Queue" table in
258 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
260 * This function may be null if 'tc' does not have queues or its queues are
261 * not configurable. */
262 int (*class_set)(struct netdev *, unsigned int queue_id,
263 const struct shash *details);
265 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
266 * tc_queue's within 'netdev->tc->queues'.
268 * This function may be null if 'tc' does not have queues or its queues
269 * cannot be deleted. */
270 int (*class_delete)(struct netdev *, struct tc_queue *queue);
272 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
273 * 'struct tc_queue's within 'netdev->tc->queues'.
275 * On success, initializes '*stats'.
277 * This function may be null if 'tc' does not have queues or if it cannot
278 * report queue statistics. */
279 int (*class_get_stats)(const struct netdev *netdev,
280 const struct tc_queue *queue,
281 struct netdev_queue_stats *stats);
283 /* Extracts queue stats from 'nlmsg', which is a response to a
284 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
286 * This function may be null if 'tc' does not have queues or if it cannot
287 * report queue statistics. */
288 int (*class_dump_stats)(const struct netdev *netdev,
289 const struct ofpbuf *nlmsg,
290 netdev_dump_queue_stats_cb *cb, void *aux);
294 tc_init(struct tc *tc, const struct tc_ops *ops)
297 hmap_init(&tc->queues);
301 tc_destroy(struct tc *tc)
303 hmap_destroy(&tc->queues);
306 static const struct tc_ops tc_ops_htb;
307 static const struct tc_ops tc_ops_hfsc;
308 static const struct tc_ops tc_ops_default;
309 static const struct tc_ops tc_ops_other;
311 static const struct tc_ops *tcs[] = {
312 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
313 &tc_ops_hfsc, /* Hierarchical fair service curve. */
314 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
315 &tc_ops_other, /* Some other qdisc. */
319 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
320 static unsigned int tc_get_major(unsigned int handle);
321 static unsigned int tc_get_minor(unsigned int handle);
323 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
324 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
325 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
327 static struct tcmsg *tc_make_request(const struct netdev *, int type,
328 unsigned int flags, struct ofpbuf *);
329 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
330 static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
331 static int tc_add_policer(struct netdev *netdev, int kbits_rate,
334 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
335 struct nlattr **options);
336 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
337 struct nlattr **options,
338 struct netdev_queue_stats *);
339 static int tc_query_class(const struct netdev *,
340 unsigned int handle, unsigned int parent,
341 struct ofpbuf **replyp);
342 static int tc_delete_class(const struct netdev *, unsigned int handle);
344 static int tc_del_qdisc(struct netdev *netdev);
345 static int tc_query_qdisc(const struct netdev *netdev);
347 static int tc_calc_cell_log(unsigned int mtu);
348 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
349 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
350 const struct tc_ratespec *rate);
351 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
353 struct netdev_dev_linux {
354 struct netdev_dev netdev_dev;
356 struct shash_node *shash_node;
357 unsigned int cache_valid;
358 unsigned int change_seq;
360 bool miimon; /* Link status of last poll. */
361 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
362 struct timer miimon_timer;
364 /* The following are figured out "on demand" only. They are only valid
365 * when the corresponding VALID_* bit in 'cache_valid' is set. */
367 uint8_t etheraddr[ETH_ADDR_LEN];
368 struct in_addr address, netmask;
371 unsigned int ifi_flags;
372 long long int carrier_resets;
373 uint32_t kbits_rate; /* Policing data. */
374 uint32_t kbits_burst;
375 bool have_vport_stats;
379 struct tap_state tap;
383 struct netdev_linux {
384 struct netdev netdev;
388 /* Sockets used for ioctl operations. */
389 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
391 /* A Netlink routing socket that is not subscribed to any multicast groups. */
392 static struct nl_sock *rtnl_sock;
394 /* This is set pretty low because we probably won't learn anything from the
395 * additional log messages. */
396 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
398 static int netdev_linux_init(void);
400 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
401 int cmd, const char *cmd_name);
402 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
403 const char *cmd_name);
404 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
405 int cmd, const char *cmd_name);
406 static int get_flags(const struct netdev_dev *, unsigned int *flags);
407 static int set_flags(struct netdev *, unsigned int flags);
408 static int do_get_ifindex(const char *netdev_name);
409 static int get_ifindex(const struct netdev *, int *ifindexp);
410 static int do_set_addr(struct netdev *netdev,
411 int ioctl_nr, const char *ioctl_name,
412 struct in_addr addr);
413 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
414 static int set_etheraddr(const char *netdev_name, int hwaddr_family,
415 const uint8_t[ETH_ADDR_LEN]);
416 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
417 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
418 static int af_packet_sock(void);
419 static void netdev_linux_miimon_run(void);
420 static void netdev_linux_miimon_wait(void);
/* Reports whether 'netdev_class' is one of the Linux netdev classes.  The
 * classes implemented here are recognized by having netdev_linux_init() as
 * their 'init' member. */
423 is_netdev_linux_class(const struct netdev_class *netdev_class)
425 return netdev_class->init == netdev_linux_init;
/* Converts the generic 'netdev_dev' into the 'struct netdev_dev_linux' that
 * embeds it.  Asserts that 'netdev_dev' belongs to a Linux netdev class before
 * applying CONTAINER_OF. */
428 static struct netdev_dev_linux *
429 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
431 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
432 assert(is_netdev_linux_class(netdev_class));
434 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
/* Converts the generic 'netdev' into the 'struct netdev_linux' that embeds it,
 * after asserting that the device underlying 'netdev' belongs to a Linux
 * netdev class. */
437 static struct netdev_linux *
438 netdev_linux_cast(const struct netdev *netdev)
440 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
441 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
442 assert(is_netdev_linux_class(netdev_class));
444 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
448 netdev_linux_init(void)
450 static int status = -1;
452 /* Create AF_INET socket. */
453 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
454 status = af_inet_sock >= 0 ? 0 : errno;
456 VLOG_ERR("failed to create inet socket: %s", strerror(status));
459 /* Create rtnetlink socket. */
461 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
463 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
472 netdev_linux_run(void)
474 rtnetlink_link_run();
475 netdev_linux_miimon_run();
479 netdev_linux_wait(void)
481 rtnetlink_link_wait();
482 netdev_linux_miimon_wait();
486 netdev_dev_linux_changed(struct netdev_dev_linux *dev, unsigned int ifi_flags)
489 if (!dev->change_seq) {
493 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
494 dev->carrier_resets++;
496 dev->ifi_flags = ifi_flags;
498 dev->cache_valid = 0;
502 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
503 void *aux OVS_UNUSED)
505 struct netdev_dev_linux *dev;
507 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
509 const struct netdev_class *netdev_class =
510 netdev_dev_get_class(base_dev);
512 if (is_netdev_linux_class(netdev_class)) {
513 dev = netdev_dev_linux_cast(base_dev);
514 netdev_dev_linux_changed(dev, change->ifi_flags);
518 struct shash device_shash;
519 struct shash_node *node;
521 shash_init(&device_shash);
522 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
523 SHASH_FOR_EACH (node, &device_shash) {
528 get_flags(&dev->netdev_dev, &flags);
529 netdev_dev_linux_changed(dev, flags);
531 shash_destroy(&device_shash);
536 cache_notifier_ref(void)
538 if (!cache_notifier_refcount) {
539 assert(!netdev_linux_cache_notifier);
541 netdev_linux_cache_notifier =
542 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
544 if (!netdev_linux_cache_notifier) {
548 cache_notifier_refcount++;
/* Releases one reference to the shared rtnetlink link notifier.  Asserts that
 * at least one reference is outstanding.  When the last reference is dropped,
 * the notifier is destroyed and 'netdev_linux_cache_notifier' is reset to
 * NULL. */
554 cache_notifier_unref(void)
556 assert(cache_notifier_refcount > 0);
557 if (!--cache_notifier_refcount) {
558 assert(netdev_linux_cache_notifier);
559 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
560 netdev_linux_cache_notifier = NULL;
564 /* Creates system and internal devices. */
566 netdev_linux_create(const struct netdev_class *class, const char *name,
567 struct netdev_dev **netdev_devp)
569 struct netdev_dev_linux *netdev_dev;
572 error = cache_notifier_ref();
577 netdev_dev = xzalloc(sizeof *netdev_dev);
578 netdev_dev->change_seq = 1;
579 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
580 get_flags(&netdev_dev->netdev_dev, &netdev_dev->ifi_flags);
582 *netdev_devp = &netdev_dev->netdev_dev;
586 /* For most types of netdevs we open the device for each call of
587 * netdev_open(). However, this is not the case with tap devices,
588 * since it is only possible to open the device once. In this
589 * situation we share a single file descriptor, and consequently
590 * buffers, across all readers. Therefore once data is read it will
591 * be unavailable to other reads for tap devices. */
593 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
594 const char *name, struct netdev_dev **netdev_devp)
596 struct netdev_dev_linux *netdev_dev;
597 struct tap_state *state;
598 static const char tap_dev[] = "/dev/net/tun";
602 netdev_dev = xzalloc(sizeof *netdev_dev);
603 state = &netdev_dev->state.tap;
605 error = cache_notifier_ref();
610 /* Open tap device. */
611 state->fd = open(tap_dev, O_RDWR);
614 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
615 goto error_unref_notifier;
618 /* Create tap device. */
619 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
620 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
621 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
622 VLOG_WARN("%s: creating tap device failed: %s", name,
625 goto error_unref_notifier;
628 /* Make non-blocking. */
629 error = set_nonblocking(state->fd);
631 goto error_unref_notifier;
634 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
635 *netdev_devp = &netdev_dev->netdev_dev;
638 error_unref_notifier:
639 cache_notifier_unref();
646 destroy_tap(struct netdev_dev_linux *netdev_dev)
648 struct tap_state *state = &netdev_dev->state.tap;
650 if (state->fd >= 0) {
655 /* Destroys the netdev device 'netdev_dev_'. */
657 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
659 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
660 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
662 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
663 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
666 if (class == &netdev_tap_class) {
667 destroy_tap(netdev_dev);
671 cache_notifier_unref();
675 netdev_linux_open(struct netdev_dev *netdev_dev_, struct netdev **netdevp)
677 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
678 struct netdev_linux *netdev;
679 enum netdev_flags flags;
682 /* Allocate network device. */
683 netdev = xzalloc(sizeof *netdev);
685 netdev_init(&netdev->netdev, netdev_dev_);
687 /* Verify that the device really exists, by attempting to read its flags.
688 * (The flags might be cached, in which case this won't actually do an
691 * Don't do this for "internal" netdevs, though, because those have to be
692 * created as netdev objects before they exist in the kernel, because
693 * creating them in the kernel happens by passing a netdev object to
694 * dpif_port_add(). */
695 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
696 error = netdev_get_flags(&netdev->netdev, &flags);
697 if (error == ENODEV) {
702 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
703 !netdev_dev->state.tap.opened) {
705 /* We assume that the first user of the tap device is the primary user
706 * and give them the tap FD. Subsequent users probably just expect
707 * this to be a system device so open it normally to avoid send/receive
708 * directions appearing to be reversed. */
709 netdev->fd = netdev_dev->state.tap.fd;
710 netdev_dev->state.tap.opened = true;
713 *netdevp = &netdev->netdev;
717 netdev_uninit(&netdev->netdev, true);
721 /* Closes and destroys 'netdev'. */
723 netdev_linux_close(struct netdev *netdev_)
725 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* Everywhere else in this file a negative 'fd' means "not open", so 0 is a
 * valid descriptor and has to be covered by this test.  Tap netdevs are
 * excluded because their 'fd' is the shared 'state.tap.fd' handed out by
 * netdev_linux_open(), not a descriptor owned by this netdev. */
727 if (netdev->fd >= 0 && strcmp(netdev_get_type(netdev_), "tap")) {
734 netdev_linux_listen(struct netdev *netdev_)
736 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
737 struct sockaddr_ll sll;
742 if (netdev->fd >= 0) {
746 /* Create file descriptor. */
747 fd = socket(PF_PACKET, SOCK_RAW, 0);
750 VLOG_ERR("failed to create raw socket (%s)", strerror(error));
754 /* Set non-blocking mode. */
755 error = set_nonblocking(fd);
760 /* Get ethernet device index. */
761 error = get_ifindex(&netdev->netdev, &ifindex);
766 /* Bind to specific ethernet device. */
767 memset(&sll, 0, sizeof sll);
768 sll.sll_family = AF_PACKET;
769 sll.sll_ifindex = ifindex;
770 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
771 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
773 VLOG_ERR("%s: failed to bind raw socket (%s)",
774 netdev_get_name(netdev_), strerror(error));
/* Receives one packet from 'netdev_' into the 'size'-byte buffer at 'data'.
 *
 * A negative file descriptor means that the device is not listening.  On a
 * successful recv() the packet length is returned, or -EMSGSIZE if the packet
 * was longer than 'size' (MSG_TRUNC makes recv() report the untruncated
 * length).  Errors other than EINTR and EAGAIN are logged, rate-limited, with
 * the device name first and the error text second, matching the format
 * string. */
789 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
791 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
793 if (netdev->fd < 0) {
794 /* Device is not listening. */
799 ssize_t retval = recv(netdev->fd, data, size, MSG_TRUNC);
801 return retval <= size ? retval : -EMSGSIZE;
802 } else if (errno != EINTR) {
803 if (errno != EAGAIN) {
804 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
805 netdev_get_name(netdev_), strerror(errno));
812 /* Registers with the poll loop to wake up from the next call to poll_block()
813 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
815 netdev_linux_recv_wait(struct netdev *netdev_)
817 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
818 if (netdev->fd >= 0) {
819 poll_fd_wait(netdev->fd, POLLIN);
823 /* Discards all packets waiting to be received from 'netdev'. */
825 netdev_linux_drain(struct netdev *netdev_)
827 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
828 if (netdev->fd < 0) {
830 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
832 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
833 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
837 drain_fd(netdev->fd, ifr.ifr_qlen);
840 return drain_rcvbuf(netdev->fd);
844 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
845 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
846 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
847 * the packet is too big or too small to transmit on the device.
849 * The caller retains ownership of 'buffer' in all cases.
851 * The kernel maintains a packet transmission queue, so the caller is not
852 * expected to do additional queuing of packets. */
854 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
856 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
860 if (netdev->fd < 0) {
861 /* Use our AF_PACKET socket to send to this device. */
862 struct sockaddr_ll sll;
869 sock = af_packet_sock();
874 error = get_ifindex(netdev_, &ifindex);
879 /* We don't bother setting most fields in sockaddr_ll because the
880 * kernel ignores them for SOCK_RAW. */
881 memset(&sll, 0, sizeof sll);
882 sll.sll_family = AF_PACKET;
883 sll.sll_ifindex = ifindex;
885 iov.iov_base = (void *) data;
889 msg.msg_namelen = sizeof sll;
892 msg.msg_control = NULL;
893 msg.msg_controllen = 0;
896 retval = sendmsg(sock, &msg, 0);
898 /* Use the netdev's own fd to send to this device. This is
899 * essential for tap devices, because packets sent to a tap device
900 * with an AF_PACKET socket will loop back to be *received* again
901 * on the tap device. */
902 retval = write(netdev->fd, data, size);
906 /* The Linux AF_PACKET implementation never blocks waiting for room
907 * for packets, instead returning ENOBUFS. Translate this into
908 * EAGAIN for the caller. */
909 if (errno == ENOBUFS) {
911 } else if (errno == EINTR) {
913 } else if (errno != EAGAIN) {
914 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
915 netdev_get_name(netdev_), strerror(errno));
918 } else if (retval != size) {
919 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
920 "%zu) on %s", retval, size, netdev_get_name(netdev_));
928 /* Registers with the poll loop to wake up from the next call to poll_block()
929 * when the packet transmission queue has sufficient room to transmit a packet
930 * with netdev_send().
932 * The kernel maintains a packet transmission queue, so the client is not
933 * expected to do additional queuing of packets. Thus, this function is
934 * unlikely to ever be used. It is included for completeness. */
936 netdev_linux_send_wait(struct netdev *netdev_)
938 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
939 if (netdev->fd < 0) {
941 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
942 poll_fd_wait(netdev->fd, POLLOUT);
944 /* TAP device always accepts packets.*/
945 poll_immediate_wake();
949 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
950 * otherwise a positive errno value. */
952 netdev_linux_set_etheraddr(struct netdev *netdev_,
953 const uint8_t mac[ETH_ADDR_LEN])
955 struct netdev_dev_linux *netdev_dev =
956 netdev_dev_linux_cast(netdev_get_dev(netdev_));
959 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
960 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
961 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
963 netdev_dev->cache_valid |= VALID_ETHERADDR;
964 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
972 /* Copies 'netdev''s MAC address into 'mac', which must have room for
973 * ETH_ADDR_LEN bytes. */
975 netdev_linux_get_etheraddr(const struct netdev *netdev_,
976 uint8_t mac[ETH_ADDR_LEN])
978 struct netdev_dev_linux *netdev_dev =
979 netdev_dev_linux_cast(netdev_get_dev(netdev_));
980 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
981 int error = get_etheraddr(netdev_get_name(netdev_),
982 netdev_dev->etheraddr);
986 netdev_dev->cache_valid |= VALID_ETHERADDR;
988 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
992 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
993 * in bytes, not including the hardware header; thus, this is typically 1500
994 * bytes for Ethernet devices. */
996 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
998 struct netdev_dev_linux *netdev_dev =
999 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1000 if (!(netdev_dev->cache_valid & VALID_MTU)) {
1004 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1005 SIOCGIFMTU, "SIOCGIFMTU");
1009 netdev_dev->mtu = ifr.ifr_mtu;
1010 netdev_dev->cache_valid |= VALID_MTU;
1012 *mtup = netdev_dev->mtu;
1016 /* Sets the maximum size of transmitted packets (MTU) for the given device
1017 * using the Linux networking ioctl interface.
1020 netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1022 struct netdev_dev_linux *netdev_dev =
1023 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1027 if (netdev_dev->cache_valid & VALID_MTU &&
1028 netdev_dev->mtu == mtu) {
1032 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1033 SIOCSIFMTU, "SIOCSIFMTU");
1038 netdev_dev->mtu = ifr.ifr_mtu;
1039 netdev_dev->cache_valid |= VALID_MTU;
1043 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1044 * On failure, returns a negative errno value. */
1046 netdev_linux_get_ifindex(const struct netdev *netdev)
1050 error = get_ifindex(netdev, &ifindex);
1051 return error ? -error : ifindex;
/* Stores the carrier state of 'netdev_' in '*carrier'.  When MII monitoring is
 * enabled ('miimon_interval' > 0) the link status from the last MII poll is
 * used; otherwise the IFF_RUNNING bit of the cached interface flags is
 * used. */
1055 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1057 struct netdev_dev_linux *netdev_dev =
1058 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1060 if (netdev_dev->miimon_interval > 0) {
1061 *carrier = netdev_dev->miimon;
1063 *carrier = (netdev_dev->ifi_flags & IFF_RUNNING) != 0;
1069 static long long int
1070 netdev_linux_get_carrier_resets(const struct netdev *netdev)
1072 return netdev_dev_linux_cast(netdev_get_dev(netdev))->carrier_resets;
1076 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1077 struct mii_ioctl_data *data)
1082 memset(&ifr, 0, sizeof ifr);
1083 memcpy(&ifr.ifr_data, data, sizeof *data);
1084 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1085 memcpy(data, &ifr.ifr_data, sizeof *data);
1091 netdev_linux_get_miimon(const char *name, bool *miimon)
1093 struct mii_ioctl_data data;
1098 memset(&data, 0, sizeof data);
1099 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1101 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1102 data.reg_num = MII_BMSR;
1103 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1107 *miimon = !!(data.val_out & BMSR_LSTATUS);
1109 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1112 struct ethtool_cmd ecmd;
1114 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1117 memset(&ecmd, 0, sizeof ecmd);
1118 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1121 struct ethtool_value eval;
1123 memcpy(&eval, &ecmd, sizeof eval);
1124 *miimon = !!eval.data;
1126 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
/* Sets the MII monitoring interval of 'netdev_' to 'interval'.
 *
 * A non-positive 'interval' is stored as 0, which disables polling; a positive
 * one is raised to at least 100 (units are not stated here; presumably
 * milliseconds -- TODO confirm).  When the stored value changes, the miimon
 * timer is marked expired. */
1134 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1135 long long int interval)
1137 struct netdev_dev_linux *netdev_dev;
1139 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
1141 interval = interval > 0 ? MAX(interval, 100) : 0;
1142 if (netdev_dev->miimon_interval != interval) {
1143 netdev_dev->miimon_interval = interval;
1144 timer_set_expired(&netdev_dev->miimon_timer);
1151 netdev_linux_miimon_run(void)
1153 struct shash device_shash;
1154 struct shash_node *node;
1156 shash_init(&device_shash);
1157 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1158 SHASH_FOR_EACH (node, &device_shash) {
1159 struct netdev_dev_linux *dev = node->data;
1162 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1166 netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
1167 if (miimon != dev->miimon) {
1168 dev->miimon = miimon;
1169 netdev_dev_linux_changed(dev, dev->ifi_flags);
1172 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1175 shash_destroy(&device_shash);
1179 netdev_linux_miimon_wait(void)
1181 struct shash device_shash;
1182 struct shash_node *node;
1184 shash_init(&device_shash);
1185 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1186 SHASH_FOR_EACH (node, &device_shash) {
1187 struct netdev_dev_linux *dev = node->data;
1189 if (dev->miimon_interval > 0) {
1190 timer_wait(&dev->miimon_timer);
1193 shash_destroy(&device_shash);
1196 /* Check whether we can use RTM_GETLINK to get network device statistics.
1197 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1200 check_for_working_netlink_stats(void)
1202 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1203 * preferable, so if that works, we'll use it. */
1204 int ifindex = do_get_ifindex("lo");
1206 VLOG_WARN("failed to get ifindex for lo, "
1207 "obtaining netdev stats from proc");
1210 struct netdev_stats stats;
1211 int error = get_stats_via_netlink(ifindex, &stats);
1213 VLOG_DBG("obtaining netdev stats via rtnetlink");
1216 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1217 "via proc (you are probably running a pre-2.6.19 "
1218 "kernel)", strerror(error));
/* Helper taking two uint64_t pointers; netdev_pseudo_get_stats() uses it to
 * exchange rx/tx counters.  NOTE(review): the body is not visible in this
 * chunk. */
1225 swap_uint64(uint64_t *a, uint64_t *b)
/* Attempts to fill 'stats' from the vport layer via netdev_vport_get_stats().
 * The attempt is made only while vport stats are known to work
 * ('have_vport_stats') or have not been probed yet (VALID_HAVE_VPORT_STATS not
 * set in 'cache_valid').  The outcome is recorded in 'have_vport_stats' and
 * the cache bit is set. */
1233 get_stats_via_vport(const struct netdev *netdev_,
1234 struct netdev_stats *stats)
1236 struct netdev_dev_linux *netdev_dev =
1237 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1239 if (netdev_dev->have_vport_stats ||
1240 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1243 error = netdev_vport_get_stats(netdev_, stats);
1245 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1246 "(%s)", netdev_get_name(netdev_), strerror(error));
/* Remember whether the vport query worked so callers can test the flag. */
1248 netdev_dev->have_vport_stats = !error;
1249 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
/* Fetches the kernel's statistics for 'netdev_' into 'stats', through
 * rtnetlink or through /proc.  Which one is used is decided on the first call
 * by check_for_working_netlink_stats() and kept in a function-local static. */
1254 netdev_linux_sys_get_stats(const struct netdev *netdev_,
1255 struct netdev_stats *stats)
/* -1 means "not probed yet". */
1257 static int use_netlink_stats = -1;
1260 if (use_netlink_stats < 0) {
1261 use_netlink_stats = check_for_working_netlink_stats();
1264 if (use_netlink_stats) {
/* The netlink query is keyed by ifindex, so resolve that first. */
1267 error = get_ifindex(netdev_, &ifindex);
1269 error = get_stats_via_netlink(ifindex, stats);
1272 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1276 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1277 netdev_get_name(netdev_), error);
1283 /* Retrieves current device stats for 'netdev-linux'. */
1285 netdev_linux_get_stats(const struct netdev *netdev_,
1286 struct netdev_stats *stats)
1288 struct netdev_dev_linux *netdev_dev =
1289 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1290 struct netdev_stats dev_stats;
/* Two sources are consulted: the vport layer writes straight into 'stats'
 * (and refreshes 'have_vport_stats'), the kernel goes into 'dev_stats'. */
1293 get_stats_via_vport(netdev_, stats);
1295 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1298 if (!netdev_dev->have_vport_stats) {
1305 if (!netdev_dev->have_vport_stats) {
1306 /* stats not available from OVS then use ioctl stats. */
/* The kernel's error counters are added on top of what 'stats' holds.
 * NOTE(review): the branch structure around these lines is not visible in
 * this chunk. */
1309 stats->rx_errors += dev_stats.rx_errors;
1310 stats->tx_errors += dev_stats.tx_errors;
1311 stats->rx_dropped += dev_stats.rx_dropped;
1312 stats->tx_dropped += dev_stats.tx_dropped;
1313 stats->multicast += dev_stats.multicast;
1314 stats->collisions += dev_stats.collisions;
1315 stats->rx_length_errors += dev_stats.rx_length_errors;
1316 stats->rx_over_errors += dev_stats.rx_over_errors;
1317 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1318 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1319 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1320 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1321 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1322 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1323 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1324 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1325 stats->tx_window_errors += dev_stats.tx_window_errors;
1330 /* Retrieves current device stats for 'netdev-tap' netdev or
1331 * netdev-internal. */
1333 netdev_pseudo_get_stats(const struct netdev *netdev_,
1334 struct netdev_stats *stats)
1336 struct netdev_dev_linux *netdev_dev =
1337 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1338 struct netdev_stats dev_stats;
/* Same two-step collection as netdev_linux_get_stats(): vport stats go into
 * 'stats', kernel stats into 'dev_stats'.  Further down, the kernel's rx/tx
 * values are exchanged (swap_uint64() or crossed "+=") before being used. */
1341 get_stats_via_vport(netdev_, stats);
1343 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1345 if (!netdev_dev->have_vport_stats) {
1352 /* If this port is an internal port then the transmit and receive stats
1353 * will appear to be swapped relative to the other ports since we are the
1354 * one sending the data, not a remote computer. For consistency, we swap
1355 * them back here. This does not apply if we are getting stats from the
1356 * vport layer because it always tracks stats from the perspective of the
1358 if (!netdev_dev->have_vport_stats) {
1360 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1361 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1362 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1363 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1364 stats->rx_length_errors = 0;
1365 stats->rx_over_errors = 0;
1366 stats->rx_crc_errors = 0;
1367 stats->rx_frame_errors = 0;
1368 stats->rx_fifo_errors = 0;
1369 stats->rx_missed_errors = 0;
1370 stats->tx_aborted_errors = 0;
1371 stats->tx_carrier_errors = 0;
1372 stats->tx_fifo_errors = 0;
1373 stats->tx_heartbeat_errors = 0;
1374 stats->tx_window_errors = 0;
1376 stats->rx_dropped += dev_stats.tx_dropped;
1377 stats->tx_dropped += dev_stats.rx_dropped;
1379 stats->rx_errors += dev_stats.tx_errors;
1380 stats->tx_errors += dev_stats.rx_errors;
1382 stats->multicast += dev_stats.multicast;
1383 stats->collisions += dev_stats.collisions;
1388 /* Stores the features supported by 'netdev' into each of '*current',
1389 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1390 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
1391 * successful, otherwise a positive errno value. */
1393 netdev_linux_get_features(const struct netdev *netdev,
1394 uint32_t *current, uint32_t *advertised,
1395 uint32_t *supported, uint32_t *peer)
1397 struct ethtool_cmd ecmd;
/* Everything below is derived from one ETHTOOL_GSET query. */
1400 memset(&ecmd, 0, sizeof ecmd);
1401 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1402 ETHTOOL_GSET, "ETHTOOL_GSET");
/* Translate ethtool SUPPORTED_* bits into OpenFlow OFPPF_* bits. */
1407 /* Supported features. */
1409 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1410 *supported |= OFPPF_10MB_HD;
1412 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1413 *supported |= OFPPF_10MB_FD;
1415 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1416 *supported |= OFPPF_100MB_HD;
1418 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1419 *supported |= OFPPF_100MB_FD;
1421 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1422 *supported |= OFPPF_1GB_HD;
1424 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1425 *supported |= OFPPF_1GB_FD;
1427 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1428 *supported |= OFPPF_10GB_FD;
1430 if (ecmd.supported & SUPPORTED_TP) {
1431 *supported |= OFPPF_COPPER;
1433 if (ecmd.supported & SUPPORTED_FIBRE) {
1434 *supported |= OFPPF_FIBER;
1436 if (ecmd.supported & SUPPORTED_Autoneg) {
1437 *supported |= OFPPF_AUTONEG;
1439 if (ecmd.supported & SUPPORTED_Pause) {
1440 *supported |= OFPPF_PAUSE;
1442 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1443 *supported |= OFPPF_PAUSE_ASYM;
/* Same translation for the ADVERTISED_* bits. */
1446 /* Advertised features. */
1448 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1449 *advertised |= OFPPF_10MB_HD;
1451 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1452 *advertised |= OFPPF_10MB_FD;
1454 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1455 *advertised |= OFPPF_100MB_HD;
1457 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1458 *advertised |= OFPPF_100MB_FD;
1460 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1461 *advertised |= OFPPF_1GB_HD;
1463 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1464 *advertised |= OFPPF_1GB_FD;
1466 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1467 *advertised |= OFPPF_10GB_FD;
1469 if (ecmd.advertising & ADVERTISED_TP) {
1470 *advertised |= OFPPF_COPPER;
1472 if (ecmd.advertising & ADVERTISED_FIBRE) {
1473 *advertised |= OFPPF_FIBER;
1475 if (ecmd.advertising & ADVERTISED_Autoneg) {
1476 *advertised |= OFPPF_AUTONEG;
1478 if (ecmd.advertising & ADVERTISED_Pause) {
1479 *advertised |= OFPPF_PAUSE;
1481 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1482 *advertised |= OFPPF_PAUSE_ASYM;
/* The current mode is a single speed/duplex bit chosen from 'ecmd.speed' and
 * 'ecmd.duplex'; 10G is reported as full duplex only. */
1485 /* Current settings. */
1486 if (ecmd.speed == SPEED_10) {
1487 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1488 } else if (ecmd.speed == SPEED_100) {
1489 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1490 } else if (ecmd.speed == SPEED_1000) {
1491 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1492 } else if (ecmd.speed == SPEED_10000) {
1493 *current = OFPPF_10GB_FD;
/* The medium bit comes from the reported port type. */
1498 if (ecmd.port == PORT_TP) {
1499 *current |= OFPPF_COPPER;
1500 } else if (ecmd.port == PORT_FIBRE) {
1501 *current |= OFPPF_FIBER;
/* NOTE(review): the condition guarding the next line is not visible in this
 * chunk. */
1505 *current |= OFPPF_AUTONEG;
1508 /* Peer advertisements. */
1509 *peer = 0; /* XXX */
1514 /* Set the features advertised by 'netdev' to 'advertise'. */
1516 netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1518 struct ethtool_cmd ecmd;
/* Read the current settings first so that only the 'advertising' field is
 * replaced before the same structure is written back with ETHTOOL_SSET. */
1521 memset(&ecmd, 0, sizeof ecmd);
1522 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1523 ETHTOOL_GSET, "ETHTOOL_GSET");
/* Rebuild the mask from scratch out of the OFPPF_* bits in 'advertise'. */
1528 ecmd.advertising = 0;
1529 if (advertise & OFPPF_10MB_HD) {
1530 ecmd.advertising |= ADVERTISED_10baseT_Half;
1532 if (advertise & OFPPF_10MB_FD) {
1533 ecmd.advertising |= ADVERTISED_10baseT_Full;
1535 if (advertise & OFPPF_100MB_HD) {
1536 ecmd.advertising |= ADVERTISED_100baseT_Half;
1538 if (advertise & OFPPF_100MB_FD) {
1539 ecmd.advertising |= ADVERTISED_100baseT_Full;
1541 if (advertise & OFPPF_1GB_HD) {
1542 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1544 if (advertise & OFPPF_1GB_FD) {
1545 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1547 if (advertise & OFPPF_10GB_FD) {
1548 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1550 if (advertise & OFPPF_COPPER) {
1551 ecmd.advertising |= ADVERTISED_TP;
1553 if (advertise & OFPPF_FIBER) {
1554 ecmd.advertising |= ADVERTISED_FIBRE;
1556 if (advertise & OFPPF_AUTONEG) {
1557 ecmd.advertising |= ADVERTISED_Autoneg;
1559 if (advertise & OFPPF_PAUSE) {
1560 ecmd.advertising |= ADVERTISED_Pause;
1562 if (advertise & OFPPF_PAUSE_ASYM) {
1563 ecmd.advertising |= ADVERTISED_Asym_Pause;
/* The result of the ETHTOOL_SSET call is the function's return value. */
1565 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1566 ETHTOOL_SSET, "ETHTOOL_SSET");
1569 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1570 * successful, otherwise a positive errno value. */
1572 netdev_linux_set_policing(struct netdev *netdev,
1573 uint32_t kbits_rate, uint32_t kbits_burst)
1575 struct netdev_dev_linux *netdev_dev =
1576 netdev_dev_linux_cast(netdev_get_dev(netdev));
1577 const char *netdev_name = netdev_get_name(netdev);
1580 COVERAGE_INC(netdev_set_policing);
/* Normalize the burst before comparing it with the cached value. */
1582 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1583 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1584 : kbits_burst); /* Stick with user-specified value. */
1586 if (netdev_dev->cache_valid & VALID_POLICING
1587 && netdev_dev->kbits_rate == kbits_rate
1588 && netdev_dev->kbits_burst == kbits_burst) {
1589 /* Assume that settings haven't changed since we last set them. */
1593 /* Remove any existing ingress qdisc. */
1594 error = tc_add_del_ingress_qdisc(netdev, false);
1596 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1597 netdev_name, strerror(error));
/* Re-create the ingress qdisc and attach the policer to it.
 * NOTE(review): the condition around these two steps (e.g. a non-zero rate
 * check) is not visible in this chunk. */
1602 error = tc_add_del_ingress_qdisc(netdev, true);
1604 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1605 netdev_name, strerror(error));
1609 error = tc_add_policer(netdev, kbits_rate, kbits_burst);
1611 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1612 netdev_name, strerror(error));
/* Record what was requested so an identical later call can be recognized by
 * the cache check above. */
1617 netdev_dev->kbits_rate = kbits_rate;
1618 netdev_dev->kbits_burst = kbits_burst;
1619 netdev_dev->cache_valid |= VALID_POLICING;
/* Adds to 'types' the OVS name of every entry in the NULL-terminated 'tcs'
 * table that has a 'tc_install' hook and a non-empty 'ovs_name'.  'netdev' is
 * unused. */
1625 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1628 const struct tc_ops **opsp;
1630 for (opsp = tcs; *opsp != NULL; opsp++) {
1631 const struct tc_ops *ops = *opsp;
1632 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1633 sset_add(types, ops->ovs_name);
/* Searches the NULL-terminated 'tcs' table for the entry whose 'ovs_name'
 * equals 'name'.  NOTE(review): the return statements are not visible in this
 * chunk; callers treat a null result as "not found". */
1639 static const struct tc_ops *
1640 tc_lookup_ovs_name(const char *name)
1642 const struct tc_ops **opsp;
1644 for (opsp = tcs; *opsp != NULL; opsp++) {
1645 const struct tc_ops *ops = *opsp;
1646 if (!strcmp(name, ops->ovs_name)) {
/* Like tc_lookup_ovs_name() but matches on 'linux_name'.  Entries whose
 * 'linux_name' is null are skipped rather than compared.  NOTE(review): the
 * return statements are not visible in this chunk. */
1653 static const struct tc_ops *
1654 tc_lookup_linux_name(const char *name)
1656 const struct tc_ops **opsp;
1658 for (opsp = tcs; *opsp != NULL; opsp++) {
1659 const struct tc_ops *ops = *opsp;
1660 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Scans the 'hash' bucket of the device's 'tc->queues' map for the queue whose
 * 'queue_id' matches.  The hash is supplied by the caller (tc_find_queue()
 * computes it with hash_int(queue_id, 0)).  NOTE(review): the return
 * statements are not visible in this chunk. */
1667 static struct tc_queue *
1668 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1671 struct netdev_dev_linux *netdev_dev =
1672 netdev_dev_linux_cast(netdev_get_dev(netdev));
1673 struct tc_queue *queue;
1675 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1676 if (queue->queue_id == queue_id) {
/* Convenience wrapper around tc_find_queue__() that derives the hash from
 * 'queue_id' with hash_int(queue_id, 0). */
1683 static struct tc_queue *
1684 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1686 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Looks up the traffic-control implementation named by 'type' (an OVS QoS
 * type name) and reports its 'n_queues' in 'caps'.  'netdev' is unused.
 * NOTE(review): handling of an unknown 'type' is not visible in this chunk. */
1690 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1692 struct netdev_qos_capabilities *caps)
1694 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1698 caps->n_queues = ops->n_queues;
/* Reports the QoS type currently installed on 'netdev' in '*typep' (the OVS
 * name of the device's tc ops) and, when the ops provide 'qdisc_get', lets
 * that hook fill 'details'.  tc_query_qdisc() is called first. */
1703 netdev_linux_get_qos(const struct netdev *netdev,
1704 const char **typep, struct shash *details)
1706 struct netdev_dev_linux *netdev_dev =
1707 netdev_dev_linux_cast(netdev_get_dev(netdev));
1710 error = tc_query_qdisc(netdev);
1715 *typep = netdev_dev->tc->ops->ovs_name;
1716 return (netdev_dev->tc->ops->qdisc_get
1717 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
/* Installs or reconfigures the QoS discipline 'type' on 'netdev' using
 * 'details'.  If the requested type is the one already installed, only its
 * 'qdisc_set' hook (if any) runs; otherwise the existing qdisc is deleted and
 * the new type's 'tc_install' hook is called. */
1722 netdev_linux_set_qos(struct netdev *netdev,
1723 const char *type, const struct shash *details)
1725 struct netdev_dev_linux *netdev_dev =
1726 netdev_dev_linux_cast(netdev_get_dev(netdev));
1727 const struct tc_ops *new_ops;
/* The type must be known and installable. */
1730 new_ops = tc_lookup_ovs_name(type);
1731 if (!new_ops || !new_ops->tc_install) {
1735 error = tc_query_qdisc(netdev);
1740 if (new_ops == netdev_dev->tc->ops) {
1741 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1743 /* Delete existing qdisc. */
1744 error = tc_del_qdisc(netdev);
/* After a delete, the device is expected to have no tc state. */
1748 assert(netdev_dev->tc == NULL);
1750 /* Install new qdisc. */
1751 error = new_ops->tc_install(netdev, details);
/* Invariant: tc_install sets 'tc' exactly when it reports success. */
1752 assert((error == 0) == (netdev_dev->tc != NULL));
/* Retrieves the configuration of queue 'queue_id' on 'netdev' into 'details'
 * through the installed tc ops' 'class_get' hook, after tc_query_qdisc() and
 * a tc_find_queue() lookup.  NOTE(review): the error paths and the condition
 * preceding the class_get call are not visible in this chunk. */
1759 netdev_linux_get_queue(const struct netdev *netdev,
1760 unsigned int queue_id, struct shash *details)
1762 struct netdev_dev_linux *netdev_dev =
1763 netdev_dev_linux_cast(netdev_get_dev(netdev));
1766 error = tc_query_qdisc(netdev);
1770 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1772 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
/* Configures queue 'queue_id' on 'netdev' from 'details' by delegating to the
 * installed tc ops' 'class_set' hook.  The call is made only when 'queue_id'
 * is below the ops' 'n_queues' and the hook exists.  NOTE(review): the values
 * returned on the rejected paths are not visible in this chunk. */
1778 netdev_linux_set_queue(struct netdev *netdev,
1779 unsigned int queue_id, const struct shash *details)
1781 struct netdev_dev_linux *netdev_dev =
1782 netdev_dev_linux_cast(netdev_get_dev(netdev));
1785 error = tc_query_qdisc(netdev);
1788 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1789 || !netdev_dev->tc->ops->class_set) {
1793 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
/* Deletes queue 'queue_id' from 'netdev' through the installed tc ops'
 * 'class_delete' hook, if the ops provide one.  The queue is located with
 * tc_find_queue().  NOTE(review): the error paths and the condition preceding
 * the class_delete call are not visible in this chunk. */
1797 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1799 struct netdev_dev_linux *netdev_dev =
1800 netdev_dev_linux_cast(netdev_get_dev(netdev));
1803 error = tc_query_qdisc(netdev);
1806 } else if (!netdev_dev->tc->ops->class_delete) {
1809 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1811 ? netdev_dev->tc->ops->class_delete(netdev, queue)
/* Retrieves statistics for queue 'queue_id' on 'netdev' into 'stats' through
 * the installed tc ops' 'class_get_stats' hook, if the ops provide one.
 * NOTE(review): the error paths and the condition preceding the hook call are
 * not visible in this chunk. */
1817 netdev_linux_get_queue_stats(const struct netdev *netdev,
1818 unsigned int queue_id,
1819 struct netdev_queue_stats *stats)
1821 struct netdev_dev_linux *netdev_dev =
1822 netdev_dev_linux_cast(netdev_get_dev(netdev));
1825 error = tc_query_qdisc(netdev);
1828 } else if (!netdev_dev->tc->ops->class_get_stats) {
1831 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1833 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
/* Starts a Netlink dump of 'netdev''s traffic classes: builds an
 * RTM_GETTCLASS request with 'tcm_parent' 0, hands it to nl_dump_start() on
 * 'rtnl_sock' with 'dump' as the dump state, then uninitializes the request
 * buffer.  netdev_linux_dump_queue_stats() treats a false result as failure. */
1839 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1841 struct ofpbuf request;
1842 struct tcmsg *tcmsg;
1844 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1848 tcmsg->tcm_parent = 0;
1849 nl_dump_start(dump, rtnl_sock, &request);
1850 ofpbuf_uninit(&request);
/* Iterates over the queues cached in the device's 'tc->queues' map and, for
 * each, obtains its details through the tc ops' 'class_get' hook and passes
 * them to 'cb' together with the queue id and 'aux'.  A single 'details' shash
 * is reused (cleared before each queue, destroyed at the end).
 * NOTE(review): how per-queue errors are accumulated is not visible in this
 * chunk. */
1855 netdev_linux_dump_queues(const struct netdev *netdev,
1856 netdev_dump_queues_cb *cb, void *aux)
1858 struct netdev_dev_linux *netdev_dev =
1859 netdev_dev_linux_cast(netdev_get_dev(netdev));
1860 struct tc_queue *queue;
1861 struct shash details;
1865 error = tc_query_qdisc(netdev);
1868 } else if (!netdev_dev->tc->ops->class_get) {
1873 shash_init(&details);
1874 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
1875 shash_clear(&details);
1877 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
1879 (*cb)(queue->queue_id, &details, aux);
1884 shash_destroy(&details);
/* Dumps per-queue statistics for 'netdev' by running a Netlink class dump
 * (start_queue_dump()) and feeding every reply to the tc ops'
 * 'class_dump_stats' hook along with 'cb' and 'aux'.  An error from
 * nl_dump_done() takes precedence over 'last_error' in the return value. */
1890 netdev_linux_dump_queue_stats(const struct netdev *netdev,
1891 netdev_dump_queue_stats_cb *cb, void *aux)
1893 struct netdev_dev_linux *netdev_dev =
1894 netdev_dev_linux_cast(netdev_get_dev(netdev));
1895 struct nl_dump dump;
1900 error = tc_query_qdisc(netdev);
1903 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1908 if (!start_queue_dump(netdev, &dump)) {
1911 while (nl_dump_next(&dump, &msg)) {
1912 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1918 error = nl_dump_done(&dump);
1919 return error ? error : last_error;
/* Stores 'netdev_''s IPv4 address and netmask in '*address' and '*netmask'.
 * Both are fetched with SIOCGIFADDR / SIOCGIFNETMASK the first time and then
 * served from the device's cache (VALID_IN4).  Returns EADDRNOTAVAIL when the
 * address is INADDR_ANY, otherwise 0. */
1923 netdev_linux_get_in4(const struct netdev *netdev_,
1924 struct in_addr *address, struct in_addr *netmask)
1926 struct netdev_dev_linux *netdev_dev =
1927 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1929 if (!(netdev_dev->cache_valid & VALID_IN4)) {
1932 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
1933 SIOCGIFADDR, "SIOCGIFADDR");
1938 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
1939 SIOCGIFNETMASK, "SIOCGIFNETMASK");
1944 netdev_dev->cache_valid |= VALID_IN4;
1946 *address = netdev_dev->address;
1947 *netmask = netdev_dev->netmask;
1948 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
/* Assigns IPv4 'address' and 'netmask' to 'netdev_'.  The address is set with
 * SIOCSIFADDR; the cache is updated to the new values; the netmask is pushed
 * with SIOCSIFNETMASK only when 'address' is not INADDR_ANY.
 * NOTE(review): the condition between the first ioctl and the cache update is
 * not visible in this chunk. */
1952 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
1953 struct in_addr netmask)
1955 struct netdev_dev_linux *netdev_dev =
1956 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1959 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
1961 netdev_dev->cache_valid |= VALID_IN4;
1962 netdev_dev->address = address;
1963 netdev_dev->netmask = netmask;
1964 if (address.s_addr != INADDR_ANY) {
1965 error = do_set_addr(netdev_, SIOCSIFNETMASK,
1966 "SIOCSIFNETMASK", netmask);
/* Parses one line of /proc/net/if_inet6 (see netdev_linux_get_in6()).  The
 * format reads sixteen two-hex-digit bytes into 'in6', skips four hex fields
 * and then reads an interface name of at most 16 characters into 'ifname'.
 * NOTE(review): the scanning call and its success test are not visible in this
 * chunk. */
1973 parse_if_inet6_line(const char *line,
1974 struct in6_addr *in6, char ifname[16 + 1])
1976 uint8_t *s6 = in6->s6_addr;
1977 #define X8 "%2"SCNx8
1979 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
1980 "%*x %*x %*x %*x %16s\n",
1981 &s6[0], &s6[1], &s6[2], &s6[3],
1982 &s6[4], &s6[5], &s6[6], &s6[7],
1983 &s6[8], &s6[9], &s6[10], &s6[11],
1984 &s6[12], &s6[13], &s6[14], &s6[15],
1988 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
1989 * 'in6' is non-null) and returns true. Otherwise, returns false. */
1991 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
1993 struct netdev_dev_linux *netdev_dev =
1994 netdev_dev_linux_cast(netdev_get_dev(netdev_));
/* The address is looked up once and then served from the device cache. */
1995 if (!(netdev_dev->cache_valid & VALID_IN6)) {
/* Default to the unspecified address in case no matching line is found. */
1999 netdev_dev->in6 = in6addr_any;
2001 file = fopen("/proc/net/if_inet6", "r");
2003 const char *name = netdev_get_name(netdev_);
/* Scan for a line whose interface name equals this device's name. */
2004 while (fgets(line, sizeof line, file)) {
2005 struct in6_addr in6_tmp;
2006 char ifname[16 + 1];
2007 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2008 && !strcmp(name, ifname))
2010 netdev_dev->in6 = in6_tmp;
2016 netdev_dev->cache_valid |= VALID_IN6;
2018 *in6 = netdev_dev->in6;
/* Fills the generic '*sa' with an AF_INET socket address carrying 'addr'.
 * The address is built in a zeroed 'struct sockaddr_in' and then copied over
 * a zeroed '*sa'. */
2023 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2025 struct sockaddr_in sin;
2026 memset(&sin, 0, sizeof sin);
2027 sin.sin_family = AF_INET;
2028 sin.sin_addr = addr;
2031 memset(sa, 0, sizeof *sa);
2032 memcpy(sa, &sin, sizeof sin);
/* Issues the address-setting ioctl 'ioctl_nr' on 'netdev' with 'addr' as the
 * IPv4 address in the request.  'ioctl_name' is forwarded to
 * netdev_linux_do_ioctl() along with the request. */
2036 do_set_addr(struct netdev *netdev,
2037 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2040 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2041 make_in4_sockaddr(&ifr.ifr_addr, addr);
2043 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2047 /* Adds 'router' as a default IP gateway. */
2049 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2051 struct in_addr any = { INADDR_ANY };
/* Destination and genmask are both INADDR_ANY, which is what makes this the
 * default route; 'router' is the gateway. */
2055 memset(&rt, 0, sizeof rt);
2056 make_in4_sockaddr(&rt.rt_dst, any);
2057 make_in4_sockaddr(&rt.rt_gateway, router);
2058 make_in4_sockaddr(&rt.rt_genmask, any);
2059 rt.rt_flags = RTF_UP | RTF_GATEWAY;
/* ioctl() failure is converted into the errno value. */
2060 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2062 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
/* Looks in /proc/net/route for a route that is up and whose masked
 * destination matches 'host'.  On a match, '*next_hop' receives the gateway
 * (or 0 when the host is directly reachable) and '*netdev_name' a copy of the
 * interface name made with xstrdup() (presumably owned by the caller).
 * '*netdev_name' is set to NULL up front.  NOTE(review): the return values and
 * loop exits are not visible in this chunk. */
2068 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2071 static const char fn[] = "/proc/net/route";
2076 *netdev_name = NULL;
2077 stream = fopen(fn, "r");
2078 if (stream == NULL) {
2079 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2084 while (fgets(line, sizeof line, stream)) {
2087 ovs_be32 dest, gateway, mask;
2088 int refcnt, metric, mtu;
2089 unsigned int flags, use, window, irtt;
/* A well-formed route line yields exactly 11 converted fields. */
2092 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2094 iface, &dest, &gateway, &flags, &refcnt,
2095 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2097 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2101 if (!(flags & RTF_UP)) {
2102 /* Skip routes that aren't up. */
2106 /* The output of 'dest', 'mask', and 'gateway' were given in
2107 * network byte order, so we don't need any endian
2108 * conversions here. */
2109 if ((dest & mask) == (host->s_addr & mask)) {
2111 /* The host is directly reachable. */
2112 next_hop->s_addr = 0;
2114 /* To reach the host, we must go through a gateway. */
2115 next_hop->s_addr = gateway;
2117 *netdev_name = xstrdup(iface);
/* Adds driver information for 'netdev' to 'sh'.  The data comes from an
 * ETHTOOL_GDRVINFO query; the keys "driver_name", "driver_version" and
 * "firmware_version" map to xstrdup()'d copies of the corresponding strings. */
2129 netdev_linux_get_status(const struct netdev *netdev, struct shash *sh)
2131 struct ethtool_drvinfo drvinfo;
2134 memset(&drvinfo, 0, sizeof drvinfo);
/* netdev_linux_do_ethtool() takes a 'struct ethtool_cmd *', hence the cast. */
2135 error = netdev_linux_do_ethtool(netdev_get_name(netdev),
2136 (struct ethtool_cmd *)&drvinfo,
2138 "ETHTOOL_GDRVINFO");
2140 shash_add(sh, "driver_name", xstrdup(drvinfo.driver));
2141 shash_add(sh, "driver_version", xstrdup(drvinfo.version));
2142 shash_add(sh, "firmware_version", xstrdup(drvinfo.fw_version));
2148 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2149 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2150 * returns 0. Otherwise, it returns a positive errno value; in particular,
2151 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
2153 netdev_linux_arp_lookup(const struct netdev *netdev,
2154 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2157 struct sockaddr_in sin;
/* Build the SIOCGARP request: protocol address = 'ip', hardware type =
 * Ethernet, device = this netdev's name. */
2160 memset(&r, 0, sizeof r);
2161 memset(&sin, 0, sizeof sin);
2162 sin.sin_family = AF_INET;
2163 sin.sin_addr.s_addr = ip;
2165 memcpy(&r.arp_pa, &sin, sizeof sin);
2166 r.arp_ha.sa_family = ARPHRD_ETHER;
2168 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2169 COVERAGE_INC(netdev_arp_lookup);
2170 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2172 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
/* ENXIO (no entry) is not logged; other failures are. */
2173 } else if (retval != ENXIO) {
2174 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2175 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
/* Converts a set of 'enum netdev_flags' into interface flags; NETDEV_UP and
 * NETDEV_PROMISC are the bits examined.  NOTE(review): the assignments and the
 * return are not visible in this chunk. */
2181 nd_to_iff_flags(enum netdev_flags nd)
2184 if (nd & NETDEV_UP) {
2187 if (nd & NETDEV_PROMISC) {
/* Converts interface flags 'iff' into 'enum netdev_flags'; IFF_PROMISC maps to
 * NETDEV_PROMISC.  NOTE(review): the remaining mappings and the return are not
 * visible in this chunk. */
2194 iff_to_nd_flags(int iff)
2196 enum netdev_flags nd = 0;
2200 if (iff & IFF_PROMISC) {
2201 nd |= NETDEV_PROMISC;
/* Clears the flags in 'off' and sets the flags in 'on' for 'netdev'.  The
 * previous flags (as 'enum netdev_flags') are stored in '*old_flagsp'.  The
 * kernel is touched only when the resulting interface flags differ from the
 * cached 'ifi_flags'; afterwards the cache is re-read with get_flags(). */
2207 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2208 enum netdev_flags on, enum netdev_flags *old_flagsp)
2210 struct netdev_dev_linux *netdev_dev;
2211 int old_flags, new_flags;
2214 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2215 old_flags = netdev_dev->ifi_flags;
2216 *old_flagsp = iff_to_nd_flags(old_flags);
/* 'off' is applied before 'on', so a bit present in both ends up set. */
2217 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2218 if (new_flags != old_flags) {
2219 error = set_flags(netdev, new_flags);
2220 get_flags(&netdev_dev->netdev_dev, &netdev_dev->ifi_flags);
/* Returns the 'change_seq' value stored on the device behind 'netdev'. */
2226 netdev_linux_change_seq(const struct netdev *netdev)
2228 return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
/* Expands to an initializer for a 'struct netdev_class' in which almost every
 * callback is the shared netdev_linux_* implementation.  NAME, CREATE,
 * GET_STATS and SET_STATS are supplied per class (see the instantiations
 * below).  NOTE(review): the lines that place the four parameters and several
 * other entries are not visible in this chunk. */
2231 #define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS) \
2235 netdev_linux_init, \
2237 netdev_linux_wait, \
2240 netdev_linux_destroy, \
2241 NULL, /* get_config */ \
2242 NULL, /* set_config */ \
2244 netdev_linux_open, \
2245 netdev_linux_close, \
2247 netdev_linux_listen, \
2248 netdev_linux_recv, \
2249 netdev_linux_recv_wait, \
2250 netdev_linux_drain, \
2252 netdev_linux_send, \
2253 netdev_linux_send_wait, \
2255 netdev_linux_set_etheraddr, \
2256 netdev_linux_get_etheraddr, \
2257 netdev_linux_get_mtu, \
2258 netdev_linux_set_mtu, \
2259 netdev_linux_get_ifindex, \
2260 netdev_linux_get_carrier, \
2261 netdev_linux_get_carrier_resets, \
2262 netdev_linux_set_miimon_interval, \
2266 netdev_linux_get_features, \
2267 netdev_linux_set_advertisements, \
2269 netdev_linux_set_policing, \
2270 netdev_linux_get_qos_types, \
2271 netdev_linux_get_qos_capabilities, \
2272 netdev_linux_get_qos, \
2273 netdev_linux_set_qos, \
2274 netdev_linux_get_queue, \
2275 netdev_linux_set_queue, \
2276 netdev_linux_delete_queue, \
2277 netdev_linux_get_queue_stats, \
2278 netdev_linux_dump_queues, \
2279 netdev_linux_dump_queue_stats, \
2281 netdev_linux_get_in4, \
2282 netdev_linux_set_in4, \
2283 netdev_linux_get_in6, \
2284 netdev_linux_add_router, \
2285 netdev_linux_get_next_hop, \
2286 netdev_linux_get_status, \
2287 netdev_linux_arp_lookup, \
2289 netdev_linux_update_flags, \
2291 netdev_linux_change_seq \
/* The three netdev classes built from NETDEV_LINUX_CLASS.  They differ only in
 * the create/get_stats/set_stats arguments shown here (the NAME arguments are
 * not visible in this chunk). */

/* Created by netdev_linux_create(); stats from netdev_linux_get_stats(); no
 * set_stats. */
2294 const struct netdev_class netdev_linux_class =
2297 netdev_linux_create,
2298 netdev_linux_get_stats,
2299 NULL); /* set_stats */
/* Created by netdev_linux_create_tap(); stats from netdev_pseudo_get_stats();
 * no set_stats. */
2301 const struct netdev_class netdev_tap_class =
2304 netdev_linux_create_tap,
2305 netdev_pseudo_get_stats,
2306 NULL); /* set_stats */
/* Created by netdev_linux_create(); stats from netdev_pseudo_get_stats();
 * set_stats handled by netdev_vport_set_stats(). */
2308 const struct netdev_class netdev_internal_class =
2311 netdev_linux_create,
2312 netdev_pseudo_get_stats,
2313 netdev_vport_set_stats);
2315 /* HTB traffic control class. */
/* Largest class minor number that htb_parse_tcmsg__() accepts as a queue
 * (queue ids are minor - 1). */
2317 #define HTB_N_QUEUES 0xf000
/* Field of the per-qdisc HTB state ('struct htb'; its opening lines are not
 * visible in this chunk).  Set by htb_install__(). */
2321 unsigned int max_rate; /* In bytes/s. */
/* Fields of the per-queue 'struct htb_class' (its opening line is not visible
 * in this chunk).  'tc_queue' is the embedded generic queue that
 * htb_class_cast__() maps back to the containing htb_class. */
2325 struct tc_queue tc_queue;
2326 unsigned int min_rate; /* In bytes/s. */
2327 unsigned int max_rate; /* In bytes/s. */
2328 unsigned int burst; /* In bytes. */
2329 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the 'struct htb' that contains the 'tc' installed on 'netdev''s
 * device.  Assumes the installed tc is an HTB one (it is recovered with
 * CONTAINER_OF without any check). */
2333 htb_get__(const struct netdev *netdev)
2335 struct netdev_dev_linux *netdev_dev =
2336 netdev_dev_linux_cast(netdev_get_dev(netdev));
2337 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
/* Allocates the in-memory HTB state for 'netdev', initializes its embedded
 * 'tc' with 'tc_ops_htb', records 'max_rate', and attaches it to the device
 * as 'netdev_dev->tc'.  Note that 'max_rate' is a uint64_t here while
 * 'htb->max_rate' is declared 'unsigned int'. */
2341 htb_install__(struct netdev *netdev, uint64_t max_rate)
2343 struct netdev_dev_linux *netdev_dev =
2344 netdev_dev_linux_cast(netdev_get_dev(netdev));
2347 htb = xmalloc(sizeof *htb);
2348 tc_init(&htb->tc, &tc_ops_htb);
2349 htb->max_rate = max_rate;
2351 netdev_dev->tc = &htb->tc;
2354 /* Create an HTB qdisc.
2356 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2358 htb_setup_qdisc__(struct netdev *netdev)
2361 struct tc_htb_glob opt;
2362 struct ofpbuf request;
2363 struct tcmsg *tcmsg;
/* Start from a clean slate; the result of the delete is not checked here. */
2365 tc_del_qdisc(netdev);
2367 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2368 NLM_F_EXCL | NLM_F_CREATE, &request);
/* Root qdisc with handle 1:0. */
2372 tcmsg->tcm_handle = tc_make_handle(1, 0);
2373 tcmsg->tcm_parent = TC_H_ROOT;
2375 nl_msg_put_string(&request, TCA_KIND, "htb");
2377 memset(&opt, 0, sizeof opt);
2378 opt.rate2quantum = 10;
/* The HTB global options travel as TCA_HTB_INIT nested in TCA_OPTIONS. */
2382 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2383 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2384 nl_msg_end_nested(&request, opt_offset);
2386 return tc_transact(&request, NULL);
2389 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2390 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2392 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2393 unsigned int parent, struct htb_class *class)
2396 struct tc_htb_opt opt;
2397 struct ofpbuf request;
2398 struct tcmsg *tcmsg;
/* The MTU feeds the rate and buffer computations below. */
2402 error = netdev_get_mtu(netdev, &mtu);
2404 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2405 netdev_get_name(netdev));
/* 'rate' comes from min_rate and 'ceil' from max_rate; each gets its own
 * buffer computed from the same burst. */
2409 memset(&opt, 0, sizeof opt);
2410 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2411 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2412 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2413 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2414 opt.prio = class->priority;
2416 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2420 tcmsg->tcm_handle = handle;
2421 tcmsg->tcm_parent = parent;
/* TCA_OPTIONS carries the HTB parameters plus one rate table each for rate
 * and ceil. */
2423 nl_msg_put_string(&request, TCA_KIND, "htb");
2424 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2425 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2426 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2427 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2428 nl_msg_end_nested(&request, opt_offset);
2430 error = tc_transact(&request, NULL);
2432 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2433 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2434 netdev_get_name(netdev),
2435 tc_get_major(handle), tc_get_minor(handle),
2436 tc_get_major(parent), tc_get_minor(parent),
2437 class->min_rate, class->max_rate,
2438 class->burst, class->priority, strerror(error));
/* Summary: extracts min_rate, max_rate, burst and priority from the
 * TCA_HTB_PARMS attribute nested in 'nl_options' and stores them in 'class'.
 * Burst is converted from ticks with tc_ticks_to_bytes().  (The older text
 * below calls the destination 'details'; the parameter is named 'class'.) */
2443 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2444 * description of them into 'details'. The description complies with the
2445 * specification given in the vswitch database documentation for linux-htb
2448 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2450 static const struct nl_policy tca_htb_policy[] = {
2451 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2452 .min_len = sizeof(struct tc_htb_opt) },
2455 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2456 const struct tc_htb_opt *htb;
2458 if (!nl_parse_nested(nl_options, tca_htb_policy,
2459 attrs, ARRAY_SIZE(tca_htb_policy))) {
2460 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2464 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2465 class->min_rate = htb->rate.rate;
2466 class->max_rate = htb->ceil.rate;
2467 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2468 class->priority = htb->prio;
/* Parses the traffic-class message 'tcmsg' with tc_parse_class().  When
 * 'queue_id' is non-null, the class handle is converted to a queue id:
 * handles with major 1 and minor in 1..HTB_N_QUEUES map to 'minor - 1'.  When
 * 'options' is non-null, the nested HTB options are decoded into it.  'stats'
 * is passed straight through to tc_parse_class().  NOTE(review): the handling
 * of handles outside that range is not visible in this chunk. */
2473 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2474 struct htb_class *options,
2475 struct netdev_queue_stats *stats)
2477 struct nlattr *nl_options;
2478 unsigned int handle;
2481 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2482 if (!error && queue_id) {
2483 unsigned int major = tc_get_major(handle);
2484 unsigned int minor = tc_get_minor(handle);
2485 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2486 *queue_id = minor - 1;
2491 if (!error && options) {
2492 error = htb_parse_tca_options__(nl_options, options);
/* Fills 'hc' with the parameters of the class that htb_tc_install() creates
 * for the qdisc, based on 'details'.  "max-rate" is divided by 8 (the class
 * fields are documented as bytes/s, so the detail is presumably in bits/s).
 * If it is absent or evaluates to zero, the rate is derived from the device's
 * current features via netdev_features_to_bps().  'min_rate' is set equal to
 * 'max_rate'. */
2498 htb_parse_qdisc_details__(struct netdev *netdev,
2499 const struct shash *details, struct htb_class *hc)
2501 const char *max_rate_s;
2503 max_rate_s = shash_find_data(details, "max-rate");
2504 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2505 if (!hc->max_rate) {
/* Only the current feature set is requested; the other outputs are NULL. */
2508 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2509 hc->max_rate = netdev_features_to_bps(current) / 8;
2511 hc->min_rate = hc->max_rate;
/* Parses the per-queue settings "min-rate", "max-rate", "burst" and
 * "priority" from 'details' into 'hc'.  The first three are divided by 8 (the
 * class fields are documented as bytes, so the details are presumably in
 * bits).  Clamping, as coded below:
 *   - min_rate is raised to at least the MTU, then capped at the qdisc's
 *     max_rate;
 *   - max_rate is raised to at least min_rate, then capped at the qdisc's
 *     max_rate;
 *   - burst is raised to at least MTU + 64;
 *   - priority defaults to 0.
 * NOTE(review): the default used when "max-rate" is absent is not visible in
 * this chunk. */
2517 htb_parse_class_details__(struct netdev *netdev,
2518 const struct shash *details, struct htb_class *hc)
2520 const struct htb *htb = htb_get__(netdev);
2521 const char *min_rate_s = shash_find_data(details, "min-rate");
2522 const char *max_rate_s = shash_find_data(details, "max-rate");
2523 const char *burst_s = shash_find_data(details, "burst");
2524 const char *priority_s = shash_find_data(details, "priority");
2527 error = netdev_get_mtu(netdev, &mtu);
2529 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2530 netdev_get_name(netdev));
2534 /* HTB requires at least an mtu sized min-rate to send any traffic even
2535 * on uncongested links. */
2536 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2537 hc->min_rate = MAX(hc->min_rate, mtu);
2538 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2541 hc->max_rate = (max_rate_s
2542 ? strtoull(max_rate_s, NULL, 10) / 8
2544 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2545 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2549 * According to hints in the documentation that I've read, it is important
2550 * that 'burst' be at least as big as the largest frame that might be
2551 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2552 * but having it a bit too small is a problem. Since netdev_get_mtu()
2553 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2554 * the MTU. We actually add 64, instead of 14, as a guard against
2555 * additional headers get tacked on somewhere that we're not aware of. */
2556 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2557 hc->burst = MAX(hc->burst, mtu + 64);
2560 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for the class 'handle' with parent 'parent' on 'netdev'
 * using tc_query_class(), parses the reply into 'options' and 'stats' with
 * htb_parse_tcmsg__() (no queue ID is requested), and frees the reply. */
2566 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2567 unsigned int parent, struct htb_class *options,
2568 struct netdev_queue_stats *stats)
2570 struct ofpbuf *reply;
2573 error = tc_query_class(netdev, handle, parent, &reply);
2575 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2576 ofpbuf_delete(reply);
/* Sets up an HTB qdisc on 'netdev' with htb_setup_qdisc__(), configures class
 * 1:fffe under 1:0 from the qdisc-level 'details', and passes the resulting
 * max_rate to htb_install__(). */
2582 htb_tc_install(struct netdev *netdev, const struct shash *details)
2586 error = htb_setup_qdisc__(netdev);
2588 struct htb_class hc;
2590 htb_parse_qdisc_details__(netdev, details, &hc);
2591 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2592 tc_make_handle(1, 0), &hc);
2594 htb_install__(netdev, hc.max_rate);
/* Returns the struct htb_class that contains 'queue' as its 'tc_queue'
 * member. */
2600 static struct htb_class *
2601 htb_class_cast__(const struct tc_queue *queue)
2603 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Records the settings in 'hc' for queue 'queue_id' of 'netdev'.  The queue is
 * looked up with tc_find_queue__(); a newly allocated htb_class is inserted
 * into the HTB queue map when one is needed.  min_rate, max_rate, burst and
 * priority are then copied from 'hc'. */
2607 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2608 const struct htb_class *hc)
2610 struct htb *htb = htb_get__(netdev);
2611 size_t hash = hash_int(queue_id, 0);
2612 struct tc_queue *queue;
2613 struct htb_class *hcp;
2615 queue = tc_find_queue__(netdev, queue_id, hash);
2617 hcp = htb_class_cast__(queue);
2619 hcp = xmalloc(sizeof *hcp);
2620 queue = &hcp->tc_queue;
2621 queue->queue_id = queue_id;
/* The same hash is used for lookup above and for insertion here. */
2622 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2625 hcp->min_rate = hc->min_rate;
2626 hcp->max_rate = hc->max_rate;
2627 hcp->burst = hc->burst;
2628 hcp->priority = hc->priority;
/* Loads the HTB configuration of 'netdev' from the kernel: queries class
 * 1:fffe for the qdisc-wide options, passes its max_rate to htb_install__(),
 * then dumps the classes and records each one that parses successfully with
 * htb_update_queue__().  'nlmsg' is unused. */
2632 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2635 struct nl_dump dump;
2636 struct htb_class hc;
2638 /* Get qdisc options. */
2640 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2641 htb_install__(netdev, hc.max_rate);
2644 if (!start_queue_dump(netdev, &dump)) {
2647 while (nl_dump_next(&dump, &msg)) {
2648 unsigned int queue_id;
/* Messages that fail to parse are not recorded. */
2650 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2651 htb_update_queue__(netdev, queue_id, &hc);
2654 nl_dump_done(&dump);
/* Tears down the HTB state that embeds 'tc': walks every queue in the queue
 * map (using the _SAFE iterator because entries are removed during the walk)
 * and unlinks it. */
2660 htb_tc_destroy(struct tc *tc)
2662 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2663 struct htb_class *hc, *next;
2665 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2666 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Adds "max-rate" to 'details', formatted as 8 times the stored
 * 'htb->max_rate' (the inverse of the division by 8 applied when parsing). */
2674 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2676 const struct htb *htb = htb_get__(netdev);
2677 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
/* Reconfigures the qdisc-wide class 1:fffe (parent 1:0) of 'netdev' from
 * 'details' and stores the new max_rate in the device's HTB state. */
2682 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2684 struct htb_class hc;
2687 htb_parse_qdisc_details__(netdev, details, &hc);
2688 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2689 tc_make_handle(1, 0), &hc);
2691 htb_get__(netdev)->max_rate = hc.max_rate;
/* Adds the settings of 'queue' to 'details'.  "min-rate", "max-rate" and
 * "burst" are reported as 8 times the stored values.  "max-rate" is only
 * added when it differs from min_rate.  'netdev' is unused. */
2697 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2698 const struct tc_queue *queue, struct shash *details)
2700 const struct htb_class *hc = htb_class_cast__(queue);
2702 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2703 if (hc->min_rate != hc->max_rate) {
2704 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2706 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2708 shash_add(details, "priority", xasprintf("%u", hc->priority));
/* Configures queue 'queue_id' of 'netdev' from 'details': parses them, sets up
 * kernel class 1:(queue_id + 1) under parent 1:fffe, and records the settings
 * with htb_update_queue__(). */
2714 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2715 const struct shash *details)
2717 struct htb_class hc;
2720 error = htb_parse_class_details__(netdev, details, &hc);
2725 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2726 tc_make_handle(1, 0xfffe), &hc);
2731 htb_update_queue__(netdev, queue_id, &hc);
/* Asks the kernel to delete class 1:(queue_id + 1) for 'queue' on 'netdev'
 * and unlinks the queue from the HTB queue map. */
2736 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2738 struct htb_class *hc = htb_class_cast__(queue);
2739 struct htb *htb = htb_get__(netdev);
2742 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2744 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches statistics for 'queue' into 'stats' by querying kernel class
 * 1:(queue_id + 1) under parent 1:fffe.  Class options are not requested.
 * Returns the result of htb_query_class__(). */
2751 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2752 struct netdev_queue_stats *stats)
2754 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2755 tc_make_handle(1, 0xfffe), NULL, stats);
/* Parses the handle and statistics out of class message 'nlmsg' and, when the
 * handle is 1:N with 1 <= N <= HTB_N_QUEUES, invokes 'cb' with queue ID N - 1,
 * the statistics, and 'aux'.  'netdev' is unused. */
2759 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2760 const struct ofpbuf *nlmsg,
2761 netdev_dump_queue_stats_cb *cb, void *aux)
2763 struct netdev_queue_stats stats;
2764 unsigned int handle, major, minor;
/* Class options are not needed here, so NULL is passed for them. */
2767 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2772 major = tc_get_major(handle);
2773 minor = tc_get_minor(handle);
2774 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2775 (*cb)(minor - 1, &stats, aux);
/* Operations table for the HTB qdisc ("htb" in Linux, "linux-htb" in OVS). */
2780 static const struct tc_ops tc_ops_htb = {
2781 "htb", /* linux_name */
2782 "linux-htb", /* ovs_name */
2783 HTB_N_QUEUES, /* n_queues */
2792 htb_class_get_stats, /* class_get_stats */
2793 htb_class_dump_stats /* class_dump_stats */
2796 /* "linux-hfsc" traffic control class. */
/* Largest HFSC class minor number that is treated as a queue: a class 1:N
 * with 1 <= N <= HFSC_N_QUEUES corresponds to queue ID N - 1. */
2798 #define HFSC_N_QUEUES 0xf000
2806 struct tc_queue tc_queue;
/* Returns the struct hfsc that contains the 'tc' currently attached to
 * 'netdev''s underlying device.
 *
 * NOTE(review): this assumes the attached 'tc' really is embedded in a struct
 * hfsc; nothing here checks that. */
2811 static struct hfsc *
2812 hfsc_get__(const struct netdev *netdev)
2814 struct netdev_dev_linux *netdev_dev;
2815 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2816 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
/* Returns the struct hfsc_class that contains 'queue' as its 'tc_queue'
 * member. */
2819 static struct hfsc_class *
2820 hfsc_class_cast__(const struct tc_queue *queue)
2822 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates a new struct hfsc, initializes its 'tc' with tc_ops_hfsc, records
 * 'max_rate' in it, and attaches it to 'netdev''s underlying device. */
2826 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2828 struct netdev_dev_linux * netdev_dev;
2831 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2832 hfsc = xmalloc(sizeof *hfsc);
2833 tc_init(&hfsc->tc, &tc_ops_hfsc);
2834 hfsc->max_rate = max_rate;
2835 netdev_dev->tc = &hfsc->tc;
/* Records the settings in 'hc' for queue 'queue_id' of 'netdev'.  The queue is
 * looked up with tc_find_queue__(); a newly allocated hfsc_class is inserted
 * into the HFSC queue map when one is needed.  min_rate and max_rate are then
 * copied from 'hc'. */
2839 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2840 const struct hfsc_class *hc)
2844 struct hfsc_class *hcp;
2845 struct tc_queue *queue;
2847 hfsc = hfsc_get__(netdev);
2848 hash = hash_int(queue_id, 0);
2850 queue = tc_find_queue__(netdev, queue_id, hash);
2852 hcp = hfsc_class_cast__(queue);
2854 hcp = xmalloc(sizeof *hcp);
2855 queue = &hcp->tc_queue;
2856 queue->queue_id = queue_id;
/* The same hash is used for lookup above and for insertion here. */
2857 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
2860 hcp->min_rate = hc->min_rate;
2861 hcp->max_rate = hc->max_rate;
/* Decodes the nested HFSC class options 'nl_options' into 'class'.
 *
 * Three service curves (RSC, FSC, USC) are extracted, each at least
 * sizeof(struct tc_service_curve) bytes long.  A warning is logged when:
 *
 *   - any curve has a nonzero 'm1' or 'd' (non-linear curve),
 *   - the RSC and FSC 'm2' values differ, or
 *   - the RSC 'm2' exceeds the USC 'm2'.
 *
 * Otherwise 'class->min_rate' is taken from the FSC 'm2' and
 * 'class->max_rate' from the USC 'm2'. */
2865 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2867 const struct tc_service_curve *rsc, *fsc, *usc;
2868 static const struct nl_policy tca_hfsc_policy[] = {
2870 .type = NL_A_UNSPEC,
2872 .min_len = sizeof(struct tc_service_curve),
2875 .type = NL_A_UNSPEC,
2877 .min_len = sizeof(struct tc_service_curve),
2880 .type = NL_A_UNSPEC,
2882 .min_len = sizeof(struct tc_service_curve),
2885 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
2887 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
2888 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
2889 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
2893 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
2894 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
2895 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
/* Only linear curves (slope 'm2' alone) are accepted. */
2897 if (rsc->m1 != 0 || rsc->d != 0 ||
2898 fsc->m1 != 0 || fsc->d != 0 ||
2899 usc->m1 != 0 || usc->d != 0) {
2900 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2901 "Non-linear service curves are not supported.");
2905 if (rsc->m2 != fsc->m2) {
2906 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2907 "Real-time service curves are not supported ");
2911 if (rsc->m2 > usc->m2) {
2912 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2913 "Min-rate service curve is greater than "
2914 "the max-rate service curve.");
2918 class->min_rate = fsc->m2;
2919 class->max_rate = usc->m2;
/* Parses class message 'tcmsg' with tc_parse_class(), which also receives
 * 'stats'.  A class handle of 1:N with 1 <= N <= HFSC_N_QUEUES maps to queue
 * ID N - 1, which is stored into '*queue_id'.  The class's nested options are
 * decoded into '*options' with hfsc_parse_tca_options__(). */
2924 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2925 struct hfsc_class *options,
2926 struct netdev_queue_stats *stats)
2929 unsigned int handle;
2930 struct nlattr *nl_options;
2932 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2938 unsigned int major, minor;
2940 major = tc_get_major(handle);
2941 minor = tc_get_minor(handle);
2942 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
/* Class minors are 1-based; queue IDs are 0-based. */
2943 *queue_id = minor - 1;
2950 error = hfsc_parse_tca_options__(nl_options, options);
/* Queries the kernel for the class 'handle' with parent 'parent' on 'netdev'
 * using tc_query_class(), parses the reply into 'options' and 'stats' with
 * hfsc_parse_tcmsg__() (no queue ID is requested), and frees the reply. */
2957 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
2958 unsigned int parent, struct hfsc_class *options,
2959 struct netdev_queue_stats *stats)
2962 struct ofpbuf *reply;
2964 error = tc_query_class(netdev, handle, parent, &reply);
2969 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
2970 ofpbuf_delete(reply);
/* Fills in 'class' from the qdisc-level 'details'.  The "max-rate" string is
 * parsed as a decimal number and divided by 8.  The visible fallback computes
 * the rate from netdev_features_to_bps() for the device's current features,
 * also divided by 8.  Both 'class->min_rate' and 'class->max_rate' are set to
 * the resulting value. */
2975 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
2976 struct hfsc_class *class)
2979 const char *max_rate_s;
2981 max_rate_s = shash_find_data(details, "max-rate");
2982 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
/* Only the first feature set (the current one) is requested. */
2987 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2988 max_rate = netdev_features_to_bps(current) / 8;
2991 class->min_rate = max_rate;
2992 class->max_rate = max_rate;
/* Fills in 'class' from the per-queue 'details' keys "min-rate" and
 * "max-rate", each parsed as a decimal number and divided by 8.  The results
 * are clamped so that 1 <= min_rate <= max_rate <= hfsc->max_rate, where
 * 'hfsc->max_rate' is the qdisc-wide limit. */
2996 hfsc_parse_class_details__(struct netdev *netdev,
2997 const struct shash *details,
2998 struct hfsc_class * class)
3000 const struct hfsc *hfsc;
3001 uint32_t min_rate, max_rate;
3002 const char *min_rate_s, *max_rate_s;
3004 hfsc = hfsc_get__(netdev);
3005 min_rate_s = shash_find_data(details, "min-rate");
3006 max_rate_s = shash_find_data(details, "max-rate");
3008 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
/* Never let the minimum rate drop to zero. */
3009 min_rate = MAX(min_rate, 1);
3010 min_rate = MIN(min_rate, hfsc->max_rate);
3012 max_rate = (max_rate_s
3013 ? strtoull(max_rate_s, NULL, 10) / 8
3015 max_rate = MAX(max_rate, min_rate);
3016 max_rate = MIN(max_rate, hfsc->max_rate);
3018 class->min_rate = min_rate;
3019 class->max_rate = max_rate;
3024 /* Create an HFSC qdisc.
3026 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3028 hfsc_setup_qdisc__(struct netdev * netdev)
3030 struct tcmsg *tcmsg;
3031 struct ofpbuf request;
3032 struct tc_hfsc_qopt opt;
/* Remove whatever root qdisc is currently configured before creating the
 * new one. */
3034 tc_del_qdisc(netdev);
/* NLM_F_EXCL | NLM_F_CREATE: create the qdisc, failing if it exists. */
3036 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3037 NLM_F_EXCL | NLM_F_CREATE, &request);
3043 tcmsg->tcm_handle = tc_make_handle(1, 0);
3044 tcmsg->tcm_parent = TC_H_ROOT;
3046 memset(&opt, 0, sizeof opt);
3049 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3050 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3052 return tc_transact(&request, NULL);
3055 /* Create an HFSC class.
3057 * Equivalent to "tc class add dev <dev> parent <parent> classid <handle> hfsc
3058 * sc rate <min_rate> ul rate <max_rate>" */
3060 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3061 unsigned int parent, struct hfsc_class *class)
3065 struct tcmsg *tcmsg;
3066 struct ofpbuf request;
3067 struct tc_service_curve min, max;
/* NLM_F_CREATE without NLM_F_EXCL is requested here. */
3069 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3075 tcmsg->tcm_handle = handle;
3076 tcmsg->tcm_parent = parent;
3080 min.m2 = class->min_rate;
3084 max.m2 = class->max_rate;
3086 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3087 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
/* The same 'min' curve is sent as both the RSC and the FSC; 'max' is sent
 * as the USC. */
3088 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3089 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3090 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3091 nl_msg_end_nested(&request, opt_offset);
3093 error = tc_transact(&request, NULL);
3095 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3096 "min-rate %ubps, max-rate %ubps (%s)",
3097 netdev_get_name(netdev),
3098 tc_get_major(handle), tc_get_minor(handle),
3099 tc_get_major(parent), tc_get_minor(parent),
3100 class->min_rate, class->max_rate, strerror(error));
/* Sets up an HFSC qdisc on 'netdev' with hfsc_setup_qdisc__(), configures
 * class 1:fffe under 1:0 from the qdisc-level 'details', and installs the
 * HFSC state with the resulting max_rate via hfsc_install__(). */
3107 hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3110 struct hfsc_class class;
3112 error = hfsc_setup_qdisc__(netdev);
3118 hfsc_parse_qdisc_details__(netdev, details, &class);
3119 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3120 tc_make_handle(1, 0), &class);
3126 hfsc_install__(netdev, class.max_rate);
/* Loads the HFSC configuration of 'netdev' from the kernel: queries class
 * 1:fffe for the qdisc-wide options, installs the HFSC state with its
 * max_rate, then dumps the classes and records each one that parses
 * successfully with hfsc_update_queue__().  'nlmsg' is unused. */
3131 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3134 struct nl_dump dump;
3135 struct hfsc_class hc;
3138 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3139 hfsc_install__(netdev, hc.max_rate);
3141 if (!start_queue_dump(netdev, &dump)) {
3145 while (nl_dump_next(&dump, &msg)) {
3146 unsigned int queue_id;
/* Messages that fail to parse are not recorded. */
3148 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3149 hfsc_update_queue__(netdev, queue_id, &hc);
3153 nl_dump_done(&dump);
/* Tears down the HFSC state that embeds 'tc': walks every queue in the queue
 * map (using the _SAFE iterator because entries are removed during the walk)
 * and unlinks it. */
3158 hfsc_tc_destroy(struct tc *tc)
3161 struct hfsc_class *hc, *next;
3163 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3165 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3166 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Adds "max-rate" to 'details', formatted as 8 times the stored
 * 'hfsc->max_rate' (the inverse of the division by 8 applied when parsing). */
3175 hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3177 const struct hfsc *hfsc;
3178 hfsc = hfsc_get__(netdev);
3179 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
/* Reconfigures the qdisc-wide class 1:fffe (parent 1:0) of 'netdev' from
 * 'details' and stores the new max_rate in the device's HFSC state. */
3184 hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3187 struct hfsc_class class;
3189 hfsc_parse_qdisc_details__(netdev, details, &class);
3190 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3191 tc_make_handle(1, 0), &class);
3194 hfsc_get__(netdev)->max_rate = class.max_rate;
/* Adds the settings of 'queue' to 'details'.  "min-rate" and "max-rate" are
 * reported as 8 times the stored values.  "max-rate" is only added when it
 * differs from min_rate.  'netdev' is unused. */
3201 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3202 const struct tc_queue *queue, struct shash *details)
3204 const struct hfsc_class *hc;
3206 hc = hfsc_class_cast__(queue);
3207 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3208 if (hc->min_rate != hc->max_rate) {
3209 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
/* Configures queue 'queue_id' of 'netdev' from 'details': parses them, sets up
 * kernel class 1:(queue_id + 1) under parent 1:fffe, and records the settings
 * with hfsc_update_queue__(). */
3215 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3216 const struct shash *details)
3219 struct hfsc_class class;
3221 error = hfsc_parse_class_details__(netdev, details, &class);
3226 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3227 tc_make_handle(1, 0xfffe), &class);
3232 hfsc_update_queue__(netdev, queue_id, &class);
/* Asks the kernel to delete class 1:(queue_id + 1) for 'queue' on 'netdev'
 * and unlinks the queue from the HFSC queue map. */
3237 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3241 struct hfsc_class *hc;
3243 hc = hfsc_class_cast__(queue);
3244 hfsc = hfsc_get__(netdev);
3246 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3248 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches statistics for 'queue' into 'stats' by querying kernel class
 * 1:(queue_id + 1) under parent 1:fffe.  Class options are not requested.
 * Returns the result of hfsc_query_class__(). */
3255 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3256 struct netdev_queue_stats *stats)
3258 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3259 tc_make_handle(1, 0xfffe), NULL, stats);
/* Parses the handle and statistics out of class message 'nlmsg' and, when the
 * handle is 1:N with 1 <= N <= HFSC_N_QUEUES, invokes 'cb' with queue ID
 * N - 1, the statistics, and 'aux'.  'netdev' is unused. */
3263 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3264 const struct ofpbuf *nlmsg,
3265 netdev_dump_queue_stats_cb *cb, void *aux)
3267 struct netdev_queue_stats stats;
3268 unsigned int handle, major, minor;
/* Class options are not needed here, so NULL is passed for them. */
3271 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3276 major = tc_get_major(handle);
3277 minor = tc_get_minor(handle);
3278 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3279 (*cb)(minor - 1, &stats, aux);
/* Operations table for the HFSC qdisc ("hfsc" in Linux, "linux-hfsc" in
 * OVS). */
3284 static const struct tc_ops tc_ops_hfsc = {
3285 "hfsc", /* linux_name */
3286 "linux-hfsc", /* ovs_name */
3287 HFSC_N_QUEUES, /* n_queues */
3288 hfsc_tc_install, /* tc_install */
3289 hfsc_tc_load, /* tc_load */
3290 hfsc_tc_destroy, /* tc_destroy */
3291 hfsc_qdisc_get, /* qdisc_get */
3292 hfsc_qdisc_set, /* qdisc_set */
3293 hfsc_class_get, /* class_get */
3294 hfsc_class_set, /* class_set */
3295 hfsc_class_delete, /* class_delete */
3296 hfsc_class_get_stats, /* class_get_stats */
3297 hfsc_class_dump_stats /* class_dump_stats */
3300 /* "linux-default" traffic control class.
3302 * This class represents the default, unnamed Linux qdisc. It corresponds to
3303 * the "" (empty string) QoS type in the OVS database. */
/* Attaches a 'struct tc' initialized with tc_ops_default to 'netdev''s
 * underlying device. */
3306 default_install__(struct netdev *netdev)
3308 struct netdev_dev_linux *netdev_dev =
3309 netdev_dev_linux_cast(netdev_get_dev(netdev));
/* 'tc' has static storage duration, so the pointer persists across calls
 * to this function. */
3310 static struct tc *tc;
3313 tc = xmalloc(sizeof *tc);
3314 tc_init(tc, &tc_ops_default);
3316 netdev_dev->tc = tc;
/* tc_install hook for the default qdisc: delegates to default_install__().
 * 'details' is unused. */
3320 default_tc_install(struct netdev *netdev,
3321 const struct shash *details OVS_UNUSED)
3323 default_install__(netdev);
/* tc_load hook for the default qdisc: delegates to default_install__().
 * 'nlmsg' is unused. */
3328 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3330 default_install__(netdev);
/* Operations table for the default qdisc.  It has no Linux qdisc name, and
 * the destroy, qdisc and class hooks listed below are all absent (NULL). */
3334 static const struct tc_ops tc_ops_default = {
3335 NULL, /* linux_name */
3340 NULL, /* tc_destroy */
3341 NULL, /* qdisc_get */
3342 NULL, /* qdisc_set */
3343 NULL, /* class_get */
3344 NULL, /* class_set */
3345 NULL, /* class_delete */
3346 NULL, /* class_get_stats */
3347 NULL /* class_dump_stats */
3350 /* "linux-other" traffic control class.
/* tc_load hook for "linux-other": attaches a 'struct tc' initialized with
 * tc_ops_other to 'netdev''s underlying device.  'nlmsg' is unused. */
3355 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3357 struct netdev_dev_linux *netdev_dev =
3358 netdev_dev_linux_cast(netdev_get_dev(netdev));
/* 'tc' has static storage duration, so the pointer persists across calls
 * to this function. */
3359 static struct tc *tc;
3362 tc = xmalloc(sizeof *tc);
3363 tc_init(tc, &tc_ops_other);
3365 netdev_dev->tc = tc;
/* Operations table for "linux-other".  It has no Linux qdisc name and no
 * tc_install hook, and the destroy, qdisc and class hooks listed below are
 * all absent (NULL). */
3369 static const struct tc_ops tc_ops_other = {
3370 NULL, /* linux_name */
3371 "linux-other", /* ovs_name */
3373 NULL, /* tc_install */
3375 NULL, /* tc_destroy */
3376 NULL, /* qdisc_get */
3377 NULL, /* qdisc_set */
3378 NULL, /* class_get */
3379 NULL, /* class_set */
3380 NULL, /* class_delete */
3381 NULL, /* class_get_stats */
3382 NULL /* class_dump_stats */
3385 /* Traffic control. */
3387 /* Number of kernel "tc" ticks per second. */
3388 static double ticks_per_s;
3390 /* Number of kernel "jiffies" per second. This is used for the purpose of
3391 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3392 * one jiffy's worth of data.
3394 * There are two possibilities here:
3396 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3397 * approximate range of 100 to 1024. That means that we really need to
3398 * make sure that the qdisc can buffer that much data.
3400 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3401 * has finely granular timers and there's no need to fudge additional room
3402 * for buffers. (There's no extra effort needed to implement that: the
3403 * large 'buffer_hz' is used as a divisor, so practically any number will
3404 * come out as 0 in the division. Small integer results in the case of
3405 * really high dividends won't have any real effect anyhow.)
3407 static unsigned int buffer_hz;
3409 /* Returns tc handle 'major':'minor'. */
3411 tc_make_handle(unsigned int major, unsigned int minor)
/* 'major' is shifted into the upper 16 bits before TC_H_MAKE() combines it
 * with 'minor'. */
3413 return TC_H_MAKE(major << 16, minor);
3416 /* Returns the major number from 'handle'. */
3418 tc_get_major(unsigned int handle)
/* Shift the part selected by TC_H_MAJ() back down by 16 bits, the inverse
 * of tc_make_handle(). */
3420 return TC_H_MAJ(handle) >> 16;
3423 /* Returns the minor number from 'handle'. */
3425 tc_get_minor(unsigned int handle)
/* Unlike the major number, the minor part needs no shifting. */
3427 return TC_H_MIN(handle);
/* Starts a traffic-control Netlink request for 'netdev' in 'request'.
 *
 * Looks up the device's ifindex, initializes 'request' with a Netlink header
 * of the given 'type' and flags NLM_F_REQUEST | 'flags', and appends a zeroed
 * struct tcmsg with tcm_family AF_UNSPEC and tcm_ifindex set.  Callers in this
 * file use the returned pointer to fill in tcm_handle and tcm_parent. */
3430 static struct tcmsg *
3431 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3432 struct ofpbuf *request)
3434 struct tcmsg *tcmsg;
3438 error = get_ifindex(netdev, &ifindex);
3443 ofpbuf_init(request, 512);
3444 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3445 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3446 tcmsg->tcm_family = AF_UNSPEC;
3447 tcmsg->tcm_ifindex = ifindex;
3448 /* Caller should fill in tcmsg->tcm_handle. */
3449 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' over 'rtnl_sock' with nl_sock_transact(), passing 'replyp'
 * through for the reply, and then uninitializes 'request' whether or not the
 * transaction succeeded. */
3455 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3457 int error = nl_sock_transact(rtnl_sock, request, replyp);
3458 ofpbuf_uninit(request);
3462 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3463 * policing configuration.
3465 * This function is equivalent to running the following when 'add' is true:
3466 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3468 * This function is equivalent to running the following when 'add' is false:
3469 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3471 * The configuration and stats may be seen with the following command:
3472 * /sbin/tc -s qdisc show dev <devname>
3474 * Returns 0 if successful, otherwise a positive errno value.
3477 tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3479 struct ofpbuf request;
3480 struct tcmsg *tcmsg;
/* Creation flags are only meaningful for the "add" case. */
3482 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3483 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3485 tcmsg = tc_make_request(netdev, type, flags, &request);
3489 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3490 tcmsg->tcm_parent = TC_H_INGRESS;
3491 nl_msg_put_string(&request, TCA_KIND, "ingress");
/* An empty TCA_OPTIONS attribute is appended. */
3492 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3494 error = tc_transact(&request, NULL);
3496 /* If we're deleting the qdisc, don't worry about some of the
3497 * error conditions. */
3498 if (!add && (error == ENOENT || error == EINVAL)) {
3507 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3510 * This function is equivalent to running:
3511 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3512 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3515 * The configuration and stats may be seen with the following command:
3516 * /sbin/tc -s filter show <devname> eth0 parent ffff:
3518 * Returns 0 if successful, otherwise a positive errno value.
3521 tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3523 struct tc_police tc_police;
3524 struct ofpbuf request;
3525 struct tcmsg *tcmsg;
3526 size_t basic_offset;
3527 size_t police_offset;
3531 memset(&tc_police, 0, sizeof tc_police);
3532 tc_police.action = TC_POLICE_SHOT;
3533 tc_police.mtu = mtu;
3534 tc_fill_rate(&tc_police.rate, kbits_rate/8 * 1000, mtu);
3535 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3536 kbits_burst * 1024);
3538 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3539 NLM_F_EXCL | NLM_F_CREATE, &request);
3543 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3544 tcmsg->tcm_info = tc_make_handle(49,
3545 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3547 nl_msg_put_string(&request, TCA_KIND, "basic");
3548 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3549 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3550 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3551 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3552 nl_msg_end_nested(&request, police_offset);
3553 nl_msg_end_nested(&request, basic_offset);
3555 error = tc_transact(&request, NULL);
3566 /* The values in psched are not individually very meaningful, but they are
3567 * important. The tables below show some values seen in the wild.
3571 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3572 * (Before that, there are hints that it was 1000000000.)
3574 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3578 * -----------------------------------
3579 * [1] 000c8000 000f4240 000f4240 00000064
3580 * [2] 000003e8 00000400 000f4240 3b9aca00
3581 * [3] 000003e8 00000400 000f4240 3b9aca00
3582 * [4] 000003e8 00000400 000f4240 00000064
3583 * [5] 000003e8 00000040 000f4240 3b9aca00
3584 * [6] 000003e8 00000040 000f4240 000000f9
3586 * a b c d ticks_per_s buffer_hz
3587 * ------- --------- ---------- ------------- ----------- -------------
3588 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3589 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3590 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3591 * [4] 1,000 1,024 1,000,000 100 976,562 100
3592 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3593 * [6] 1,000 64 1,000,000 249 15,625,000 249
3595 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3596 * [2] 2.6.26-1-686-bigmem from Debian lenny
3597 * [3] 2.6.26-2-sparc64 from Debian lenny
3598 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3599 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3600 * [6] 2.6.34 from kernel.org on KVM
3602 static const char fn[] = "/proc/net/psched";
3603 unsigned int a, b, c, d;
3609 stream = fopen(fn, "r");
3611 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3615 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3616 VLOG_WARN("%s: read failed", fn);
3620 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3624 VLOG_WARN("%s: invalid scheduler parameters", fn);
3628 ticks_per_s = (double) a * c / b;
3632 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3635 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3638 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3639 * rate of 'rate' bytes per second. */
3641 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
/* Multiply in floating point: 'rate * ticks' evaluated in unsigned int can
 * exceed the type's range (for example 1.25e9 bytes/s times a handful of
 * ticks) and would silently wrap before the division. */
3646 return ((double) rate * ticks) / ticks_per_s;
3649 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3650 * rate of 'rate' bytes per second. */
3652 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
/* A zero 'rate' yields 0 instead of dividing by zero.  'ticks_per_s' is
 * converted to unsigned long long before the multiplication. */
3657 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3660 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3661 * a transmission rate of 'rate' bytes per second. */
3663 tc_buffer_per_jiffy(unsigned int rate)
/* Integer division, so a very large 'buffer_hz' yields 0.
 * NOTE(review): relies on 'buffer_hz' being nonzero at this point; confirm
 * that it is always initialized before this division. */
3668 return rate / buffer_hz;
3671 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3672 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3673 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3674 * stores NULL into it if it is absent.
3676 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3679 * Returns 0 if successful, otherwise a positive errno value. */
3681 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3682 struct nlattr **options)
/* TCA_KIND is mandatory; TCA_OPTIONS may be missing. */
3684 static const struct nl_policy tca_policy[] = {
3685 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3686 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3688 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
/* Attributes start after the Netlink header and the struct tcmsg. */
3690 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3691 tca_policy, ta, ARRAY_SIZE(ta))) {
3692 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3697 *kind = nl_attr_get_string(ta[TCA_KIND]);
3701 *options = ta[TCA_OPTIONS];
3716 /* Given Netlink 'msg' that describes a class, extracts its class handle
3717 * (the message's tcm_handle) into '*handlep', its TCA_OPTIONS attribute
3718 * into '*options', and its queue statistics into '*stats'. Any of the output
3719 * arguments may be null.
3721 * Returns 0 if successful, otherwise a positive errno value. */
3723 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3724 struct nlattr **options, struct netdev_queue_stats *stats)
/* Both TCA_OPTIONS and TCA_STATS2 are mandatory in a class message. */
3726 static const struct nl_policy tca_policy[] = {
3727 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3728 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3730 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
/* Attributes start after the Netlink header and the struct tcmsg. */
3732 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3733 tca_policy, ta, ARRAY_SIZE(ta))) {
3734 VLOG_WARN_RL(&rl, "failed to parse class message");
3739 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3740 *handlep = tc->tcm_handle;
3744 *options = ta[TCA_OPTIONS];
3748 const struct gnet_stats_queue *gsq;
3749 struct gnet_stats_basic gsb;
3751 static const struct nl_policy stats_policy[] = {
3752 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3753 .min_len = sizeof gsb },
3754 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3755 .min_len = sizeof *gsq },
3757 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3759 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3760 sa, ARRAY_SIZE(sa))) {
3761 VLOG_WARN_RL(&rl, "failed to parse class stats");
3765 /* Alignment issues screw up the length of struct gnet_stats_basic on
3766 * some arch/bitsize combinations. Newer versions of Linux have a
3767 * struct gnet_stats_basic_packed, but we can't depend on that. The
3768 * easiest thing to do is just to make a copy. */
3769 memset(&gsb, 0, sizeof gsb);
3770 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3771 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3772 stats->tx_bytes = gsb.bytes;
3773 stats->tx_packets = gsb.packets;
3775 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
/* Queue drops are reported as transmit errors. */
3776 stats->tx_errors = gsq->drops;
3786 memset(stats, 0, sizeof *stats);
3791 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3794 tc_query_class(const struct netdev *netdev,
3795 unsigned int handle, unsigned int parent,
3796 struct ofpbuf **replyp)
3798 struct ofpbuf request;
3799 struct tcmsg *tcmsg;
/* RTM_GETTCLASS request; the reply is handed back through 'replyp' by
 * tc_transact() below. */
3802 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3806 tcmsg->tcm_handle = handle;
3807 tcmsg->tcm_parent = parent;
3809 error = tc_transact(&request, replyp);
3811 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3812 netdev_get_name(netdev),
3813 tc_get_major(handle), tc_get_minor(handle),
3814 tc_get_major(parent), tc_get_minor(parent),
3820 /* Equivalent to "tc class del dev <name> handle <handle>". */
3822 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3824 struct ofpbuf request;
3825 struct tcmsg *tcmsg;
3828 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
/* The class is identified by 'handle' alone; the parent is left as 0. */
3832 tcmsg->tcm_handle = handle;
3833 tcmsg->tcm_parent = 0;
/* No reply is requested. */
3835 error = tc_transact(&request, NULL);
3837 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3838 netdev_get_name(netdev),
3839 tc_get_major(handle), tc_get_minor(handle),
3845 /* Equivalent to "tc qdisc del dev <name> root". */
3847 tc_del_qdisc(struct netdev *netdev)
3849 struct netdev_dev_linux *netdev_dev =
3850 netdev_dev_linux_cast(netdev_get_dev(netdev));
3851 struct ofpbuf request;
3852 struct tcmsg *tcmsg;
3855 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
3859 tcmsg->tcm_handle = tc_make_handle(1, 0);
3860 tcmsg->tcm_parent = TC_H_ROOT;
3862 error = tc_transact(&request, NULL);
3863 if (error == EINVAL) {
3864 /* EINVAL probably means that the default qdisc was in use, in which
3865 * case we've accomplished our purpose. */
/* Once the kernel qdisc is gone, drop the cached in-memory state as well.
 * The tc_destroy hook is optional, so it is checked before being called. */
3868 if (!error && netdev_dev->tc) {
3869 if (netdev_dev->tc->ops->tc_destroy) {
3870 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3872 netdev_dev->tc = NULL;
3877 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3878 * kernel to determine what they are. Returns 0 if successful, otherwise a
3879 * positive errno value. */
3881 tc_query_qdisc(const struct netdev *netdev)
3883 struct netdev_dev_linux *netdev_dev =
3884 netdev_dev_linux_cast(netdev_get_dev(netdev));
3885 struct ofpbuf request, *qdisc;
3886 const struct tc_ops *ops;
3887 struct tcmsg *tcmsg;
/* A non-null 'tc' means the qdisc state has already been determined. */
3891 if (netdev_dev->tc) {
3895 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3896 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3897 * 2.6.35 without that fix backported to it.
3899 * To avoid the OOPS, we must not make a request that would attempt to dump
3900 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3901 * few others. There are a few ways that I can see to do this, but most of
3902 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3903 * technique chosen here is to assume that any non-default qdisc that we
3904 * create will have a class with handle 1:0. The built-in qdiscs only have
3905 * a class with handle 0:0.
3907 * We could check for Linux 2.6.35+ and use a more straightforward method
3909 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
3913 tcmsg->tcm_handle = tc_make_handle(1, 0);
3914 tcmsg->tcm_parent = 0;
3916 /* Figure out what tc class to instantiate. */
3917 error = tc_transact(&request, &qdisc);
3921 error = tc_parse_qdisc(qdisc, &kind, NULL);
3923 ops = &tc_ops_other;
/* Map the kernel's qdisc name to the matching tc_ops; names that are not
 * recognized are logged and handled by tc_ops_other. */
3925 ops = tc_lookup_linux_name(kind);
3927 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3928 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3930 ops = &tc_ops_other;
3933 } else if (error == ENOENT) {
3934 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3935 * other entity that doesn't have a handle 1:0. We will assume
3936 * that it's the system default qdisc. */
3937 ops = &tc_ops_default;
3940 /* Who knows? Maybe the device got deleted. */
3941 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3942 netdev_get_name(netdev), strerror(error));
3943 ops = &tc_ops_other;
3946 /* Instantiate it. */
/* The cast drops 'const' because tc_load takes a non-const netdev.  The
 * assertion requires that tc_load attached a 'tc' exactly when it
 * reported success. */
3947 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
3948 assert((load_error == 0) == (netdev_dev->tc != NULL));
3949 ofpbuf_delete(qdisc);
/* An error from the query takes precedence over one from loading. */
3951 return error ? error : load_error;
3954 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3955 approximate the time to transmit packets of various lengths. For an MTU of
3956 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3957 represents two possible packet lengths; for a MTU of 513 through 1024, four
3958 possible lengths; and so on.
3960 Returns, for the specified 'mtu', the number of bits that packet lengths
3961 need to be shifted right to fit within such a 256-entry table. */
3963 tc_calc_cell_log(unsigned int mtu)
3968 mtu = ETH_PAYLOAD_MAX;
3970 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
3972 for (cell_log = 0; mtu >= 256; cell_log++) {
3979 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
3982 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
3984 memset(rate, 0, sizeof *rate);
3985 rate->cell_log = tc_calc_cell_log(mtu);
3986 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3987 /* rate->cell_align = 0; */ /* distro headers. */
3988 rate->mpu = ETH_TOTAL_MIN;
3992 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
3993 * attribute of the specified "type".
3995 * See tc_calc_cell_log() above for a description of "rtab"s. */
3997 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
4002 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4003 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4004 unsigned packet_size = (i + 1) << rate->cell_log;
4005 if (packet_size < rate->mpu) {
4006 packet_size = rate->mpu;
4008 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.) */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    /* The burst is never allowed to drop below one jiffy's worth of traffic
     * plus one MTU. */
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
}
4023 /* Linux-only functions declared in netdev-linux.h */
4025 /* Returns a fd for an AF_INET socket or a negative errno value. */
4027 netdev_linux_get_af_inet_sock(void)
4029 int error = netdev_linux_init();
4030 return error ? -error : af_inet_sock;
4033 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4034 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4036 netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4037 const char *flag_name, bool enable)
4039 const char *netdev_name = netdev_get_name(netdev);
4040 struct ethtool_value evalue;
4044 memset(&evalue, 0, sizeof evalue);
4045 error = netdev_linux_do_ethtool(netdev_name,
4046 (struct ethtool_cmd *)&evalue,
4047 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4052 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4053 error = netdev_linux_do_ethtool(netdev_name,
4054 (struct ethtool_cmd *)&evalue,
4055 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
4060 memset(&evalue, 0, sizeof evalue);
4061 error = netdev_linux_do_ethtool(netdev_name,
4062 (struct ethtool_cmd *)&evalue,
4063 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4068 if (new_flags != evalue.data) {
4069 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4070 "device %s failed", enable ? "enable" : "disable",
4071 flag_name, netdev_name);
4078 /* Utility functions. */
4080 /* Copies 'src' into 'dst', performing format conversion in the process. */
4082 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4083 const struct rtnl_link_stats *src)
4085 dst->rx_packets = src->rx_packets;
4086 dst->tx_packets = src->tx_packets;
4087 dst->rx_bytes = src->rx_bytes;
4088 dst->tx_bytes = src->tx_bytes;
4089 dst->rx_errors = src->rx_errors;
4090 dst->tx_errors = src->tx_errors;
4091 dst->rx_dropped = src->rx_dropped;
4092 dst->tx_dropped = src->tx_dropped;
4093 dst->multicast = src->multicast;
4094 dst->collisions = src->collisions;
4095 dst->rx_length_errors = src->rx_length_errors;
4096 dst->rx_over_errors = src->rx_over_errors;
4097 dst->rx_crc_errors = src->rx_crc_errors;
4098 dst->rx_frame_errors = src->rx_frame_errors;
4099 dst->rx_fifo_errors = src->rx_fifo_errors;
4100 dst->rx_missed_errors = src->rx_missed_errors;
4101 dst->tx_aborted_errors = src->tx_aborted_errors;
4102 dst->tx_carrier_errors = src->tx_carrier_errors;
4103 dst->tx_fifo_errors = src->tx_fifo_errors;
4104 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4105 dst->tx_window_errors = src->tx_window_errors;
4109 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4111 /* Policy for RTNLGRP_LINK messages.
4113 * There are *many* more fields in these messages, but currently we only
4114 * care about these fields. */
4115 static const struct nl_policy rtnlgrp_link_policy[] = {
4116 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4117 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4118 .min_len = sizeof(struct rtnl_link_stats) },
4121 struct ofpbuf request;
4122 struct ofpbuf *reply;
4123 struct ifinfomsg *ifi;
4124 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
4127 ofpbuf_init(&request, 0);
4128 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4129 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4130 ifi->ifi_family = PF_UNSPEC;
4131 ifi->ifi_index = ifindex;
4132 error = nl_sock_transact(rtnl_sock, &request, &reply);
4133 ofpbuf_uninit(&request);
4138 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4139 rtnlgrp_link_policy,
4140 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4141 ofpbuf_delete(reply);
4145 if (!attrs[IFLA_STATS]) {
4146 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4147 ofpbuf_delete(reply);
4151 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4153 ofpbuf_delete(reply);
4159 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4161 static const char fn[] = "/proc/net/dev";
4166 stream = fopen(fn, "r");
4168 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4173 while (fgets(line, sizeof line, stream)) {
4176 #define X64 "%"SCNu64
4179 X64 X64 X64 X64 X64 X64 X64 "%*u"
4180 X64 X64 X64 X64 X64 X64 X64 "%*u",
4186 &stats->rx_fifo_errors,
4187 &stats->rx_frame_errors,
4193 &stats->tx_fifo_errors,
4195 &stats->tx_carrier_errors) != 15) {
4196 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4197 } else if (!strcmp(devname, netdev_name)) {
4198 stats->rx_length_errors = UINT64_MAX;
4199 stats->rx_over_errors = UINT64_MAX;
4200 stats->rx_crc_errors = UINT64_MAX;
4201 stats->rx_missed_errors = UINT64_MAX;
4202 stats->tx_aborted_errors = UINT64_MAX;
4203 stats->tx_heartbeat_errors = UINT64_MAX;
4204 stats->tx_window_errors = UINT64_MAX;
4210 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
4216 get_flags(const struct netdev_dev *dev, unsigned int *flags)
4222 error = netdev_linux_do_ioctl(dev->name, &ifr, SIOCGIFFLAGS,
4225 *flags = ifr.ifr_flags;
4231 set_flags(struct netdev *netdev, unsigned int flags)
4235 ifr.ifr_flags = flags;
4236 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
4241 do_get_ifindex(const char *netdev_name)
4245 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4246 COVERAGE_INC(netdev_get_ifindex);
4247 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4248 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4249 netdev_name, strerror(errno));
4252 return ifr.ifr_ifindex;
4256 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4258 struct netdev_dev_linux *netdev_dev =
4259 netdev_dev_linux_cast(netdev_get_dev(netdev_));
4261 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4262 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4266 netdev_dev->cache_valid |= VALID_IFINDEX;
4267 netdev_dev->ifindex = ifindex;
4269 *ifindexp = netdev_dev->ifindex;
4274 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4279 memset(&ifr, 0, sizeof ifr);
4280 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4281 COVERAGE_INC(netdev_get_hwaddr);
4282 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4283 /* ENODEV probably means that a vif disappeared asynchronously and
4284 * hasn't been removed from the database yet, so reduce the log level
4285 * to INFO for that case. */
4286 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4287 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4288 netdev_name, strerror(errno));
4291 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4292 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4293 VLOG_WARN("%s device has unknown hardware address family %d",
4294 netdev_name, hwaddr_family);
4296 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4301 set_etheraddr(const char *netdev_name, int hwaddr_family,
4302 const uint8_t mac[ETH_ADDR_LEN])
4306 memset(&ifr, 0, sizeof ifr);
4307 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4308 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4309 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4310 COVERAGE_INC(netdev_set_hwaddr);
4311 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4312 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4313 netdev_name, strerror(errno));
4320 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4321 int cmd, const char *cmd_name)
4325 memset(&ifr, 0, sizeof ifr);
4326 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4327 ifr.ifr_data = (caddr_t) ecmd;
4330 COVERAGE_INC(netdev_ethtool);
4331 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4334 if (errno != EOPNOTSUPP) {
4335 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4336 "failed: %s", cmd_name, name, strerror(errno));
4338 /* The device doesn't support this operation. That's pretty
4339 * common, so there's no point in logging anything. */
4346 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4347 const char *cmd_name)
4349 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4350 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4351 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4359 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4360 int cmd, const char *cmd_name)
4365 ifr.ifr_addr.sa_family = AF_INET;
4366 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4368 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4369 *ip = sin->sin_addr;
/* Returns an AF_PACKET raw socket or a negative errno value.
 *
 * The socket is created on the first call and cached in a function-local
 * static.  A creation failure is cached too, so later calls return the same
 * negative errno value without retrying. */
static int
af_packet_sock(void)
{
    /* INT_MIN marks "not yet attempted"; it is neither a valid fd nor a
     * negated errno value. */
    static int sock = INT_MIN;

    if (sock == INT_MIN) {
        sock = socket(AF_PACKET, SOCK_RAW, 0);
        if (sock >= 0) {
            set_nonblocking(sock);
        } else {
            sock = -errno;
            VLOG_ERR("failed to create packet socket: %s", strerror(errno));
        }
    }

    return sock;
}