2 * Copyright (c) 2009, 2010, 2011 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
24 #include <arpa/inet.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_sched.h>
33 #include <linux/rtnetlink.h>
34 #include <linux/sockios.h>
35 #include <linux/version.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <netpacket/packet.h>
40 #include <net/ethernet.h>
42 #include <linux/if_tunnel.h>
43 #include <net/if_arp.h>
44 #include <net/if_packet.h>
45 #include <net/route.h>
46 #include <netinet/in.h>
53 #include "dpif-linux.h"
54 #include "dynamic-string.h"
55 #include "fatal-signal.h"
58 #include "netdev-provider.h"
59 #include "netdev-vport.h"
61 #include "netlink-socket.h"
63 #include "openflow/openflow.h"
65 #include "poll-loop.h"
66 #include "rtnetlink.h"
67 #include "rtnetlink-link.h"
68 #include "socket-util.h"
74 VLOG_DEFINE_THIS_MODULE(netdev_linux);
76 COVERAGE_DEFINE(netdev_get_vlan_vid);
77 COVERAGE_DEFINE(netdev_set_policing);
78 COVERAGE_DEFINE(netdev_arp_lookup);
79 COVERAGE_DEFINE(netdev_get_ifindex);
80 COVERAGE_DEFINE(netdev_get_hwaddr);
81 COVERAGE_DEFINE(netdev_set_hwaddr);
82 COVERAGE_DEFINE(netdev_ethtool);
84 /* These were introduced in Linux 2.6.14, so they might be missing if we have
86 #ifndef ADVERTISED_Pause
87 #define ADVERTISED_Pause (1 << 13)
89 #ifndef ADVERTISED_Asym_Pause
90 #define ADVERTISED_Asym_Pause (1 << 14)
93 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
96 #define TC_RTAB_SIZE 1024
99 static struct rtnetlink_notifier netdev_linux_cache_notifier;
100 static int cache_notifier_refcount;
103 VALID_IFINDEX = 1 << 0,
104 VALID_ETHERADDR = 1 << 1,
108 VALID_CARRIER = 1 << 5,
109 VALID_IS_PSEUDO = 1 << 6, /* Represents is_internal and is_tap. */
110 VALID_POLICING = 1 << 7,
111 VALID_HAVE_VPORT_STATS = 1 << 8
119 /* Traffic control. */
121 /* An instance of a traffic control class. Always associated with a particular
124 * Each TC implementation subclasses this with whatever additional data it
127 const struct tc_ops *ops;
128 struct hmap queues; /* Contains "struct tc_queue"s.
129 * Read by generic TC layer.
130 * Written only by TC implementation. */
133 /* One traffic control queue.
135 * Each TC implementation subclasses this with whatever additional data it
138 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
139 unsigned int queue_id; /* OpenFlow queue ID. */
142 /* A particular kind of traffic control. Each implementation generally maps to
143 * one particular Linux qdisc class.
145 * The functions below return 0 if successful or a positive errno value on
146 * failure, except where otherwise noted. All of them must be provided, except
147 * where otherwise noted. */
149 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
150 * This is null for tc_ops_default and tc_ops_other, for which there are no
151 * appropriate values. */
152 const char *linux_name;
154 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
155 const char *ovs_name;
157 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
158 * queues. The queues are numbered 0 through n_queues - 1. */
159 unsigned int n_queues;
161 /* Called to install this TC class on 'netdev'. The implementation should
162 * make the Netlink calls required to set up 'netdev' with the right qdisc
163 * and configure it according to 'details'. The implementation may assume
164 * that the current qdisc is the default; that is, there is no need for it
165 * to delete the current qdisc before installing itself.
167 * The contents of 'details' should be documented as valid for 'ovs_name'
168 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
169 * (which is built as ovs-vswitchd.conf.db(8)).
171 * This function must return 0 if and only if it sets 'netdev->tc' to an
172 * initialized 'struct tc'.
174 * (This function is null for tc_ops_other, which cannot be installed. For
175 * other TC classes it should always be nonnull.) */
176 int (*tc_install)(struct netdev *netdev, const struct shash *details);
178 /* Called when the netdev code determines (through a Netlink query) that
179 * this TC class's qdisc is installed on 'netdev', but we didn't install
180 * it ourselves and so don't know any of the details.
182 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
183 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
184 * implementation should parse the other attributes of 'nlmsg' as
185 * necessary to determine its configuration. If necessary it should also
186 * use Netlink queries to determine the configuration of queues on
189 * This function must return 0 if and only if it sets 'netdev->tc' to an
190 * initialized 'struct tc'. */
191 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
193 /* Destroys the data structures allocated by the implementation as part of
194 * 'tc'. (This includes destroying 'tc->queues' by calling
197 * The implementation should not need to perform any Netlink calls. If
198 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
199 * (But it may not be desirable.)
201 * This function may be null if 'tc' is trivial. */
202 void (*tc_destroy)(struct tc *tc);
204 /* Retrieves details of 'netdev->tc' configuration into 'details'.
206 * The implementation should not need to perform any Netlink calls, because
207 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
208 * cached the configuration.
210 * The contents of 'details' should be documented as valid for 'ovs_name'
211 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
212 * (which is built as ovs-vswitchd.conf.db(8)).
214 * This function may be null if 'tc' is not configurable.
216 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
218 /* Reconfigures 'netdev->tc' according to 'details', performing any
219 * required Netlink calls to complete the reconfiguration.
221 * The contents of 'details' should be documented as valid for 'ovs_name'
222 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
223 * (which is built as ovs-vswitchd.conf.db(8)).
225 * This function may be null if 'tc' is not configurable.
227 int (*qdisc_set)(struct netdev *, const struct shash *details);
229 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
230 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
232 * The contents of 'details' should be documented as valid for 'ovs_name'
233 * in the "other_config" column in the "Queue" table in
234 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
236 * The implementation should not need to perform any Netlink calls, because
237 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
238 * cached the queue configuration.
240 * This function may be null if 'tc' does not have queues ('n_queues' is
242 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
243 struct shash *details);
245 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
246 * 'details', performing any required Netlink calls to complete the
247 * reconfiguration. The caller ensures that 'queue_id' is less than
250 * The contents of 'details' should be documented as valid for 'ovs_name'
251 * in the "other_config" column in the "Queue" table in
252 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
254 * This function may be null if 'tc' does not have queues or its queues are
255 * not configurable. */
256 int (*class_set)(struct netdev *, unsigned int queue_id,
257 const struct shash *details);
259 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
260 * tc_queue's within 'netdev->tc->queues'.
262 * This function may be null if 'tc' does not have queues or its queues
263 * cannot be deleted. */
264 int (*class_delete)(struct netdev *, struct tc_queue *queue);
266 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
267 * 'struct tc_queue's within 'netdev->tc->queues'.
269 * On success, initializes '*stats'.
271 * This function may be null if 'tc' does not have queues or if it cannot
272 * report queue statistics. */
273 int (*class_get_stats)(const struct netdev *netdev,
274 const struct tc_queue *queue,
275 struct netdev_queue_stats *stats);
277 /* Extracts queue stats from 'nlmsg', which is a response to a
278 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
280 * This function may be null if 'tc' does not have queues or if it cannot
281 * report queue statistics. */
282 int (*class_dump_stats)(const struct netdev *netdev,
283 const struct ofpbuf *nlmsg,
284 netdev_dump_queue_stats_cb *cb, void *aux);
288 tc_init(struct tc *tc, const struct tc_ops *ops)
291 hmap_init(&tc->queues);
295 tc_destroy(struct tc *tc)
297 hmap_destroy(&tc->queues);
300 static const struct tc_ops tc_ops_htb;
301 static const struct tc_ops tc_ops_hfsc;
302 static const struct tc_ops tc_ops_default;
303 static const struct tc_ops tc_ops_other;
305 static const struct tc_ops *tcs[] = {
306 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
307 &tc_ops_hfsc, /* Hierarchical fair service curve. */
308 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
309 &tc_ops_other, /* Some other qdisc. */
313 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
314 static unsigned int tc_get_major(unsigned int handle);
315 static unsigned int tc_get_minor(unsigned int handle);
317 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
318 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
319 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
321 static struct tcmsg *tc_make_request(const struct netdev *, int type,
322 unsigned int flags, struct ofpbuf *);
323 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
325 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
326 struct nlattr **options);
327 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
328 struct nlattr **options,
329 struct netdev_queue_stats *);
330 static int tc_query_class(const struct netdev *,
331 unsigned int handle, unsigned int parent,
332 struct ofpbuf **replyp);
333 static int tc_delete_class(const struct netdev *, unsigned int handle);
335 static int tc_del_qdisc(struct netdev *netdev);
336 static int tc_query_qdisc(const struct netdev *netdev);
338 static int tc_calc_cell_log(unsigned int mtu);
339 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
340 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
341 const struct tc_ratespec *rate);
342 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
344 struct netdev_dev_linux {
345 struct netdev_dev netdev_dev;
347 struct shash_node *shash_node;
348 unsigned int cache_valid;
349 unsigned int change_seq;
351 bool miimon; /* Link status of last poll. */
352 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
353 struct timer miimon_timer;
355 /* The following are figured out "on demand" only. They are only valid
356 * when the corresponding VALID_* bit in 'cache_valid' is set. */
358 uint8_t etheraddr[ETH_ADDR_LEN];
359 struct in_addr address, netmask;
363 bool is_internal; /* Is this an openvswitch internal device? */
364 bool is_tap; /* Is this a tuntap device? */
365 uint32_t kbits_rate; /* Policing data. */
366 uint32_t kbits_burst;
367 bool have_vport_stats;
371 struct tap_state tap;
375 struct netdev_linux {
376 struct netdev netdev;
380 /* Sockets used for ioctl operations. */
381 static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
383 /* A Netlink routing socket that is not subscribed to any multicast groups. */
384 static struct nl_sock *rtnl_sock;
386 /* This is set pretty low because we probably won't learn anything from the
387 * additional log messages. */
388 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
390 static int netdev_linux_init(void);
392 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
393 int cmd, const char *cmd_name);
394 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
395 const char *cmd_name);
396 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
397 int cmd, const char *cmd_name);
398 static int get_flags(const struct netdev *, int *flagsp);
399 static int set_flags(struct netdev *, int flags);
400 static int do_get_ifindex(const char *netdev_name);
401 static int get_ifindex(const struct netdev *, int *ifindexp);
402 static int do_set_addr(struct netdev *netdev,
403 int ioctl_nr, const char *ioctl_name,
404 struct in_addr addr);
405 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
406 static int set_etheraddr(const char *netdev_name, int hwaddr_family,
407 const uint8_t[ETH_ADDR_LEN]);
408 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
409 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
410 static int af_packet_sock(void);
411 static void netdev_linux_miimon_run(void);
412 static void netdev_linux_miimon_wait(void);
/* Returns true if 'netdev_class' is one of the netdev classes implemented in
 * this file.  Such classes are recognized by their 'init' callback being
 * netdev_linux_init(). */
415 is_netdev_linux_class(const struct netdev_class *netdev_class)
417 return netdev_class->init == netdev_linux_init;
/* Converts generic 'netdev_dev' into the 'struct netdev_dev_linux' that embeds
 * it.  Asserts that 'netdev_dev' belongs to one of this file's netdev classes,
 * since the conversion is only meaningful for those. */
420 static struct netdev_dev_linux *
421 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
423 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
424 assert(is_netdev_linux_class(netdev_class));
426 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
/* Converts generic 'netdev' into the 'struct netdev_linux' that embeds it.
 * Asserts that the class of the device underlying 'netdev' is one of this
 * file's netdev classes. */
429 static struct netdev_linux *
430 netdev_linux_cast(const struct netdev *netdev)
432 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
433 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
434 assert(is_netdev_linux_class(netdev_class));
436 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
440 netdev_linux_init(void)
442 static int status = -1;
444 /* Create AF_INET socket. */
445 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
446 status = af_inet_sock >= 0 ? 0 : errno;
448 VLOG_ERR("failed to create inet socket: %s", strerror(status));
451 /* Create rtnetlink socket. */
453 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
455 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
464 netdev_linux_run(void)
466 rtnetlink_link_notifier_run();
467 netdev_linux_miimon_run();
471 netdev_linux_wait(void)
473 rtnetlink_link_notifier_wait();
474 netdev_linux_miimon_wait();
478 netdev_dev_linux_changed(struct netdev_dev_linux *dev)
481 if (!dev->change_seq) {
484 dev->cache_valid = 0;
488 netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
489 void *aux OVS_UNUSED)
491 struct netdev_dev_linux *dev;
493 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
495 const struct netdev_class *netdev_class =
496 netdev_dev_get_class(base_dev);
498 if (is_netdev_linux_class(netdev_class)) {
499 dev = netdev_dev_linux_cast(base_dev);
500 netdev_dev_linux_changed(dev);
504 struct shash device_shash;
505 struct shash_node *node;
507 shash_init(&device_shash);
508 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
509 SHASH_FOR_EACH (node, &device_shash) {
511 netdev_dev_linux_changed(dev);
513 shash_destroy(&device_shash);
517 /* Creates system and internal devices. */
519 netdev_linux_create(const struct netdev_class *class, const char *name,
520 struct netdev_dev **netdev_devp)
522 struct netdev_dev_linux *netdev_dev;
525 if (!cache_notifier_refcount) {
526 error = rtnetlink_link_notifier_register(&netdev_linux_cache_notifier,
527 netdev_linux_cache_cb, NULL);
532 cache_notifier_refcount++;
534 netdev_dev = xzalloc(sizeof *netdev_dev);
535 netdev_dev->change_seq = 1;
536 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
538 *netdev_devp = &netdev_dev->netdev_dev;
542 /* For most types of netdevs we open the device for each call of
543 * netdev_open(). However, this is not the case with tap devices,
544 * since it is only possible to open the device once. In this
545 * situation we share a single file descriptor, and consequently
546 * buffers, across all readers. Therefore once data is read it will
547 * be unavailable to other reads for tap devices. */
549 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
550 const char *name, struct netdev_dev **netdev_devp)
552 struct netdev_dev_linux *netdev_dev;
553 struct tap_state *state;
554 static const char tap_dev[] = "/dev/net/tun";
558 netdev_dev = xzalloc(sizeof *netdev_dev);
559 state = &netdev_dev->state.tap;
561 /* Open tap device. */
562 state->fd = open(tap_dev, O_RDWR);
565 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
569 /* Create tap device. */
570 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
571 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
572 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
573 VLOG_WARN("%s: creating tap device failed: %s", name,
579 /* Make non-blocking. */
580 error = set_nonblocking(state->fd);
585 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
586 *netdev_devp = &netdev_dev->netdev_dev;
595 destroy_tap(struct netdev_dev_linux *netdev_dev)
597 struct tap_state *state = &netdev_dev->state.tap;
599 if (state->fd >= 0) {
604 /* Destroys the netdev device 'netdev_dev_'. */
606 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
608 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
609 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
611 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
612 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
615 if (class == &netdev_linux_class || class == &netdev_internal_class) {
616 cache_notifier_refcount--;
618 if (!cache_notifier_refcount) {
619 rtnetlink_link_notifier_unregister(&netdev_linux_cache_notifier);
621 } else if (class == &netdev_tap_class) {
622 destroy_tap(netdev_dev);
631 netdev_linux_open(struct netdev_dev *netdev_dev_, struct netdev **netdevp)
633 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
634 struct netdev_linux *netdev;
635 enum netdev_flags flags;
638 /* Allocate network device. */
639 netdev = xzalloc(sizeof *netdev);
641 netdev_init(&netdev->netdev, netdev_dev_);
643 /* Verify that the device really exists, by attempting to read its flags.
644 * (The flags might be cached, in which case this won't actually do an
647 * Don't do this for "internal" netdevs, though, because those have to be
648 * created as netdev objects before they exist in the kernel, because
649 * creating them in the kernel happens by passing a netdev object to
650 * dpif_port_add(). */
651 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
652 error = netdev_get_flags(&netdev->netdev, &flags);
653 if (error == ENODEV) {
658 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
659 !netdev_dev->state.tap.opened) {
661 /* We assume that the first user of the tap device is the primary user
662 * and give them the tap FD. Subsequent users probably just expect
663 * this to be a system device so open it normally to avoid send/receive
664 * directions appearing to be reversed. */
665 netdev->fd = netdev_dev->state.tap.fd;
666 netdev_dev->state.tap.opened = true;
669 *netdevp = &netdev->netdev;
673 netdev_uninit(&netdev->netdev, true);
677 /* Closes and destroys 'netdev'. */
679 netdev_linux_close(struct netdev *netdev_)
681 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* Applies only to non-tap netdevs that hold a descriptor.  The test is
 * 'fd >= 0' because 0 is a valid file descriptor and the rest of this file
 * uses 'fd < 0' to mean "no descriptor". */
683 if (netdev->fd >= 0 && strcmp(netdev_get_type(netdev_), "tap")) {
689 /* Initializes 'sset' with a list of the names of all known network devices. */
691 netdev_linux_enumerate(struct sset *sset)
693 struct if_nameindex *names;
695 names = if_nameindex();
699 for (i = 0; names[i].if_name != NULL; i++) {
700 sset_add(sset, names[i].if_name);
702 if_freenameindex(names);
705 VLOG_WARN("could not obtain list of network device names: %s",
712 netdev_linux_listen(struct netdev *netdev_)
714 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
715 struct sockaddr_ll sll;
720 if (netdev->fd >= 0) {
724 /* Create file descriptor. */
725 fd = socket(PF_PACKET, SOCK_RAW, 0);
728 VLOG_ERR("failed to create raw socket (%s)", strerror(error));
732 /* Set non-blocking mode. */
733 error = set_nonblocking(fd);
738 /* Get ethernet device index. */
739 error = get_ifindex(&netdev->netdev, &ifindex);
744 /* Bind to specific ethernet device. */
745 memset(&sll, 0, sizeof sll);
746 sll.sll_family = AF_PACKET;
747 sll.sll_ifindex = ifindex;
748 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
749 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
751 VLOG_ERR("%s: failed to bind raw socket (%s)",
752 netdev_get_name(netdev_), strerror(error));
/* Reads from 'netdev_''s file descriptor into 'data', up to 'size' bytes.
 * A read failure other than EINTR or EAGAIN is logged (rate-limited) with the
 * device name followed by the error description. */
767 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
769 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
771 if (netdev->fd < 0) {
772 /* Device is not listening. */
777 ssize_t retval = read(netdev->fd, data, size);
780 } else if (errno != EINTR) {
781 if (errno != EAGAIN) {
/* Argument order must match the format: device name, then error. */
782 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
783 netdev_get_name(netdev_), strerror(errno));
790 /* Registers with the poll loop to wake up from the next call to poll_block()
791 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
793 netdev_linux_recv_wait(struct netdev *netdev_)
795 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
796 if (netdev->fd >= 0) {
797 poll_fd_wait(netdev->fd, POLLIN);
801 /* Discards all packets waiting to be received from 'netdev'. */
803 netdev_linux_drain(struct netdev *netdev_)
805 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
806 if (netdev->fd < 0) {
808 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
810 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
811 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
815 drain_fd(netdev->fd, ifr.ifr_qlen);
818 return drain_rcvbuf(netdev->fd);
822 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
823 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
824 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
825 * the packet is too big or too small to transmit on the device.
827 * The caller retains ownership of 'buffer' in all cases.
829 * The kernel maintains a packet transmission queue, so the caller is not
830 * expected to do additional queuing of packets. */
832 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
834 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
838 if (netdev->fd < 0) {
839 /* Use our AF_PACKET socket to send to this device. */
840 struct sockaddr_ll sll;
847 sock = af_packet_sock();
852 error = get_ifindex(netdev_, &ifindex);
857 /* We don't bother setting most fields in sockaddr_ll because the
858 * kernel ignores them for SOCK_RAW. */
859 memset(&sll, 0, sizeof sll);
860 sll.sll_family = AF_PACKET;
861 sll.sll_ifindex = ifindex;
863 iov.iov_base = (void *) data;
867 msg.msg_namelen = sizeof sll;
870 msg.msg_control = NULL;
871 msg.msg_controllen = 0;
874 retval = sendmsg(sock, &msg, 0);
876 /* Use the netdev's own fd to send to this device. This is
877 * essential for tap devices, because packets sent to a tap device
878 * with an AF_PACKET socket will loop back to be *received* again
879 * on the tap device. */
880 retval = write(netdev->fd, data, size);
884 /* The Linux AF_PACKET implementation never blocks waiting for room
885 * for packets, instead returning ENOBUFS. Translate this into
886 * EAGAIN for the caller. */
887 if (errno == ENOBUFS) {
889 } else if (errno == EINTR) {
891 } else if (errno != EAGAIN) {
892 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
893 netdev_get_name(netdev_), strerror(errno));
896 } else if (retval != size) {
897 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
898 "%zu) on %s", retval, size, netdev_get_name(netdev_));
906 /* Registers with the poll loop to wake up from the next call to poll_block()
907 * when the packet transmission queue has sufficient room to transmit a packet
908 * with netdev_send().
910 * The kernel maintains a packet transmission queue, so the client is not
911 * expected to do additional queuing of packets. Thus, this function is
912 * unlikely to ever be used. It is included for completeness. */
914 netdev_linux_send_wait(struct netdev *netdev_)
916 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
917 if (netdev->fd < 0) {
919 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
920 poll_fd_wait(netdev->fd, POLLOUT);
922 /* TAP device always accepts packets.*/
923 poll_immediate_wake();
927 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
928 * otherwise a positive errno value. */
930 netdev_linux_set_etheraddr(struct netdev *netdev_,
931 const uint8_t mac[ETH_ADDR_LEN])
933 struct netdev_dev_linux *netdev_dev =
934 netdev_dev_linux_cast(netdev_get_dev(netdev_));
937 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
938 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
939 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
941 netdev_dev->cache_valid |= VALID_ETHERADDR;
942 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
950 /* Copies 'netdev''s MAC address into 'mac', fetching it with get_etheraddr()
951 * and caching it in the device if it is not already cached. */
953 netdev_linux_get_etheraddr(const struct netdev *netdev_,
954 uint8_t mac[ETH_ADDR_LEN])
956 struct netdev_dev_linux *netdev_dev =
957 netdev_dev_linux_cast(netdev_get_dev(netdev_));
958 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
959 int error = get_etheraddr(netdev_get_name(netdev_),
960 netdev_dev->etheraddr);
964 netdev_dev->cache_valid |= VALID_ETHERADDR;
966 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
970 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
971 * in bytes, not including the hardware header; thus, this is typically 1500
972 * bytes for Ethernet devices. */
974 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
976 struct netdev_dev_linux *netdev_dev =
977 netdev_dev_linux_cast(netdev_get_dev(netdev_));
978 if (!(netdev_dev->cache_valid & VALID_MTU)) {
982 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
983 SIOCGIFMTU, "SIOCGIFMTU");
987 netdev_dev->mtu = ifr.ifr_mtu;
988 netdev_dev->cache_valid |= VALID_MTU;
990 *mtup = netdev_dev->mtu;
994 /* Returns the ifindex of 'netdev', if successful, as a positive number.
995 * On failure, returns a negative errno value. */
997 netdev_linux_get_ifindex(const struct netdev *netdev)
1001 error = get_ifindex(netdev, &ifindex);
1002 return error ? -error : ifindex;
1006 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1008 struct netdev_dev_linux *netdev_dev =
1009 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1014 if (netdev_dev->miimon_interval > 0) {
1015 *carrier = netdev_dev->miimon;
1019 if (!(netdev_dev->cache_valid & VALID_CARRIER)) {
1023 fn = xasprintf("/sys/class/net/%s/carrier",
1024 netdev_get_name(netdev_));
1025 fd = open(fn, O_RDONLY);
1028 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
1032 retval = read(fd, line, sizeof line);
1035 if (error == EINVAL) {
1036 /* This is the normal return value when we try to check carrier
1037 * if the network device is not up. */
1039 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
1042 } else if (retval == 0) {
1044 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
1048 if (line[0] != '0' && line[0] != '1') {
1050 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)",
1054 netdev_dev->carrier = line[0] != '0';
1055 netdev_dev->cache_valid |= VALID_CARRIER;
1057 *carrier = netdev_dev->carrier;
1069 netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1070 struct mii_ioctl_data *data)
1075 memset(&ifr, 0, sizeof ifr);
1076 memcpy(&ifr.ifr_data, data, sizeof *data);
1077 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1078 memcpy(data, &ifr.ifr_data, sizeof *data);
1084 netdev_linux_get_miimon(const char *name, bool *miimon)
1086 struct mii_ioctl_data data;
1091 memset(&data, 0, sizeof data);
1092 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1094 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1095 data.reg_num = MII_BMSR;
1096 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1100 *miimon = !!(data.val_out & BMSR_LSTATUS);
1102 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1105 struct ethtool_cmd ecmd;
1107 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1110 memset(&ecmd, 0, sizeof ecmd);
1111 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1114 struct ethtool_value eval;
1116 memcpy(&eval, &ecmd, sizeof eval);
1117 *miimon = !!eval.data;
1119 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1127 netdev_linux_set_miimon_interval(struct netdev *netdev_,
1128 long long int interval)
1130 struct netdev_dev_linux *netdev_dev;
1132 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
1134 interval = interval > 0 ? MAX(interval, 100) : 0;
1135 if (netdev_dev->miimon_interval != interval) {
1136 netdev_dev->miimon_interval = interval;
1137 timer_set_expired(&netdev_dev->miimon_timer);
1144 netdev_linux_miimon_run(void)
1146 struct shash device_shash;
1147 struct shash_node *node;
1149 shash_init(&device_shash);
1150 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1151 SHASH_FOR_EACH (node, &device_shash) {
1152 struct netdev_dev_linux *dev = node->data;
1155 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1159 netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
1160 if (miimon != dev->miimon) {
1161 dev->miimon = miimon;
1162 netdev_dev_linux_changed(dev);
1165 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1168 shash_destroy(&device_shash);
1172 netdev_linux_miimon_wait(void)
1174 struct shash device_shash;
1175 struct shash_node *node;
1177 shash_init(&device_shash);
1178 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1179 SHASH_FOR_EACH (node, &device_shash) {
1180 struct netdev_dev_linux *dev = node->data;
1182 if (dev->miimon_interval > 0) {
1183 timer_wait(&dev->miimon_timer);
1186 shash_destroy(&device_shash);
1189 /* Checks whether we can use RTM_GETLINK to get network device statistics.
1190 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1193 check_for_working_netlink_stats(void)
1195 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1196 * preferable, so if that works, we'll use it. */
1197 int ifindex = do_get_ifindex("lo");
1199 VLOG_WARN("failed to get ifindex for lo, "
1200 "obtaining netdev stats from proc");
1203 struct netdev_stats stats;
1204 int error = get_stats_via_netlink(ifindex, &stats);
1206 VLOG_DBG("obtaining netdev stats via rtnetlink");
1209 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1210 "via proc (you are probably running a pre-2.6.19 "
1211 "kernel)", strerror(error));
1217 /* Brings the 'is_internal' and 'is_tap' members of 'netdev_dev' up-to-date. */
1219 netdev_linux_update_is_pseudo(struct netdev_dev_linux *netdev_dev)
1221 if (!(netdev_dev->cache_valid & VALID_IS_PSEUDO)) {
1222 const char *name = netdev_dev_get_name(&netdev_dev->netdev_dev);
1223 const char *type = netdev_dev_get_type(&netdev_dev->netdev_dev);
1225 netdev_dev->is_tap = !strcmp(type, "tap");
1226 netdev_dev->is_internal = (!netdev_dev->is_tap
1227 && dpif_linux_is_internal_device(name));
1228 netdev_dev->cache_valid |= VALID_IS_PSEUDO;
1233 swap_uint64(uint64_t *a, uint64_t *b)
1240 /* Retrieves current device stats for 'netdev'. */
1242 netdev_linux_get_stats(const struct netdev *netdev_,
1243 struct netdev_stats *stats)
1245 struct netdev_dev_linux *netdev_dev =
1246 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1247 static int use_netlink_stats = -1;
1250 if (netdev_dev->have_vport_stats ||
1251 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1253 error = netdev_vport_get_stats(netdev_, stats);
1254 netdev_dev->have_vport_stats = !error;
1255 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
1258 if (!netdev_dev->have_vport_stats) {
1259 if (use_netlink_stats < 0) {
1260 use_netlink_stats = check_for_working_netlink_stats();
1262 if (use_netlink_stats) {
1265 error = get_ifindex(netdev_, &ifindex);
1267 error = get_stats_via_netlink(ifindex, stats);
1270 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1274 /* If this port is an internal port then the transmit and receive stats
1275 * will appear to be swapped relative to the other ports since we are the
1276 * one sending the data, not a remote computer. For consistency, we swap
1277 * them back here. This does not apply if we are getting stats from the
1278 * vport layer because it always tracks stats from the perspective of the
1280 netdev_linux_update_is_pseudo(netdev_dev);
1281 if (!error && !netdev_dev->have_vport_stats &&
1282 (netdev_dev->is_internal || netdev_dev->is_tap)) {
1283 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1284 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1285 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1286 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1287 stats->rx_length_errors = 0;
1288 stats->rx_over_errors = 0;
1289 stats->rx_crc_errors = 0;
1290 stats->rx_frame_errors = 0;
1291 stats->rx_fifo_errors = 0;
1292 stats->rx_missed_errors = 0;
1293 stats->tx_aborted_errors = 0;
1294 stats->tx_carrier_errors = 0;
1295 stats->tx_fifo_errors = 0;
1296 stats->tx_heartbeat_errors = 0;
1297 stats->tx_window_errors = 0;
1303 /* Stores the features supported by 'netdev' into each of '*current',
1304 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1305 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
1306 * successful, otherwise a positive errno value. */
1308 netdev_linux_get_features(const struct netdev *netdev,
1309 uint32_t *current, uint32_t *advertised,
1310 uint32_t *supported, uint32_t *peer)
1312 struct ethtool_cmd ecmd;
1315 memset(&ecmd, 0, sizeof ecmd);
1316 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1317 ETHTOOL_GSET, "ETHTOOL_GSET");
1322 /* Supported features. */
1324 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1325 *supported |= OFPPF_10MB_HD;
1327 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1328 *supported |= OFPPF_10MB_FD;
1330 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1331 *supported |= OFPPF_100MB_HD;
1333 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1334 *supported |= OFPPF_100MB_FD;
1336 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1337 *supported |= OFPPF_1GB_HD;
1339 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1340 *supported |= OFPPF_1GB_FD;
1342 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1343 *supported |= OFPPF_10GB_FD;
1345 if (ecmd.supported & SUPPORTED_TP) {
1346 *supported |= OFPPF_COPPER;
1348 if (ecmd.supported & SUPPORTED_FIBRE) {
1349 *supported |= OFPPF_FIBER;
1351 if (ecmd.supported & SUPPORTED_Autoneg) {
1352 *supported |= OFPPF_AUTONEG;
1354 if (ecmd.supported & SUPPORTED_Pause) {
1355 *supported |= OFPPF_PAUSE;
1357 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1358 *supported |= OFPPF_PAUSE_ASYM;
1361 /* Advertised features. */
1363 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1364 *advertised |= OFPPF_10MB_HD;
1366 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1367 *advertised |= OFPPF_10MB_FD;
1369 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1370 *advertised |= OFPPF_100MB_HD;
1372 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1373 *advertised |= OFPPF_100MB_FD;
1375 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1376 *advertised |= OFPPF_1GB_HD;
1378 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1379 *advertised |= OFPPF_1GB_FD;
1381 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1382 *advertised |= OFPPF_10GB_FD;
1384 if (ecmd.advertising & ADVERTISED_TP) {
1385 *advertised |= OFPPF_COPPER;
1387 if (ecmd.advertising & ADVERTISED_FIBRE) {
1388 *advertised |= OFPPF_FIBER;
1390 if (ecmd.advertising & ADVERTISED_Autoneg) {
1391 *advertised |= OFPPF_AUTONEG;
1393 if (ecmd.advertising & ADVERTISED_Pause) {
1394 *advertised |= OFPPF_PAUSE;
1396 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1397 *advertised |= OFPPF_PAUSE_ASYM;
1400 /* Current settings. */
1401 if (ecmd.speed == SPEED_10) {
1402 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1403 } else if (ecmd.speed == SPEED_100) {
1404 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1405 } else if (ecmd.speed == SPEED_1000) {
1406 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1407 } else if (ecmd.speed == SPEED_10000) {
1408 *current = OFPPF_10GB_FD;
1413 if (ecmd.port == PORT_TP) {
1414 *current |= OFPPF_COPPER;
1415 } else if (ecmd.port == PORT_FIBRE) {
1416 *current |= OFPPF_FIBER;
1420 *current |= OFPPF_AUTONEG;
1423 /* Peer advertisements. */
1424 *peer = 0; /* XXX */
1429 /* Set the features advertised by 'netdev' to 'advertise'. */
1431 netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1433 struct ethtool_cmd ecmd;
1436 memset(&ecmd, 0, sizeof ecmd);
1437 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1438 ETHTOOL_GSET, "ETHTOOL_GSET");
1443 ecmd.advertising = 0;
1444 if (advertise & OFPPF_10MB_HD) {
1445 ecmd.advertising |= ADVERTISED_10baseT_Half;
1447 if (advertise & OFPPF_10MB_FD) {
1448 ecmd.advertising |= ADVERTISED_10baseT_Full;
1450 if (advertise & OFPPF_100MB_HD) {
1451 ecmd.advertising |= ADVERTISED_100baseT_Half;
1453 if (advertise & OFPPF_100MB_FD) {
1454 ecmd.advertising |= ADVERTISED_100baseT_Full;
1456 if (advertise & OFPPF_1GB_HD) {
1457 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1459 if (advertise & OFPPF_1GB_FD) {
1460 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1462 if (advertise & OFPPF_10GB_FD) {
1463 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1465 if (advertise & OFPPF_COPPER) {
1466 ecmd.advertising |= ADVERTISED_TP;
1468 if (advertise & OFPPF_FIBER) {
1469 ecmd.advertising |= ADVERTISED_FIBRE;
1471 if (advertise & OFPPF_AUTONEG) {
1472 ecmd.advertising |= ADVERTISED_Autoneg;
1474 if (advertise & OFPPF_PAUSE) {
1475 ecmd.advertising |= ADVERTISED_Pause;
1477 if (advertise & OFPPF_PAUSE_ASYM) {
1478 ecmd.advertising |= ADVERTISED_Asym_Pause;
1480 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1481 ETHTOOL_SSET, "ETHTOOL_SSET");
1484 /* If 'netdev' is a VLAN network device (e.g. one created with
1485 * vconfig(8)), sets '*vlan_vid' to the VLAN VID associated with that device
1486 * and returns 0. Otherwise returns an errno value (specifically ENOENT if
1487 * 'netdev' is a network device that is not a VLAN device) and
1488 * sets '*vlan_vid' to -1. */
1490 netdev_linux_get_vlan_vid(const struct netdev *netdev, int *vlan_vid)
1492 const char *netdev_name = netdev_get_name(netdev);
1493 struct ds line = DS_EMPTY_INITIALIZER;
1494 FILE *stream = NULL;
1498 COVERAGE_INC(netdev_get_vlan_vid);
/* The VID is read from the first line of the per-device file under
 * /proc/net/vlan. */
1499 fn = xasprintf("/proc/net/vlan/%s", netdev_name);
1500 stream = fopen(fn, "r");
1506 if (ds_get_line(&line, stream)) {
1507 if (ferror(stream)) {
1509 VLOG_ERR_RL(&rl, "error reading \"%s\": %s", fn, strerror(errno));
1512 VLOG_ERR_RL(&rl, "unexpected end of file reading \"%s\"", fn);
/* sscanf() returns EOF, which is nonzero, when the input ends before "%d"
 * can be converted, so require exactly one successful conversion instead of
 * merely a nonzero result.  ("%*s" is suppressed and is not counted.) */
1517 if (sscanf(ds_cstr(&line), "%*s VID: %d", vlan_vid) != 1) {
1519 VLOG_ERR_RL(&rl, "parse error reading \"%s\" line 1: \"%s\"",
1520 fn, ds_cstr(&line));
1538 #define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress"
1539 #define POLICE_CONFIG_CMD "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 u32 match ip src 0.0.0.0/0 police rate %dkbit burst %dk mtu 65535 drop flowid :1"
1541 /* Remove ingress policing from 'netdev'. Returns 0 if successful, otherwise a
1542 * positive errno value.
1544 * This function is equivalent to running
1545 * /sbin/tc qdisc del dev %s handle ffff: ingress
1546 * but it is much, much faster.
1549 netdev_linux_remove_policing(struct netdev *netdev)
1551 struct netdev_dev_linux *netdev_dev =
1552 netdev_dev_linux_cast(netdev_get_dev(netdev));
1553 const char *netdev_name = netdev_get_name(netdev);
1555 struct ofpbuf request;
1556 struct tcmsg *tcmsg;
1559 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
1563 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
1564 tcmsg->tcm_parent = TC_H_INGRESS;
1565 nl_msg_put_string(&request, TCA_KIND, "ingress");
1566 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
1568 error = tc_transact(&request, NULL);
1569 if (error && error != ENOENT && error != EINVAL) {
1570 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1571 netdev_name, strerror(error));
1575 netdev_dev->kbits_rate = 0;
1576 netdev_dev->kbits_burst = 0;
1577 netdev_dev->cache_valid |= VALID_POLICING;
1581 /* Attempts to set input rate limiting (policing) policy. */
1583 netdev_linux_set_policing(struct netdev *netdev,
1584 uint32_t kbits_rate, uint32_t kbits_burst)
1586 struct netdev_dev_linux *netdev_dev =
1587 netdev_dev_linux_cast(netdev_get_dev(netdev));
1588 const char *netdev_name = netdev_get_name(netdev);
1591 COVERAGE_INC(netdev_set_policing);
1593 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1594 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1595 : kbits_burst); /* Stick with user-specified value. */
1597 if (netdev_dev->cache_valid & VALID_POLICING
1598 && netdev_dev->kbits_rate == kbits_rate
1599 && netdev_dev->kbits_burst == kbits_burst) {
1600 /* Assume that settings haven't changed since we last set them. */
1604 netdev_linux_remove_policing(netdev);
1606 snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
1607 if (system(command) != 0) {
1608 VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
1612 snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
1613 kbits_rate, kbits_burst);
1614 if (system(command) != 0) {
1615 VLOG_WARN_RL(&rl, "%s: problem configuring policing",
1620 netdev_dev->kbits_rate = kbits_rate;
1621 netdev_dev->kbits_burst = kbits_burst;
1622 netdev_dev->cache_valid |= VALID_POLICING;
1629 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1632 const struct tc_ops **opsp;
1634 for (opsp = tcs; *opsp != NULL; opsp++) {
1635 const struct tc_ops *ops = *opsp;
1636 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1637 sset_add(types, ops->ovs_name);
1643 static const struct tc_ops *
1644 tc_lookup_ovs_name(const char *name)
1646 const struct tc_ops **opsp;
1648 for (opsp = tcs; *opsp != NULL; opsp++) {
1649 const struct tc_ops *ops = *opsp;
1650 if (!strcmp(name, ops->ovs_name)) {
1657 static const struct tc_ops *
1658 tc_lookup_linux_name(const char *name)
1660 const struct tc_ops **opsp;
1662 for (opsp = tcs; *opsp != NULL; opsp++) {
1663 const struct tc_ops *ops = *opsp;
1664 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1671 static struct tc_queue *
1672 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1675 struct netdev_dev_linux *netdev_dev =
1676 netdev_dev_linux_cast(netdev_get_dev(netdev));
1677 struct tc_queue *queue;
1679 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1680 if (queue->queue_id == queue_id) {
1687 static struct tc_queue *
1688 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1690 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
1694 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1696 struct netdev_qos_capabilities *caps)
1698 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1702 caps->n_queues = ops->n_queues;
1707 netdev_linux_get_qos(const struct netdev *netdev,
1708 const char **typep, struct shash *details)
1710 struct netdev_dev_linux *netdev_dev =
1711 netdev_dev_linux_cast(netdev_get_dev(netdev));
1714 error = tc_query_qdisc(netdev);
1719 *typep = netdev_dev->tc->ops->ovs_name;
1720 return (netdev_dev->tc->ops->qdisc_get
1721 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
1726 netdev_linux_set_qos(struct netdev *netdev,
1727 const char *type, const struct shash *details)
1729 struct netdev_dev_linux *netdev_dev =
1730 netdev_dev_linux_cast(netdev_get_dev(netdev));
1731 const struct tc_ops *new_ops;
1734 new_ops = tc_lookup_ovs_name(type);
1735 if (!new_ops || !new_ops->tc_install) {
1739 error = tc_query_qdisc(netdev);
1744 if (new_ops == netdev_dev->tc->ops) {
1745 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1747 /* Delete existing qdisc. */
1748 error = tc_del_qdisc(netdev);
1752 assert(netdev_dev->tc == NULL);
1754 /* Install new qdisc. */
1755 error = new_ops->tc_install(netdev, details);
1756 assert((error == 0) == (netdev_dev->tc != NULL));
1763 netdev_linux_get_queue(const struct netdev *netdev,
1764 unsigned int queue_id, struct shash *details)
1766 struct netdev_dev_linux *netdev_dev =
1767 netdev_dev_linux_cast(netdev_get_dev(netdev));
1770 error = tc_query_qdisc(netdev);
1774 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1776 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
1782 netdev_linux_set_queue(struct netdev *netdev,
1783 unsigned int queue_id, const struct shash *details)
1785 struct netdev_dev_linux *netdev_dev =
1786 netdev_dev_linux_cast(netdev_get_dev(netdev));
1789 error = tc_query_qdisc(netdev);
1792 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1793 || !netdev_dev->tc->ops->class_set) {
1797 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
1801 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1803 struct netdev_dev_linux *netdev_dev =
1804 netdev_dev_linux_cast(netdev_get_dev(netdev));
1807 error = tc_query_qdisc(netdev);
1810 } else if (!netdev_dev->tc->ops->class_delete) {
1813 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1815 ? netdev_dev->tc->ops->class_delete(netdev, queue)
1821 netdev_linux_get_queue_stats(const struct netdev *netdev,
1822 unsigned int queue_id,
1823 struct netdev_queue_stats *stats)
1825 struct netdev_dev_linux *netdev_dev =
1826 netdev_dev_linux_cast(netdev_get_dev(netdev));
1829 error = tc_query_qdisc(netdev);
1832 } else if (!netdev_dev->tc->ops->class_get_stats) {
1835 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1837 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
1843 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1845 struct ofpbuf request;
1846 struct tcmsg *tcmsg;
1848 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1852 tcmsg->tcm_parent = 0;
1853 nl_dump_start(dump, rtnl_sock, &request);
1854 ofpbuf_uninit(&request);
1859 netdev_linux_dump_queues(const struct netdev *netdev,
1860 netdev_dump_queues_cb *cb, void *aux)
1862 struct netdev_dev_linux *netdev_dev =
1863 netdev_dev_linux_cast(netdev_get_dev(netdev));
1864 struct tc_queue *queue;
1865 struct shash details;
1869 error = tc_query_qdisc(netdev);
1872 } else if (!netdev_dev->tc->ops->class_get) {
1877 shash_init(&details);
1878 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
1879 shash_clear(&details);
1881 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
1883 (*cb)(queue->queue_id, &details, aux);
1888 shash_destroy(&details);
1894 netdev_linux_dump_queue_stats(const struct netdev *netdev,
1895 netdev_dump_queue_stats_cb *cb, void *aux)
1897 struct netdev_dev_linux *netdev_dev =
1898 netdev_dev_linux_cast(netdev_get_dev(netdev));
1899 struct nl_dump dump;
1904 error = tc_query_qdisc(netdev);
1907 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1912 if (!start_queue_dump(netdev, &dump)) {
1915 while (nl_dump_next(&dump, &msg)) {
1916 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1922 error = nl_dump_done(&dump);
1923 return error ? error : last_error;
1927 netdev_linux_get_in4(const struct netdev *netdev_,
1928 struct in_addr *address, struct in_addr *netmask)
1930 struct netdev_dev_linux *netdev_dev =
1931 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1933 if (!(netdev_dev->cache_valid & VALID_IN4)) {
1936 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
1937 SIOCGIFADDR, "SIOCGIFADDR");
1942 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
1943 SIOCGIFNETMASK, "SIOCGIFNETMASK");
1948 netdev_dev->cache_valid |= VALID_IN4;
1950 *address = netdev_dev->address;
1951 *netmask = netdev_dev->netmask;
1952 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
1956 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
1957 struct in_addr netmask)
1959 struct netdev_dev_linux *netdev_dev =
1960 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1963 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
1965 netdev_dev->cache_valid |= VALID_IN4;
1966 netdev_dev->address = address;
1967 netdev_dev->netmask = netmask;
1968 if (address.s_addr != INADDR_ANY) {
1969 error = do_set_addr(netdev_, SIOCSIFNETMASK,
1970 "SIOCSIFNETMASK", netmask);
1977 parse_if_inet6_line(const char *line,
1978 struct in6_addr *in6, char ifname[16 + 1])
1980 uint8_t *s6 = in6->s6_addr;
1981 #define X8 "%2"SCNx8
1983 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
1984 "%*x %*x %*x %*x %16s\n",
1985 &s6[0], &s6[1], &s6[2], &s6[3],
1986 &s6[4], &s6[5], &s6[6], &s6[7],
1987 &s6[8], &s6[9], &s6[10], &s6[11],
1988 &s6[12], &s6[13], &s6[14], &s6[15],
1992 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
1993 * 'in6' is non-null) and returns true. Otherwise, returns false. */
1995 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
1997 struct netdev_dev_linux *netdev_dev =
1998 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1999 if (!(netdev_dev->cache_valid & VALID_IN6)) {
2003 netdev_dev->in6 = in6addr_any;
2005 file = fopen("/proc/net/if_inet6", "r");
2007 const char *name = netdev_get_name(netdev_);
2008 while (fgets(line, sizeof line, file)) {
2009 struct in6_addr in6_tmp;
2010 char ifname[16 + 1];
2011 if (parse_if_inet6_line(line, &in6_tmp, ifname)
2012 && !strcmp(name, ifname))
2014 netdev_dev->in6 = in6_tmp;
2020 netdev_dev->cache_valid |= VALID_IN6;
2022 *in6 = netdev_dev->in6;
2027 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2029 struct sockaddr_in sin;
2030 memset(&sin, 0, sizeof sin);
2031 sin.sin_family = AF_INET;
2032 sin.sin_addr = addr;
2035 memset(sa, 0, sizeof *sa);
2036 memcpy(sa, &sin, sizeof sin);
2040 do_set_addr(struct netdev *netdev,
2041 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2044 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2045 make_in4_sockaddr(&ifr.ifr_addr, addr);
2047 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2051 /* Adds 'router' as a default IP gateway. */
2053 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2055 struct in_addr any = { INADDR_ANY };
2059 memset(&rt, 0, sizeof rt);
2060 make_in4_sockaddr(&rt.rt_dst, any);
2061 make_in4_sockaddr(&rt.rt_gateway, router);
2062 make_in4_sockaddr(&rt.rt_genmask, any);
2063 rt.rt_flags = RTF_UP | RTF_GATEWAY;
2064 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2066 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
2072 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2075 static const char fn[] = "/proc/net/route";
2080 *netdev_name = NULL;
2081 stream = fopen(fn, "r");
2082 if (stream == NULL) {
2083 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2088 while (fgets(line, sizeof line, stream)) {
2091 ovs_be32 dest, gateway, mask;
2092 int refcnt, metric, mtu;
2093 unsigned int flags, use, window, irtt;
2096 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2098 iface, &dest, &gateway, &flags, &refcnt,
2099 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2101 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2105 if (!(flags & RTF_UP)) {
2106 /* Skip routes that aren't up. */
2110 /* The output of 'dest', 'mask', and 'gateway' were given in
2111 * network byte order, so we don't need any endian
2112 * conversions here. */
2113 if ((dest & mask) == (host->s_addr & mask)) {
2115 /* The host is directly reachable. */
2116 next_hop->s_addr = 0;
2118 /* To reach the host, we must go through a gateway. */
2119 next_hop->s_addr = gateway;
2121 *netdev_name = xstrdup(iface);
2133 netdev_linux_get_status(const struct netdev *netdev, struct shash *sh)
2135 struct ethtool_drvinfo drvinfo;
2138 memset(&drvinfo, 0, sizeof drvinfo);
2139 error = netdev_linux_do_ethtool(netdev_get_name(netdev),
2140 (struct ethtool_cmd *)&drvinfo,
2142 "ETHTOOL_GDRVINFO");
2144 shash_add(sh, "driver_name", xstrdup(drvinfo.driver));
2145 shash_add(sh, "driver_version", xstrdup(drvinfo.version));
2146 shash_add(sh, "firmware_version", xstrdup(drvinfo.fw_version));
2152 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2153 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2154 * returns 0. Otherwise, it returns a positive errno value; in particular,
2155 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
2157 netdev_linux_arp_lookup(const struct netdev *netdev,
2158 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2161 struct sockaddr_in sin;
2164 memset(&r, 0, sizeof r);
2165 memset(&sin, 0, sizeof sin);
2166 sin.sin_family = AF_INET;
2167 sin.sin_addr.s_addr = ip;
2169 memcpy(&r.arp_pa, &sin, sizeof sin);
2170 r.arp_ha.sa_family = ARPHRD_ETHER;
2172 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2173 COVERAGE_INC(netdev_arp_lookup);
2174 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2176 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2177 } else if (retval != ENXIO) {
2178 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2179 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
2185 nd_to_iff_flags(enum netdev_flags nd)
2188 if (nd & NETDEV_UP) {
2191 if (nd & NETDEV_PROMISC) {
2198 iff_to_nd_flags(int iff)
2200 enum netdev_flags nd = 0;
2204 if (iff & IFF_PROMISC) {
2205 nd |= NETDEV_PROMISC;
2211 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2212 enum netdev_flags on, enum netdev_flags *old_flagsp)
2214 int old_flags, new_flags;
2217 error = get_flags(netdev, &old_flags);
2219 *old_flagsp = iff_to_nd_flags(old_flags);
2220 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2221 if (new_flags != old_flags) {
2222 error = set_flags(netdev, new_flags);
2229 netdev_linux_change_seq(const struct netdev *netdev)
2231 return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
2234 #define NETDEV_LINUX_CLASS(NAME, CREATE, ENUMERATE, SET_STATS) \
2238 netdev_linux_init, \
2240 netdev_linux_wait, \
2243 netdev_linux_destroy, \
2244 NULL, /* get_config */ \
2245 NULL, /* set_config */ \
2247 netdev_linux_open, \
2248 netdev_linux_close, \
2252 netdev_linux_listen, \
2253 netdev_linux_recv, \
2254 netdev_linux_recv_wait, \
2255 netdev_linux_drain, \
2257 netdev_linux_send, \
2258 netdev_linux_send_wait, \
2260 netdev_linux_set_etheraddr, \
2261 netdev_linux_get_etheraddr, \
2262 netdev_linux_get_mtu, \
2263 netdev_linux_get_ifindex, \
2264 netdev_linux_get_carrier, \
2265 netdev_linux_set_miimon_interval, \
2266 netdev_linux_get_stats, \
2269 netdev_linux_get_features, \
2270 netdev_linux_set_advertisements, \
2271 netdev_linux_get_vlan_vid, \
2273 netdev_linux_set_policing, \
2274 netdev_linux_get_qos_types, \
2275 netdev_linux_get_qos_capabilities, \
2276 netdev_linux_get_qos, \
2277 netdev_linux_set_qos, \
2278 netdev_linux_get_queue, \
2279 netdev_linux_set_queue, \
2280 netdev_linux_delete_queue, \
2281 netdev_linux_get_queue_stats, \
2282 netdev_linux_dump_queues, \
2283 netdev_linux_dump_queue_stats, \
2285 netdev_linux_get_in4, \
2286 netdev_linux_set_in4, \
2287 netdev_linux_get_in6, \
2288 netdev_linux_add_router, \
2289 netdev_linux_get_next_hop, \
2290 netdev_linux_get_status, \
2291 netdev_linux_arp_lookup, \
2293 netdev_linux_update_flags, \
2295 netdev_linux_change_seq \
2298 const struct netdev_class netdev_linux_class =
2301 netdev_linux_create,
2302 netdev_linux_enumerate,
2303 NULL); /* set_stats */
2305 const struct netdev_class netdev_tap_class =
2308 netdev_linux_create_tap,
2309 NULL, /* enumerate */
2310 NULL); /* set_stats */
2312 const struct netdev_class netdev_internal_class =
2315 netdev_linux_create,
2316 NULL, /* enumerate */
2317 netdev_vport_set_stats);
2319 /* HTB traffic control class. */
2321 #define HTB_N_QUEUES 0xf000
2325 unsigned int max_rate; /* In bytes/s. */
2329 struct tc_queue tc_queue;
2330 unsigned int min_rate; /* In bytes/s. */
2331 unsigned int max_rate; /* In bytes/s. */
2332 unsigned int burst; /* In bytes. */
2333 unsigned int priority; /* Lower values are higher priorities. */
2337 htb_get__(const struct netdev *netdev)
2339 struct netdev_dev_linux *netdev_dev =
2340 netdev_dev_linux_cast(netdev_get_dev(netdev));
2341 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
2345 htb_install__(struct netdev *netdev, uint64_t max_rate)
2347 struct netdev_dev_linux *netdev_dev =
2348 netdev_dev_linux_cast(netdev_get_dev(netdev));
2351 htb = xmalloc(sizeof *htb);
2352 tc_init(&htb->tc, &tc_ops_htb);
2353 htb->max_rate = max_rate;
2355 netdev_dev->tc = &htb->tc;
2358 /* Create an HTB qdisc.
2360 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2362 htb_setup_qdisc__(struct netdev *netdev)
2365 struct tc_htb_glob opt;
2366 struct ofpbuf request;
2367 struct tcmsg *tcmsg;
2369 tc_del_qdisc(netdev);
2371 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2372 NLM_F_EXCL | NLM_F_CREATE, &request);
2376 tcmsg->tcm_handle = tc_make_handle(1, 0);
2377 tcmsg->tcm_parent = TC_H_ROOT;
2379 nl_msg_put_string(&request, TCA_KIND, "htb");
2381 memset(&opt, 0, sizeof opt);
2382 opt.rate2quantum = 10;
2386 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2387 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2388 nl_msg_end_nested(&request, opt_offset);
2390 return tc_transact(&request, NULL);
2393 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2394 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2396 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2397 unsigned int parent, struct htb_class *class)
2400 struct tc_htb_opt opt;
2401 struct ofpbuf request;
2402 struct tcmsg *tcmsg;
2406 netdev_get_mtu(netdev, &mtu);
2407 if (mtu == INT_MAX) {
2408 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2409 netdev_get_name(netdev));
2413 memset(&opt, 0, sizeof opt);
2414 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2415 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2416 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2417 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2418 opt.prio = class->priority;
2420 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2424 tcmsg->tcm_handle = handle;
2425 tcmsg->tcm_parent = parent;
2427 nl_msg_put_string(&request, TCA_KIND, "htb");
2428 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2429 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2430 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2431 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2432 nl_msg_end_nested(&request, opt_offset);
2434 error = tc_transact(&request, NULL);
2436 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2437 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2438 netdev_get_name(netdev),
2439 tc_get_major(handle), tc_get_minor(handle),
2440 tc_get_major(parent), tc_get_minor(parent),
2441 class->min_rate, class->max_rate,
2442 class->burst, class->priority, strerror(error));
2447 /* Parses Netlink attributes in 'nl_options' for HTB parameters and stores a
2448 * description of them into 'class'. The description complies with the
2449 * specification given in the vswitch database documentation for linux-htb
2452 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2454 static const struct nl_policy tca_htb_policy[] = {
2455 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2456 .min_len = sizeof(struct tc_htb_opt) },
2459 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2460 const struct tc_htb_opt *htb;
2462 if (!nl_parse_nested(nl_options, tca_htb_policy,
2463 attrs, ARRAY_SIZE(tca_htb_policy))) {
2464 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2468 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2469 class->min_rate = htb->rate.rate;
2470 class->max_rate = htb->ceil.rate;
2471 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2472 class->priority = htb->prio;
2477 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2478 struct htb_class *options,
2479 struct netdev_queue_stats *stats)
2481 struct nlattr *nl_options;
2482 unsigned int handle;
2485 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2486 if (!error && queue_id) {
2487 unsigned int major = tc_get_major(handle);
2488 unsigned int minor = tc_get_minor(handle);
2489 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2490 *queue_id = minor - 1;
2495 if (!error && options) {
2496 error = htb_parse_tca_options__(nl_options, options);
/* Fills in 'hc' from the qdisc-level settings in 'details'.
 *
 * The "max-rate" value in 'details' is parsed as a decimal number and divided
 * by 8 before being stored in 'hc->max_rate' (presumably converting bits/s to
 * the bytes/s used by struct htb_class -- confirm against the database
 * documentation).  If "max-rate" is absent or works out to 0, the rate is
 * instead derived from the current features of 'netdev'.  'hc->min_rate' is
 * set equal to 'hc->max_rate'. */
2502 htb_parse_qdisc_details__(struct netdev *netdev,
2503 const struct shash *details, struct htb_class *hc)
2505 const char *max_rate_s;
2507 max_rate_s = shash_find_data(details, "max-rate");
2508 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2509 if (!hc->max_rate) {
/* No usable "max-rate": fall back to the speed implied by the device's
 * current features. */
2512 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2513 hc->max_rate = netdev_features_to_bps(current) / 8;
2515 hc->min_rate = hc->max_rate;
2521 htb_parse_class_details__(struct netdev *netdev,
2522 const struct shash *details, struct htb_class *hc)
2524 const struct htb *htb = htb_get__(netdev);
2525 const char *min_rate_s = shash_find_data(details, "min-rate");
2526 const char *max_rate_s = shash_find_data(details, "max-rate");
2527 const char *burst_s = shash_find_data(details, "burst");
2528 const char *priority_s = shash_find_data(details, "priority");
2531 netdev_get_mtu(netdev, &mtu);
2532 if (mtu == INT_MAX) {
2533 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2534 netdev_get_name(netdev));
2538 /* HTB requires at least an mtu sized min-rate to send any traffic even
2539 * on uncongested links. */
2540 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2541 hc->min_rate = MAX(hc->min_rate, mtu);
2542 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2545 hc->max_rate = (max_rate_s
2546 ? strtoull(max_rate_s, NULL, 10) / 8
2548 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2549 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2553 * According to hints in the documentation that I've read, it is important
2554 * that 'burst' be at least as big as the largest frame that might be
2555 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2556 * but having it a bit too small is a problem. Since netdev_get_mtu()
2557 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2558 * the MTU. We actually add 64, instead of 14, as a guard against
2559 * additional headers get tacked on somewhere that we're not aware of. */
2560 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2561 hc->burst = MAX(hc->burst, mtu + 64);
2564 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
2570 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2571 unsigned int parent, struct htb_class *options,
2572 struct netdev_queue_stats *stats)
2574 struct ofpbuf *reply;
2577 error = tc_query_class(netdev, handle, parent, &reply);
2579 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2580 ofpbuf_delete(reply);
2586 htb_tc_install(struct netdev *netdev, const struct shash *details)
2590 error = htb_setup_qdisc__(netdev);
2592 struct htb_class hc;
2594 htb_parse_qdisc_details__(netdev, details, &hc);
2595 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2596 tc_make_handle(1, 0), &hc);
2598 htb_install__(netdev, hc.max_rate);
2604 static struct htb_class *
2605 htb_class_cast__(const struct tc_queue *queue)
2607 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2611 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2612 const struct htb_class *hc)
2614 struct htb *htb = htb_get__(netdev);
2615 size_t hash = hash_int(queue_id, 0);
2616 struct tc_queue *queue;
2617 struct htb_class *hcp;
2619 queue = tc_find_queue__(netdev, queue_id, hash);
2621 hcp = htb_class_cast__(queue);
2623 hcp = xmalloc(sizeof *hcp);
2624 queue = &hcp->tc_queue;
2625 queue->queue_id = queue_id;
2626 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2629 hcp->min_rate = hc->min_rate;
2630 hcp->max_rate = hc->max_rate;
2631 hcp->burst = hc->burst;
2632 hcp->priority = hc->priority;
2636 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2639 struct nl_dump dump;
2640 struct htb_class hc;
2642 /* Get qdisc options. */
2644 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2645 htb_install__(netdev, hc.max_rate);
2648 if (!start_queue_dump(netdev, &dump)) {
2651 while (nl_dump_next(&dump, &msg)) {
2652 unsigned int queue_id;
2654 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2655 htb_update_queue__(netdev, queue_id, &hc);
2658 nl_dump_done(&dump);
2664 htb_tc_destroy(struct tc *tc)
2666 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2667 struct htb_class *hc, *next;
2669 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2670 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2678 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2680 const struct htb *htb = htb_get__(netdev);
2681 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
2686 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2688 struct htb_class hc;
2691 htb_parse_qdisc_details__(netdev, details, &hc);
2692 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2693 tc_make_handle(1, 0), &hc);
2695 htb_get__(netdev)->max_rate = hc.max_rate;
2701 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2702 const struct tc_queue *queue, struct shash *details)
2704 const struct htb_class *hc = htb_class_cast__(queue);
2706 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2707 if (hc->min_rate != hc->max_rate) {
2708 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2710 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2712 shash_add(details, "priority", xasprintf("%u", hc->priority));
2718 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2719 const struct shash *details)
2721 struct htb_class hc;
2724 error = htb_parse_class_details__(netdev, details, &hc);
2729 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2730 tc_make_handle(1, 0xfffe), &hc);
2735 htb_update_queue__(netdev, queue_id, &hc);
2740 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2742 struct htb_class *hc = htb_class_cast__(queue);
2743 struct htb *htb = htb_get__(netdev);
2746 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2748 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2755 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2756 struct netdev_queue_stats *stats)
2758 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2759 tc_make_handle(1, 0xfffe), NULL, stats);
2763 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2764 const struct ofpbuf *nlmsg,
2765 netdev_dump_queue_stats_cb *cb, void *aux)
2767 struct netdev_queue_stats stats;
2768 unsigned int handle, major, minor;
2771 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2776 major = tc_get_major(handle);
2777 minor = tc_get_minor(handle);
2778 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2779 (*cb)(minor - 1, &stats, aux);
2784 static const struct tc_ops tc_ops_htb = {
2785 "htb", /* linux_name */
2786 "linux-htb", /* ovs_name */
2787 HTB_N_QUEUES, /* n_queues */
2796 htb_class_get_stats,
2797 htb_class_dump_stats
2800 /* "linux-hfsc" traffic control class. */
2802 #define HFSC_N_QUEUES 0xf000
2810 struct tc_queue tc_queue;
2815 static struct hfsc *
2816 hfsc_get__(const struct netdev *netdev)
2818 struct netdev_dev_linux *netdev_dev;
2819 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2820 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
2823 static struct hfsc_class *
2824 hfsc_class_cast__(const struct tc_queue *queue)
2826 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
2830 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2832 struct netdev_dev_linux * netdev_dev;
2835 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2836 hfsc = xmalloc(sizeof *hfsc);
2837 tc_init(&hfsc->tc, &tc_ops_hfsc);
2838 hfsc->max_rate = max_rate;
2839 netdev_dev->tc = &hfsc->tc;
2843 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2844 const struct hfsc_class *hc)
2848 struct hfsc_class *hcp;
2849 struct tc_queue *queue;
2851 hfsc = hfsc_get__(netdev);
2852 hash = hash_int(queue_id, 0);
2854 queue = tc_find_queue__(netdev, queue_id, hash);
2856 hcp = hfsc_class_cast__(queue);
2858 hcp = xmalloc(sizeof *hcp);
2859 queue = &hcp->tc_queue;
2860 queue->queue_id = queue_id;
2861 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
2864 hcp->min_rate = hc->min_rate;
2865 hcp->max_rate = hc->max_rate;
2869 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2871 const struct tc_service_curve *rsc, *fsc, *usc;
2872 static const struct nl_policy tca_hfsc_policy[] = {
2874 .type = NL_A_UNSPEC,
2876 .min_len = sizeof(struct tc_service_curve),
2879 .type = NL_A_UNSPEC,
2881 .min_len = sizeof(struct tc_service_curve),
2884 .type = NL_A_UNSPEC,
2886 .min_len = sizeof(struct tc_service_curve),
2889 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
2891 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
2892 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
2893 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
2897 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
2898 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
2899 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
2901 if (rsc->m1 != 0 || rsc->d != 0 ||
2902 fsc->m1 != 0 || fsc->d != 0 ||
2903 usc->m1 != 0 || usc->d != 0) {
2904 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2905 "Non-linear service curves are not supported.");
2909 if (rsc->m2 != fsc->m2) {
2910 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2911 "Real-time service curves are not supported ");
2915 if (rsc->m2 > usc->m2) {
2916 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2917 "Min-rate service curve is greater than "
2918 "the max-rate service curve.");
2922 class->min_rate = fsc->m2;
2923 class->max_rate = usc->m2;
2928 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2929 struct hfsc_class *options,
2930 struct netdev_queue_stats *stats)
2933 unsigned int handle;
2934 struct nlattr *nl_options;
2936 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2942 unsigned int major, minor;
2944 major = tc_get_major(handle);
2945 minor = tc_get_minor(handle);
2946 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
2947 *queue_id = minor - 1;
2954 error = hfsc_parse_tca_options__(nl_options, options);
2961 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
2962 unsigned int parent, struct hfsc_class *options,
2963 struct netdev_queue_stats *stats)
2966 struct ofpbuf *reply;
2968 error = tc_query_class(netdev, handle, parent, &reply);
2973 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
2974 ofpbuf_delete(reply);
2979 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
2980 struct hfsc_class *class)
2983 const char *max_rate_s;
2985 max_rate_s = shash_find_data(details, "max-rate");
2986 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2991 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2992 max_rate = netdev_features_to_bps(current) / 8;
2995 class->min_rate = max_rate;
2996 class->max_rate = max_rate;
3000 hfsc_parse_class_details__(struct netdev *netdev,
3001 const struct shash *details,
3002 struct hfsc_class * class)
3004 const struct hfsc *hfsc;
3005 uint32_t min_rate, max_rate;
3006 const char *min_rate_s, *max_rate_s;
3008 hfsc = hfsc_get__(netdev);
3009 min_rate_s = shash_find_data(details, "min-rate");
3010 max_rate_s = shash_find_data(details, "max-rate");
3012 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3013 min_rate = MAX(min_rate, 1);
3014 min_rate = MIN(min_rate, hfsc->max_rate);
3016 max_rate = (max_rate_s
3017 ? strtoull(max_rate_s, NULL, 10) / 8
3019 max_rate = MAX(max_rate, min_rate);
3020 max_rate = MIN(max_rate, hfsc->max_rate);
3022 class->min_rate = min_rate;
3023 class->max_rate = max_rate;
3028 /* Create an HFSC qdisc.
3030 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3032 hfsc_setup_qdisc__(struct netdev * netdev)
3034 struct tcmsg *tcmsg;
3035 struct ofpbuf request;
3036 struct tc_hfsc_qopt opt;
3038 tc_del_qdisc(netdev);
3040 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3041 NLM_F_EXCL | NLM_F_CREATE, &request);
3047 tcmsg->tcm_handle = tc_make_handle(1, 0);
3048 tcmsg->tcm_parent = TC_H_ROOT;
3050 memset(&opt, 0, sizeof opt);
3053 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3054 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3056 return tc_transact(&request, NULL);
3059 /* Create an HFSC class.
3061 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3062 * sc rate <min_rate> ul rate <max_rate>" */
3064 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3065 unsigned int parent, struct hfsc_class *class)
3069 struct tcmsg *tcmsg;
3070 struct ofpbuf request;
3071 struct tc_service_curve min, max;
3073 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3079 tcmsg->tcm_handle = handle;
3080 tcmsg->tcm_parent = parent;
3084 min.m2 = class->min_rate;
3088 max.m2 = class->max_rate;
3090 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3091 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3092 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3093 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3094 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3095 nl_msg_end_nested(&request, opt_offset);
3097 error = tc_transact(&request, NULL);
3099 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3100 "min-rate %ubps, max-rate %ubps (%s)",
3101 netdev_get_name(netdev),
3102 tc_get_major(handle), tc_get_minor(handle),
3103 tc_get_major(parent), tc_get_minor(parent),
3104 class->min_rate, class->max_rate, strerror(error));
3111 hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3114 struct hfsc_class class;
3116 error = hfsc_setup_qdisc__(netdev);
3122 hfsc_parse_qdisc_details__(netdev, details, &class);
3123 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3124 tc_make_handle(1, 0), &class);
3130 hfsc_install__(netdev, class.max_rate);
3135 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3138 struct nl_dump dump;
3139 struct hfsc_class hc;
3142 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3143 hfsc_install__(netdev, hc.max_rate);
3145 if (!start_queue_dump(netdev, &dump)) {
3149 while (nl_dump_next(&dump, &msg)) {
3150 unsigned int queue_id;
3152 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3153 hfsc_update_queue__(netdev, queue_id, &hc);
3157 nl_dump_done(&dump);
3162 hfsc_tc_destroy(struct tc *tc)
3165 struct hfsc_class *hc, *next;
3167 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3169 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3170 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3179 hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3181 const struct hfsc *hfsc;
3182 hfsc = hfsc_get__(netdev);
3183 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
3188 hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3191 struct hfsc_class class;
3193 hfsc_parse_qdisc_details__(netdev, details, &class);
3194 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3195 tc_make_handle(1, 0), &class);
3198 hfsc_get__(netdev)->max_rate = class.max_rate;
3205 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3206 const struct tc_queue *queue, struct shash *details)
3208 const struct hfsc_class *hc;
3210 hc = hfsc_class_cast__(queue);
3211 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3212 if (hc->min_rate != hc->max_rate) {
3213 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
3219 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3220 const struct shash *details)
3223 struct hfsc_class class;
3225 error = hfsc_parse_class_details__(netdev, details, &class);
3230 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3231 tc_make_handle(1, 0xfffe), &class);
3236 hfsc_update_queue__(netdev, queue_id, &class);
3241 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3245 struct hfsc_class *hc;
3247 hc = hfsc_class_cast__(queue);
3248 hfsc = hfsc_get__(netdev);
3250 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3252 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3259 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3260 struct netdev_queue_stats *stats)
3262 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3263 tc_make_handle(1, 0xfffe), NULL, stats);
3267 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3268 const struct ofpbuf *nlmsg,
3269 netdev_dump_queue_stats_cb *cb, void *aux)
3271 struct netdev_queue_stats stats;
3272 unsigned int handle, major, minor;
3275 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3280 major = tc_get_major(handle);
3281 minor = tc_get_minor(handle);
3282 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3283 (*cb)(minor - 1, &stats, aux);
3288 static const struct tc_ops tc_ops_hfsc = {
3289 "hfsc", /* linux_name */
3290 "linux-hfsc", /* ovs_name */
3291 HFSC_N_QUEUES, /* n_queues */
3292 hfsc_tc_install, /* tc_install */
3293 hfsc_tc_load, /* tc_load */
3294 hfsc_tc_destroy, /* tc_destroy */
3295 hfsc_qdisc_get, /* qdisc_get */
3296 hfsc_qdisc_set, /* qdisc_set */
3297 hfsc_class_get, /* class_get */
3298 hfsc_class_set, /* class_set */
3299 hfsc_class_delete, /* class_delete */
3300 hfsc_class_get_stats, /* class_get_stats */
3301 hfsc_class_dump_stats /* class_dump_stats */
3304 /* "linux-default" traffic control class.
3306 * This class represents the default, unnamed Linux qdisc. It corresponds to
3307 * the "" (empty string) QoS type in the OVS database. */
3310 default_install__(struct netdev *netdev)
3312 struct netdev_dev_linux *netdev_dev =
3313 netdev_dev_linux_cast(netdev_get_dev(netdev));
3314 static struct tc *tc;
3317 tc = xmalloc(sizeof *tc);
3318 tc_init(tc, &tc_ops_default);
3320 netdev_dev->tc = tc;
3324 default_tc_install(struct netdev *netdev,
3325 const struct shash *details OVS_UNUSED)
3327 default_install__(netdev);
3332 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3334 default_install__(netdev);
3338 static const struct tc_ops tc_ops_default = {
3339 NULL, /* linux_name */
3344 NULL, /* tc_destroy */
3345 NULL, /* qdisc_get */
3346 NULL, /* qdisc_set */
3347 NULL, /* class_get */
3348 NULL, /* class_set */
3349 NULL, /* class_delete */
3350 NULL, /* class_get_stats */
3351 NULL /* class_dump_stats */
3354 /* "linux-other" traffic control class.
3359 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3361 struct netdev_dev_linux *netdev_dev =
3362 netdev_dev_linux_cast(netdev_get_dev(netdev));
3363 static struct tc *tc;
3366 tc = xmalloc(sizeof *tc);
3367 tc_init(tc, &tc_ops_other);
3369 netdev_dev->tc = tc;
3373 static const struct tc_ops tc_ops_other = {
3374 NULL, /* linux_name */
3375 "linux-other", /* ovs_name */
3377 NULL, /* tc_install */
3379 NULL, /* tc_destroy */
3380 NULL, /* qdisc_get */
3381 NULL, /* qdisc_set */
3382 NULL, /* class_get */
3383 NULL, /* class_set */
3384 NULL, /* class_delete */
3385 NULL, /* class_get_stats */
3386 NULL /* class_dump_stats */
3389 /* Traffic control. */
3391 /* Number of kernel "tc" ticks per second. */
3392 static double ticks_per_s;
/* Number of kernel "jiffies" per second.  This is used for the purpose of
 * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
 * one jiffy's worth of data.
 *
 * There are two possibilities here:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
 *      approximate range of 100 to 1024.  That means that we really need to
 *      make sure that the qdisc can buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
 *      has finely granular timers and there's no need to fudge additional room
 *      for buffers.  (There's no extra effort needed to implement that: the
 *      large 'buffer_hz' is used as a divisor, so practically any number will
 *      come out as 0 in the division.  Small integer results in the case of
 *      really high dividends won't have any real effect anyhow.)
 */
3411 static unsigned int buffer_hz;
/* Returns tc handle 'major':'minor'. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    /* The major number occupies the upper 16 bits of a handle. */
    const unsigned int major_bits = major << 16;

    return TC_H_MAKE(major_bits, minor);
}
/* Returns the major number from 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    const unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
/* Returns the minor number from 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    const unsigned int minor = TC_H_MIN(handle);

    return minor;
}
3434 static struct tcmsg *
3435 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3436 struct ofpbuf *request)
3438 struct tcmsg *tcmsg;
3442 error = get_ifindex(netdev, &ifindex);
3447 ofpbuf_init(request, 512);
3448 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3449 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3450 tcmsg->tcm_family = AF_UNSPEC;
3451 tcmsg->tcm_ifindex = ifindex;
3452 /* Caller should fill in tcmsg->tcm_handle. */
3453 /* Caller should fill in tcmsg->tcm_parent. */
3459 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3461 int error = nl_sock_transact(rtnl_sock, request, replyp);
3462 ofpbuf_uninit(request);
3469 /* The values in psched are not individually very meaningful, but they are
3470 * important. The tables below show some values seen in the wild.
3474 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3475 * (Before that, there are hints that it was 1000000000.)
3477 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3481 * -----------------------------------
3482 * [1] 000c8000 000f4240 000f4240 00000064
3483 * [2] 000003e8 00000400 000f4240 3b9aca00
3484 * [3] 000003e8 00000400 000f4240 3b9aca00
3485 * [4] 000003e8 00000400 000f4240 00000064
3486 * [5] 000003e8 00000040 000f4240 3b9aca00
3487 * [6] 000003e8 00000040 000f4240 000000f9
3489 * a b c d ticks_per_s buffer_hz
3490 * ------- --------- ---------- ------------- ----------- -------------
3491 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3492 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3493 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3494 * [4] 1,000 1,024 1,000,000 100 976,562 100
3495 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3496 * [6] 1,000 64 1,000,000 249 15,625,000 249
3498 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3499 * [2] 2.6.26-1-686-bigmem from Debian lenny
3500 * [3] 2.6.26-2-sparc64 from Debian lenny
3501 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3502 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3503 * [6] 2.6.34 from kernel.org on KVM
3505 static const char fn[] = "/proc/net/psched";
3506 unsigned int a, b, c, d;
3512 stream = fopen(fn, "r");
3514 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3518 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3519 VLOG_WARN("%s: read failed", fn);
3523 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3527 VLOG_WARN("%s: invalid scheduler parameters", fn);
3531 ticks_per_s = (double) a * c / b;
3535 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3538 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3541 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3542 * rate of 'rate' bytes per second. */
3544 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3549 return (rate * ticks) / ticks_per_s;
3552 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3553 * rate of 'rate' bytes per second. */
3555 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3560 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3563 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3564 * a transmission rate of 'rate' bytes per second. */
3566 tc_buffer_per_jiffy(unsigned int rate)
3571 return rate / buffer_hz;
3574 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3575 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3576 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3577 * stores NULL into it if it is absent.
3579 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3582 * Returns 0 if successful, otherwise a positive errno value. */
3584 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3585 struct nlattr **options)
3587 static const struct nl_policy tca_policy[] = {
3588 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3589 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3591 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3593 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3594 tca_policy, ta, ARRAY_SIZE(ta))) {
3595 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3600 *kind = nl_attr_get_string(ta[TCA_KIND]);
3604 *options = ta[TCA_OPTIONS];
3619 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3620 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3621 * into '*options', and its queue statistics into '*stats'. Any of the output
3622 * arguments may be null.
3624 * Returns 0 if successful, otherwise a positive errno value. */
3626 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3627 struct nlattr **options, struct netdev_queue_stats *stats)
3629 static const struct nl_policy tca_policy[] = {
3630 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3631 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3633 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3635 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3636 tca_policy, ta, ARRAY_SIZE(ta))) {
3637 VLOG_WARN_RL(&rl, "failed to parse class message");
3642 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3643 *handlep = tc->tcm_handle;
3647 *options = ta[TCA_OPTIONS];
3651 const struct gnet_stats_queue *gsq;
3652 struct gnet_stats_basic gsb;
3654 static const struct nl_policy stats_policy[] = {
3655 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3656 .min_len = sizeof gsb },
3657 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3658 .min_len = sizeof *gsq },
3660 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3662 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3663 sa, ARRAY_SIZE(sa))) {
3664 VLOG_WARN_RL(&rl, "failed to parse class stats");
3668 /* Alignment issues screw up the length of struct gnet_stats_basic on
3669 * some arch/bitsize combinations. Newer versions of Linux have a
3670 * struct gnet_stats_basic_packed, but we can't depend on that. The
3671 * easiest thing to do is just to make a copy. */
3672 memset(&gsb, 0, sizeof gsb);
3673 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3674 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3675 stats->tx_bytes = gsb.bytes;
3676 stats->tx_packets = gsb.packets;
3678 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3679 stats->tx_errors = gsq->drops;
3689 memset(stats, 0, sizeof *stats);
3694 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3697 tc_query_class(const struct netdev *netdev,
3698 unsigned int handle, unsigned int parent,
3699 struct ofpbuf **replyp)
3701 struct ofpbuf request;
3702 struct tcmsg *tcmsg;
3705 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3709 tcmsg->tcm_handle = handle;
3710 tcmsg->tcm_parent = parent;
3712 error = tc_transact(&request, replyp);
3714 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3715 netdev_get_name(netdev),
3716 tc_get_major(handle), tc_get_minor(handle),
3717 tc_get_major(parent), tc_get_minor(parent),
3723 /* Equivalent to "tc class del dev <name> handle <handle>". */
3725 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3727 struct ofpbuf request;
3728 struct tcmsg *tcmsg;
3731 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3735 tcmsg->tcm_handle = handle;
3736 tcmsg->tcm_parent = 0;
3738 error = tc_transact(&request, NULL);
3740 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3741 netdev_get_name(netdev),
3742 tc_get_major(handle), tc_get_minor(handle),
3748 /* Equivalent to "tc qdisc del dev <name> root". */
3750 tc_del_qdisc(struct netdev *netdev)
3752 struct netdev_dev_linux *netdev_dev =
3753 netdev_dev_linux_cast(netdev_get_dev(netdev));
3754 struct ofpbuf request;
3755 struct tcmsg *tcmsg;
3758 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
3762 tcmsg->tcm_handle = tc_make_handle(1, 0);
3763 tcmsg->tcm_parent = TC_H_ROOT;
3765 error = tc_transact(&request, NULL);
3766 if (error == EINVAL) {
3767 /* EINVAL probably means that the default qdisc was in use, in which
3768 * case we've accomplished our purpose. */
3771 if (!error && netdev_dev->tc) {
3772 if (netdev_dev->tc->ops->tc_destroy) {
3773 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3775 netdev_dev->tc = NULL;
3780 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3781 * kernel to determine what they are. Returns 0 if successful, otherwise a
3782 * positive errno value. */
3784 tc_query_qdisc(const struct netdev *netdev)
3786 struct netdev_dev_linux *netdev_dev =
3787 netdev_dev_linux_cast(netdev_get_dev(netdev));
3788 struct ofpbuf request, *qdisc;
3789 const struct tc_ops *ops;
3790 struct tcmsg *tcmsg;
3794 if (netdev_dev->tc) {
3798 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3799 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3800 * 2.6.35 without that fix backported to it.
3802 * To avoid the OOPS, we must not make a request that would attempt to dump
3803 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3804 * few others. There are a few ways that I can see to do this, but most of
3805 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3806 * technique chosen here is to assume that any non-default qdisc that we
3807 * create will have a class with handle 1:0. The built-in qdiscs only have
3808 * a class with handle 0:0.
3810 * We could check for Linux 2.6.35+ and use a more straightforward method
3812 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
3816 tcmsg->tcm_handle = tc_make_handle(1, 0);
3817 tcmsg->tcm_parent = 0;
3819 /* Figure out what tc class to instantiate. */
3820 error = tc_transact(&request, &qdisc);
3824 error = tc_parse_qdisc(qdisc, &kind, NULL);
3826 ops = &tc_ops_other;
3828 ops = tc_lookup_linux_name(kind);
3830 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3831 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3833 ops = &tc_ops_other;
3836 } else if (error == ENOENT) {
3837 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3838 * other entity that doesn't have a handle 1:0. We will assume
3839 * that it's the system default qdisc. */
3840 ops = &tc_ops_default;
3843 /* Who knows? Maybe the device got deleted. */
3844 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3845 netdev_get_name(netdev), strerror(error));
3846 ops = &tc_ops_other;
3849 /* Instantiate it. */
3850 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
3851 assert((load_error == 0) == (netdev_dev->tc != NULL));
3852 ofpbuf_delete(qdisc);
3854 return error ? error : load_error;
3857 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3858 approximate the time to transmit packets of various lengths. For an MTU of
3859 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3860 represents two possible packet lengths; for a MTU of 513 through 1024, four
3861 possible lengths; and so on.
3863 Returns, for the specified 'mtu', the number of bits that packet lengths
3864 need to be shifted right to fit within such a 256-entry table. */
3866 tc_calc_cell_log(unsigned int mtu)
3871 mtu = ETH_PAYLOAD_MAX;
3873 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
3875 for (cell_log = 0; mtu >= 256; cell_log++) {
3882 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
3885 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
3887 memset(rate, 0, sizeof *rate);
3888 rate->cell_log = tc_calc_cell_log(mtu);
3889 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3890 /* rate->cell_align = 0; */ /* distro headers. */
3891 rate->mpu = ETH_TOTAL_MIN;
3895 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
3896 * attribute of the specified "type".
3898 * See tc_calc_cell_log() above for a description of "rtab"s. */
3900 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
3905 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
3906 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
3907 unsigned packet_size = (i + 1) << rate->cell_log;
3908 if (packet_size < rate->mpu) {
3909 packet_size = rate->mpu;
3911 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.)
 *
 * The burst actually used is never smaller than one jiffy's worth of data at
 * 'Bps' plus one 'mtu'.  The result is expressed in ticks. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t burst = burst_bytes;

    if (min_burst >= burst) {
        burst = min_burst;
    }
    return tc_bytes_to_ticks(Bps, burst);
}
3926 /* Public utility functions. */
/* Copies every statistics field that the netdev and rtnetlink stats
 * structures have in common, one field at a time.
 *
 * The macro takes no arguments: it expects pointers named 'src' and 'dst' to
 * be in scope where it is expanded.  Copying by field name lets the same
 * macro be used for structures whose fields have different widths. */
#define COPY_NETDEV_STATS                                       \
    do {                                                        \
        dst->rx_packets = src->rx_packets;                      \
        dst->tx_packets = src->tx_packets;                      \
        dst->rx_bytes = src->rx_bytes;                          \
        dst->tx_bytes = src->tx_bytes;                          \
        dst->rx_errors = src->rx_errors;                        \
        dst->tx_errors = src->tx_errors;                        \
        dst->rx_dropped = src->rx_dropped;                      \
        dst->tx_dropped = src->tx_dropped;                      \
        dst->multicast = src->multicast;                        \
        dst->collisions = src->collisions;                      \
        dst->rx_length_errors = src->rx_length_errors;          \
        dst->rx_over_errors = src->rx_over_errors;              \
        dst->rx_crc_errors = src->rx_crc_errors;                \
        dst->rx_frame_errors = src->rx_frame_errors;            \
        dst->rx_fifo_errors = src->rx_fifo_errors;              \
        dst->rx_missed_errors = src->rx_missed_errors;          \
        dst->tx_aborted_errors = src->tx_aborted_errors;        \
        dst->tx_carrier_errors = src->tx_carrier_errors;        \
        dst->tx_fifo_errors = src->tx_fifo_errors;              \
        dst->tx_heartbeat_errors = src->tx_heartbeat_errors;    \
        dst->tx_window_errors = src->tx_window_errors;          \
    } while (0)
3951 /* Copies 'src' into 'dst', performing format conversion in the process. */
3953 netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
3954 const struct rtnl_link_stats *src)
3959 /* Copies 'src' into 'dst', performing format conversion in the process. */
3961 netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
3962 const struct rtnl_link_stats64 *src)
3967 /* Copies 'src' into 'dst', performing format conversion in the process. */
3969 netdev_stats_to_rtnl_link_stats64(struct rtnl_link_stats64 *dst,
3970 const struct netdev_stats *src)
3973 dst->rx_compressed = 0;
3974 dst->tx_compressed = 0;
3977 /* Utility functions. */
3980 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
3982 /* Policy for RTNLGRP_LINK messages.
3984 * There are *many* more fields in these messages, but currently we only
3985 * care about these fields. */
3986 static const struct nl_policy rtnlgrp_link_policy[] = {
3987 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
3988 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
3989 .min_len = sizeof(struct rtnl_link_stats) },
3992 struct ofpbuf request;
3993 struct ofpbuf *reply;
3994 struct ifinfomsg *ifi;
3995 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
3998 ofpbuf_init(&request, 0);
3999 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4000 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4001 ifi->ifi_family = PF_UNSPEC;
4002 ifi->ifi_index = ifindex;
4003 error = nl_sock_transact(rtnl_sock, &request, &reply);
4004 ofpbuf_uninit(&request);
4009 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4010 rtnlgrp_link_policy,
4011 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4012 ofpbuf_delete(reply);
4016 if (!attrs[IFLA_STATS]) {
4017 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4018 ofpbuf_delete(reply);
4022 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4024 ofpbuf_delete(reply);
4030 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4032 static const char fn[] = "/proc/net/dev";
4037 stream = fopen(fn, "r");
4039 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4044 while (fgets(line, sizeof line, stream)) {
4047 #define X64 "%"SCNu64
4050 X64 X64 X64 X64 X64 X64 X64 "%*u"
4051 X64 X64 X64 X64 X64 X64 X64 "%*u",
4057 &stats->rx_fifo_errors,
4058 &stats->rx_frame_errors,
4064 &stats->tx_fifo_errors,
4066 &stats->tx_carrier_errors) != 15) {
4067 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4068 } else if (!strcmp(devname, netdev_name)) {
4069 stats->rx_length_errors = UINT64_MAX;
4070 stats->rx_over_errors = UINT64_MAX;
4071 stats->rx_crc_errors = UINT64_MAX;
4072 stats->rx_missed_errors = UINT64_MAX;
4073 stats->tx_aborted_errors = UINT64_MAX;
4074 stats->tx_heartbeat_errors = UINT64_MAX;
4075 stats->tx_window_errors = UINT64_MAX;
4081 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
4087 get_flags(const struct netdev *netdev, int *flags)
4092 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
4094 *flags = ifr.ifr_flags;
4099 set_flags(struct netdev *netdev, int flags)
4103 ifr.ifr_flags = flags;
4104 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
4109 do_get_ifindex(const char *netdev_name)
4113 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4114 COVERAGE_INC(netdev_get_ifindex);
4115 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4116 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4117 netdev_name, strerror(errno));
4120 return ifr.ifr_ifindex;
4124 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4126 struct netdev_dev_linux *netdev_dev =
4127 netdev_dev_linux_cast(netdev_get_dev(netdev_));
4129 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4130 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4134 netdev_dev->cache_valid |= VALID_IFINDEX;
4135 netdev_dev->ifindex = ifindex;
4137 *ifindexp = netdev_dev->ifindex;
4142 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4147 memset(&ifr, 0, sizeof ifr);
4148 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4149 COVERAGE_INC(netdev_get_hwaddr);
4150 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4151 /* ENODEV probably means that a vif disappeared asynchronously and
4152 * hasn't been removed from the database yet, so reduce the log level
4153 * to INFO for that case. */
4154 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4155 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4156 netdev_name, strerror(errno));
4159 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4160 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4161 VLOG_WARN("%s device has unknown hardware address family %d",
4162 netdev_name, hwaddr_family);
4164 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4169 set_etheraddr(const char *netdev_name, int hwaddr_family,
4170 const uint8_t mac[ETH_ADDR_LEN])
4174 memset(&ifr, 0, sizeof ifr);
4175 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4176 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4177 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4178 COVERAGE_INC(netdev_set_hwaddr);
4179 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4180 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4181 netdev_name, strerror(errno));
4188 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4189 int cmd, const char *cmd_name)
4193 memset(&ifr, 0, sizeof ifr);
4194 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4195 ifr.ifr_data = (caddr_t) ecmd;
4198 COVERAGE_INC(netdev_ethtool);
4199 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4202 if (errno != EOPNOTSUPP) {
4203 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4204 "failed: %s", cmd_name, name, strerror(errno));
4206 /* The device doesn't support this operation. That's pretty
4207 * common, so there's no point in logging anything. */
4214 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4215 const char *cmd_name)
4217 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4218 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4219 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4227 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4228 int cmd, const char *cmd_name)
4233 ifr.ifr_addr.sa_family = AF_INET;
4234 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4236 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4237 *ip = sin->sin_addr;
4242 /* Returns an AF_PACKET raw socket or a negative errno value. */
4244 af_packet_sock(void)
4246 static int sock = INT_MIN;
4248 if (sock == INT_MIN) {
4249 sock = socket(AF_PACKET, SOCK_RAW, 0);
4251 set_nonblocking(sock);
4254 VLOG_ERR("failed to create packet socket: %s", strerror(errno));