2 * Copyright (c) 2009, 2010 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
21 #include <arpa/inet.h>
23 #include <linux/gen_stats.h>
24 #include <linux/if_tun.h>
26 #include <linux/types.h>
27 #include <linux/ethtool.h>
28 #include <linux/pkt_sched.h>
29 #include <linux/rtnetlink.h>
30 #include <linux/sockios.h>
31 #include <linux/version.h>
32 #include <sys/types.h>
33 #include <sys/ioctl.h>
34 #include <sys/socket.h>
35 #include <netpacket/packet.h>
36 #include <net/ethernet.h>
38 #include <linux/if_tunnel.h>
39 #include <net/if_arp.h>
40 #include <net/if_packet.h>
41 #include <net/route.h>
42 #include <netinet/in.h>
49 #include "dynamic-string.h"
50 #include "fatal-signal.h"
53 #include "netdev-provider.h"
54 #include "netdev-vport.h"
57 #include "openflow/openflow.h"
59 #include "poll-loop.h"
60 #include "rtnetlink.h"
61 #include "socket-util.h"
66 VLOG_DEFINE_THIS_MODULE(netdev_linux)
68 /* These were introduced in Linux 2.6.14, so they might be missing if we have
70 #ifndef ADVERTISED_Pause
71 #define ADVERTISED_Pause (1 << 13)
73 #ifndef ADVERTISED_Asym_Pause
74 #define ADVERTISED_Asym_Pause (1 << 14)
77 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
80 #define TC_RTAB_SIZE 1024
83 static struct rtnetlink_notifier netdev_linux_cache_notifier;
84 static int cache_notifier_refcount;
87 VALID_IFINDEX = 1 << 0,
88 VALID_ETHERADDR = 1 << 1,
92 VALID_CARRIER = 1 << 5,
93 VALID_IS_PSEUDO = 1 << 6, /* Represents is_internal and is_tap. */
94 VALID_POLICING = 1 << 7,
95 VALID_HAVE_VPORT_STATS = 1 << 8
103 /* Traffic control. */
105 /* An instance of a traffic control class. Always associated with a particular
108 * Each TC implementation subclasses this with whatever additional data it
111 const struct tc_ops *ops;
112 struct hmap queues; /* Contains "struct tc_queue"s.
113 * Read by generic TC layer.
114 * Written only by TC implementation. */
117 /* One traffic control queue.
119 * Each TC implementation subclasses this with whatever additional data it
122 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
123 unsigned int queue_id; /* OpenFlow queue ID. */
126 /* A particular kind of traffic control. Each implementation generally maps to
127 * one particular Linux qdisc class.
129 * The functions below return 0 if successful or a positive errno value on
130 * failure, except where otherwise noted. All of them must be provided, except
131 * where otherwise noted. */
133 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
134 * This is null for tc_ops_default and tc_ops_other, for which there are no
135 * appropriate values. */
136 const char *linux_name;
138 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
139 const char *ovs_name;
141 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
142 * queues. The queues are numbered 0 through n_queues - 1. */
143 unsigned int n_queues;
145 /* Called to install this TC class on 'netdev'. The implementation should
146 * make the Netlink calls required to set up 'netdev' with the right qdisc
147 * and configure it according to 'details'. The implementation may assume
148 * that the current qdisc is the default; that is, there is no need for it
149 * to delete the current qdisc before installing itself.
151 * The contents of 'details' should be documented as valid for 'ovs_name'
152 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
153 * (which is built as ovs-vswitchd.conf.db(8)).
155 * This function must return 0 if and only if it sets 'netdev->tc' to an
156 * initialized 'struct tc'.
158 * (This function is null for tc_ops_other, which cannot be installed. For
159 * other TC classes it should always be nonnull.) */
160 int (*tc_install)(struct netdev *netdev, const struct shash *details);
162 /* Called when the netdev code determines (through a Netlink query) that
163 * this TC class's qdisc is installed on 'netdev', but we didn't install
164 * it ourselves and so don't know any of the details.
166 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
167 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
168 * implementation should parse the other attributes of 'nlmsg' as
169 * necessary to determine its configuration. If necessary it should also
170 * use Netlink queries to determine the configuration of queues on
173 * This function must return 0 if and only if it sets 'netdev->tc' to an
174 * initialized 'struct tc'. */
175 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
177 /* Destroys the data structures allocated by the implementation as part of
178 * 'tc'. (This includes destroying 'tc->queues' by calling
181 * The implementation should not need to perform any Netlink calls. If
182 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
183 * (But it may not be desirable.)
185 * This function may be null if 'tc' is trivial. */
186 void (*tc_destroy)(struct tc *tc);
188 /* Retrieves details of 'netdev->tc' configuration into 'details'.
190 * The implementation should not need to perform any Netlink calls, because
191 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
192 * cached the configuration.
194 * The contents of 'details' should be documented as valid for 'ovs_name'
195 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
196 * (which is built as ovs-vswitchd.conf.db(8)).
198 * This function may be null if 'tc' is not configurable.
200 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
202 /* Reconfigures 'netdev->tc' according to 'details', performing any
203 * required Netlink calls to complete the reconfiguration.
205 * The contents of 'details' should be documented as valid for 'ovs_name'
206 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
207 * (which is built as ovs-vswitchd.conf.db(8)).
209 * This function may be null if 'tc' is not configurable.
211 int (*qdisc_set)(struct netdev *, const struct shash *details);
213 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
214 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
216 * The contents of 'details' should be documented as valid for 'ovs_name'
217 * in the "other_config" column in the "Queue" table in
218 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
220 * The implementation should not need to perform any Netlink calls, because
221 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
222 * cached the queue configuration.
224 * This function may be null if 'tc' does not have queues ('n_queues' is
226 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
227 struct shash *details);
229 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
230 * 'details', performing any required Netlink calls to complete the
231 * reconfiguration. The caller ensures that 'queue_id' is less than
234 * The contents of 'details' should be documented as valid for 'ovs_name'
235 * in the "other_config" column in the "Queue" table in
236 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
238 * This function may be null if 'tc' does not have queues or its queues are
239 * not configurable. */
240 int (*class_set)(struct netdev *, unsigned int queue_id,
241 const struct shash *details);
243 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
244 * tc_queue's within 'netdev->tc->queues'.
246 * This function may be null if 'tc' does not have queues or its queues
247 * cannot be deleted. */
248 int (*class_delete)(struct netdev *, struct tc_queue *queue);
250 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
251 * 'struct tc_queue's within 'netdev->tc->queues'.
253 * On success, initializes '*stats'.
255 * This function may be null if 'tc' does not have queues or if it cannot
256 * report queue statistics. */
257 int (*class_get_stats)(const struct netdev *netdev,
258 const struct tc_queue *queue,
259 struct netdev_queue_stats *stats);
261 /* Extracts queue stats from 'nlmsg', which is a response to a
262 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
264 * This function may be null if 'tc' does not have queues or if it cannot
265 * report queue statistics. */
266 int (*class_dump_stats)(const struct netdev *netdev,
267 const struct ofpbuf *nlmsg,
268 netdev_dump_queue_stats_cb *cb, void *aux);
272 tc_init(struct tc *tc, const struct tc_ops *ops)
275 hmap_init(&tc->queues);
279 tc_destroy(struct tc *tc)
281 hmap_destroy(&tc->queues);
284 static const struct tc_ops tc_ops_htb;
285 static const struct tc_ops tc_ops_default;
286 static const struct tc_ops tc_ops_other;
288 static const struct tc_ops *tcs[] = {
289 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
290 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
291 &tc_ops_other, /* Some other qdisc. */
295 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
296 static unsigned int tc_get_major(unsigned int handle);
297 static unsigned int tc_get_minor(unsigned int handle);
299 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
300 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
301 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
303 static struct tcmsg *tc_make_request(const struct netdev *, int type,
304 unsigned int flags, struct ofpbuf *);
305 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
307 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
308 struct nlattr **options);
309 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
310 struct nlattr **options,
311 struct netdev_queue_stats *);
312 static int tc_query_class(const struct netdev *,
313 unsigned int handle, unsigned int parent,
314 struct ofpbuf **replyp);
315 static int tc_delete_class(const struct netdev *, unsigned int handle);
317 static int tc_del_qdisc(struct netdev *netdev);
318 static int tc_query_qdisc(const struct netdev *netdev);
320 static int tc_calc_cell_log(unsigned int mtu);
321 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
322 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
323 const struct tc_ratespec *rate);
324 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
326 struct netdev_dev_linux {
327 struct netdev_dev netdev_dev;
329 struct shash_node *shash_node;
330 unsigned int cache_valid;
332 /* The following are figured out "on demand" only. They are only valid
333 * when the corresponding VALID_* bit in 'cache_valid' is set. */
335 uint8_t etheraddr[ETH_ADDR_LEN];
336 struct in_addr address, netmask;
340 bool is_internal; /* Is this an openvswitch internal device? */
341 bool is_tap; /* Is this a tuntap device? */
342 uint32_t kbits_rate; /* Policing data. */
343 uint32_t kbits_burst;
344 bool have_vport_stats;
348 struct tap_state tap;
352 struct netdev_linux {
353 struct netdev netdev;
357 /* An AF_INET socket (used for ioctl operations). */
358 static int af_inet_sock = -1;
360 /* A Netlink routing socket that is not subscribed to any multicast groups. */
361 static struct nl_sock *rtnl_sock;
363 struct netdev_linux_notifier {
364 struct netdev_notifier notifier;
368 static struct shash netdev_linux_notifiers =
369 SHASH_INITIALIZER(&netdev_linux_notifiers);
370 static struct rtnetlink_notifier netdev_linux_poll_notifier;
372 /* This is set pretty low because we probably won't learn anything from the
373 * additional log messages. */
374 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
376 static int netdev_linux_init(void);
378 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
379 int cmd, const char *cmd_name);
380 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
381 const char *cmd_name);
382 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
383 int cmd, const char *cmd_name);
384 static int get_flags(const struct netdev *, int *flagsp);
385 static int set_flags(struct netdev *, int flags);
386 static int do_get_ifindex(const char *netdev_name);
387 static int get_ifindex(const struct netdev *, int *ifindexp);
388 static int do_set_addr(struct netdev *netdev,
389 int ioctl_nr, const char *ioctl_name,
390 struct in_addr addr);
391 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
392 static int set_etheraddr(const char *netdev_name, int hwaddr_family,
393 const uint8_t[ETH_ADDR_LEN]);
394 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
395 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
398 is_netdev_linux_class(const struct netdev_class *netdev_class)
400 return netdev_class->init == netdev_linux_init;
403 static struct netdev_dev_linux *
404 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
406 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
407 assert(is_netdev_linux_class(netdev_class));
409 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
412 static struct netdev_linux *
413 netdev_linux_cast(const struct netdev *netdev)
415 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
416 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
417 assert(is_netdev_linux_class(netdev_class));
419 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
423 netdev_linux_init(void)
425 static int status = -1;
427 /* Create AF_INET socket. */
428 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
429 status = af_inet_sock >= 0 ? 0 : errno;
431 VLOG_ERR("failed to create inet socket: %s", strerror(status));
434 /* Create rtnetlink socket. */
436 status = nl_sock_create(NETLINK_ROUTE, 0, 0, 0, &rtnl_sock);
438 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
447 netdev_linux_run(void)
449 rtnetlink_notifier_run();
453 netdev_linux_wait(void)
455 rtnetlink_notifier_wait();
459 netdev_linux_cache_cb(const struct rtnetlink_change *change,
460 void *aux OVS_UNUSED)
462 struct netdev_dev_linux *dev;
464 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
466 const struct netdev_class *netdev_class =
467 netdev_dev_get_class(base_dev);
469 if (is_netdev_linux_class(netdev_class)) {
470 dev = netdev_dev_linux_cast(base_dev);
471 dev->cache_valid = 0;
475 struct shash device_shash;
476 struct shash_node *node;
478 shash_init(&device_shash);
479 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
480 SHASH_FOR_EACH (node, &device_shash) {
482 dev->cache_valid = 0;
484 shash_destroy(&device_shash);
488 /* Creates the netdev device of 'type' with 'name'. */
490 netdev_linux_create_system(const struct netdev_class *class OVS_UNUSED,
491 const char *name, const struct shash *args,
492 struct netdev_dev **netdev_devp)
494 struct netdev_dev_linux *netdev_dev;
497 if (!shash_is_empty(args)) {
498 VLOG_WARN("%s: arguments for system devices should be empty", name);
501 if (!cache_notifier_refcount) {
502 error = rtnetlink_notifier_register(&netdev_linux_cache_notifier,
503 netdev_linux_cache_cb, NULL);
508 cache_notifier_refcount++;
510 netdev_dev = xzalloc(sizeof *netdev_dev);
511 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_linux_class);
513 *netdev_devp = &netdev_dev->netdev_dev;
517 /* For most types of netdevs we open the device for each call of
518 * netdev_open(). However, this is not the case with tap devices,
519 * since it is only possible to open the device once. In this
520 * situation we share a single file descriptor, and consequently
521 * buffers, across all readers. Therefore once data is read it will
522 * be unavailable to other reads for tap devices. */
524 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
525 const char *name, const struct shash *args,
526 struct netdev_dev **netdev_devp)
528 struct netdev_dev_linux *netdev_dev;
529 struct tap_state *state;
530 static const char tap_dev[] = "/dev/net/tun";
534 if (!shash_is_empty(args)) {
535 VLOG_WARN("%s: arguments for TAP devices should be empty", name);
538 netdev_dev = xzalloc(sizeof *netdev_dev);
539 state = &netdev_dev->state.tap;
541 /* Open tap device. */
542 state->fd = open(tap_dev, O_RDWR);
545 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
549 /* Create tap device. */
550 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
551 strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
552 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
553 VLOG_WARN("%s: creating tap device failed: %s", name,
559 /* Make non-blocking. */
560 error = set_nonblocking(state->fd);
565 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
566 *netdev_devp = &netdev_dev->netdev_dev;
575 destroy_tap(struct netdev_dev_linux *netdev_dev)
577 struct tap_state *state = &netdev_dev->state.tap;
579 if (state->fd >= 0) {
584 /* Destroys the netdev device 'netdev_dev_'. */
586 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
588 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
589 const char *type = netdev_dev_get_type(netdev_dev_);
591 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
592 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
595 if (!strcmp(type, "system")) {
596 cache_notifier_refcount--;
598 if (!cache_notifier_refcount) {
599 rtnetlink_notifier_unregister(&netdev_linux_cache_notifier);
601 } else if (!strcmp(type, "tap")) {
602 destroy_tap(netdev_dev);
609 netdev_linux_open(struct netdev_dev *netdev_dev_, int ethertype,
610 struct netdev **netdevp)
612 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
613 struct netdev_linux *netdev;
614 enum netdev_flags flags;
617 /* Allocate network device. */
618 netdev = xzalloc(sizeof *netdev);
620 netdev_init(&netdev->netdev, netdev_dev_);
622 error = netdev_get_flags(&netdev->netdev, &flags);
623 if (error == ENODEV) {
627 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
628 !netdev_dev->state.tap.opened) {
630 /* We assume that the first user of the tap device is the primary user
631 * and give them the tap FD. Subsequent users probably just expect
632 * this to be a system device so open it normally to avoid send/receive
633 * directions appearing to be reversed. */
634 netdev->fd = netdev_dev->state.tap.fd;
635 netdev_dev->state.tap.opened = true;
636 } else if (ethertype != NETDEV_ETH_TYPE_NONE) {
637 struct sockaddr_ll sll;
641 /* Create file descriptor. */
642 protocol = (ethertype == NETDEV_ETH_TYPE_ANY ? ETH_P_ALL
643 : ethertype == NETDEV_ETH_TYPE_802_2 ? ETH_P_802_2
645 netdev->fd = socket(PF_PACKET, SOCK_RAW, htons(protocol));
646 if (netdev->fd < 0) {
651 /* Set non-blocking mode. */
652 error = set_nonblocking(netdev->fd);
657 /* Get ethernet device index. */
658 error = get_ifindex(&netdev->netdev, &ifindex);
663 /* Bind to specific ethernet device. */
664 memset(&sll, 0, sizeof sll);
665 sll.sll_family = AF_PACKET;
666 sll.sll_ifindex = ifindex;
668 (struct sockaddr *) &sll, sizeof sll) < 0) {
670 VLOG_ERR("bind to %s failed: %s", netdev_dev_get_name(netdev_dev_),
675 /* Between the socket() and bind() calls above, the socket receives all
676 * packets of the requested type on all system interfaces. We do not
677 * want to receive that data, but there is no way to avoid it. So we
678 * must now drain out the receive queue. */
679 error = drain_rcvbuf(netdev->fd);
685 *netdevp = &netdev->netdev;
689 netdev_uninit(&netdev->netdev, true);
693 /* Closes and destroys 'netdev'. */
695 netdev_linux_close(struct netdev *netdev_)
697 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* Descriptor 0 is a valid fd too; everywhere else in this file 'fd < 0' is
 * what means "no descriptor".  Tap netdevs are excluded because they may
 * hold the fd shared from the device's tap state (see netdev_linux_open()). */
699 if (netdev->fd >= 0 && strcmp(netdev_get_type(netdev_), "tap")) {
705 /* Initializes 'svec' with a list of the names of all known network devices. */
707 netdev_linux_enumerate(struct svec *svec)
709 struct if_nameindex *names;
711 names = if_nameindex();
715 for (i = 0; names[i].if_name != NULL; i++) {
716 svec_add(svec, names[i].if_name);
718 if_freenameindex(names);
721 VLOG_WARN("could not obtain list of network device names: %s",
/* Receives a packet from 'netdev_' by calling read() on its file descriptor,
 * storing up to 'size' bytes in 'data'.  Read errors other than EINTR and
 * EAGAIN are logged (rate-limited) together with the device name. */
728 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
730 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
732 if (netdev->fd < 0) {
733 /* Device was opened with NETDEV_ETH_TYPE_NONE. */
738 ssize_t retval = read(netdev->fd, data, size);
741 } else if (errno != EINTR) {
742 if (errno != EAGAIN) {
/* Arguments must follow the format string: device name first, then the
 * error text (same order as the warning in netdev_linux_send()). */
743 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
744 netdev_get_name(netdev_), strerror(errno));
751 /* Registers with the poll loop to wake up from the next call to poll_block()
752 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
754 netdev_linux_recv_wait(struct netdev *netdev_)
756 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
757 if (netdev->fd >= 0) {
758 poll_fd_wait(netdev->fd, POLLIN);
762 /* Discards all packets waiting to be received from 'netdev'. */
764 netdev_linux_drain(struct netdev *netdev_)
766 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
767 if (netdev->fd < 0) {
769 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
771 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
772 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
776 drain_fd(netdev->fd, ifr.ifr_qlen);
779 return drain_rcvbuf(netdev->fd);
783 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
784 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
785 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
786 * the packet is too big or too small to transmit on the device.
788 * The caller retains ownership of 'buffer' in all cases.
790 * The kernel maintains a packet transmission queue, so the caller is not
791 * expected to do additional queuing of packets. */
793 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
795 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
797 /* XXX should support sending even if 'ethertype' was NETDEV_ETH_TYPE_NONE.
799 if (netdev->fd < 0) {
804 ssize_t retval = write(netdev->fd, data, size);
806 /* The Linux AF_PACKET implementation never blocks waiting for room
807 * for packets, instead returning ENOBUFS. Translate this into
808 * EAGAIN for the caller. */
809 if (errno == ENOBUFS) {
811 } else if (errno == EINTR) {
813 } else if (errno != EAGAIN) {
814 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
815 netdev_get_name(netdev_), strerror(errno));
818 } else if (retval != size) {
819 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
820 "%zu) on %s", retval, size, netdev_get_name(netdev_));
828 /* Registers with the poll loop to wake up from the next call to poll_block()
829 * when the packet transmission queue has sufficient room to transmit a packet
830 * with netdev_send().
832 * The kernel maintains a packet transmission queue, so the client is not
833 * expected to do additional queuing of packets. Thus, this function is
834 * unlikely to ever be used. It is included for completeness. */
836 netdev_linux_send_wait(struct netdev *netdev_)
838 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
839 if (netdev->fd < 0) {
841 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
842 poll_fd_wait(netdev->fd, POLLOUT);
844 /* TAP device always accepts packets.*/
845 poll_immediate_wake();
849 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
850 * otherwise a positive errno value. */
852 netdev_linux_set_etheraddr(struct netdev *netdev_,
853 const uint8_t mac[ETH_ADDR_LEN])
855 struct netdev_dev_linux *netdev_dev =
856 netdev_dev_linux_cast(netdev_get_dev(netdev_));
859 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
860 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
861 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
863 netdev_dev->cache_valid |= VALID_ETHERADDR;
864 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
872 /* Copies 'netdev''s MAC address into 'mac', first fetching it with
873 * get_etheraddr() if no valid copy is cached. */
875 netdev_linux_get_etheraddr(const struct netdev *netdev_,
876 uint8_t mac[ETH_ADDR_LEN])
878 struct netdev_dev_linux *netdev_dev =
879 netdev_dev_linux_cast(netdev_get_dev(netdev_));
880 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
881 int error = get_etheraddr(netdev_get_name(netdev_),
882 netdev_dev->etheraddr);
886 netdev_dev->cache_valid |= VALID_ETHERADDR;
888 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
892 /* Stores in '*mtup' the maximum size of transmitted (and received) packets
893 * on 'netdev', in bytes, not including the hardware header; thus, this is
894 * typically 1500 bytes for Ethernet devices. */
896 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
898 struct netdev_dev_linux *netdev_dev =
899 netdev_dev_linux_cast(netdev_get_dev(netdev_));
900 if (!(netdev_dev->cache_valid & VALID_MTU)) {
904 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
905 SIOCGIFMTU, "SIOCGIFMTU");
909 netdev_dev->mtu = ifr.ifr_mtu;
910 netdev_dev->cache_valid |= VALID_MTU;
912 *mtup = netdev_dev->mtu;
916 /* Returns the ifindex of 'netdev', if successful, as a positive number.
917 * On failure, returns a negative errno value. */
919 netdev_linux_get_ifindex(const struct netdev *netdev)
923 error = get_ifindex(netdev, &ifindex);
924 return error ? -error : ifindex;
928 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
930 struct netdev_dev_linux *netdev_dev =
931 netdev_dev_linux_cast(netdev_get_dev(netdev_));
936 if (!(netdev_dev->cache_valid & VALID_CARRIER)) {
940 fn = xasprintf("/sys/class/net/%s/carrier",
941 netdev_get_name(netdev_));
942 fd = open(fn, O_RDONLY);
945 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
949 retval = read(fd, line, sizeof line);
952 if (error == EINVAL) {
953 /* This is the normal return value when we try to check carrier
954 * if the network device is not up. */
956 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
959 } else if (retval == 0) {
961 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
965 if (line[0] != '0' && line[0] != '1') {
967 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)",
971 netdev_dev->carrier = line[0] != '0';
972 netdev_dev->cache_valid |= VALID_CARRIER;
974 *carrier = netdev_dev->carrier;
985 /* Check whether we can use RTM_GETLINK to get network device statistics.
986 * In pre-2.6.19 kernels, this was only available if wireless extensions were
989 check_for_working_netlink_stats(void)
991 /* Decide on the netdev_get_stats() implementation to use. Netlink is
992 * preferable, so if that works, we'll use it. */
993 int ifindex = do_get_ifindex("lo");
995 VLOG_WARN("failed to get ifindex for lo, "
996 "obtaining netdev stats from proc");
999 struct netdev_stats stats;
1000 int error = get_stats_via_netlink(ifindex, &stats);
1002 VLOG_DBG("obtaining netdev stats via rtnetlink");
1005 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1006 "via proc (you are probably running a pre-2.6.19 "
1007 "kernel)", strerror(error));
1013 /* Brings the 'is_internal' and 'is_tap' members of 'netdev_dev' up-to-date. */
1015 netdev_linux_update_is_pseudo(struct netdev_dev_linux *netdev_dev)
1017 if (!(netdev_dev->cache_valid & VALID_IS_PSEUDO)) {
1018 const char *name = netdev_dev_get_name(&netdev_dev->netdev_dev);
1019 const char *type = netdev_dev_get_type(&netdev_dev->netdev_dev);
1021 netdev_dev->is_tap = !strcmp(type, "tap");
1022 netdev_dev->is_internal = false;
1023 if (!netdev_dev->is_tap) {
1024 struct ethtool_drvinfo drvinfo;
1027 memset(&drvinfo, 0, sizeof drvinfo);
1028 error = netdev_linux_do_ethtool(name,
1029 (struct ethtool_cmd *)&drvinfo,
1031 "ETHTOOL_GDRVINFO");
1033 if (!error && !strcmp(drvinfo.driver, "openvswitch")) {
1034 netdev_dev->is_internal = true;
1038 netdev_dev->cache_valid |= VALID_IS_PSEUDO;
1043 swap_uint64(uint64_t *a, uint64_t *b)
1050 /* Retrieves current device stats for 'netdev'. */
1052 netdev_linux_get_stats(const struct netdev *netdev_,
1053 struct netdev_stats *stats)
1055 struct netdev_dev_linux *netdev_dev =
1056 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1057 static int use_netlink_stats = -1;
1060 COVERAGE_INC(netdev_get_stats);
1062 if (netdev_dev->have_vport_stats ||
1063 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1065 error = netdev_vport_get_stats(netdev_, stats);
1066 netdev_dev->have_vport_stats = !error;
1067 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
1070 if (!netdev_dev->have_vport_stats) {
1071 if (use_netlink_stats < 0) {
1072 use_netlink_stats = check_for_working_netlink_stats();
1074 if (use_netlink_stats) {
1077 error = get_ifindex(netdev_, &ifindex);
1079 error = get_stats_via_netlink(ifindex, stats);
1082 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1086 /* If this port is an internal port then the transmit and receive stats
1087 * will appear to be swapped relative to the other ports since we are the
1088 * one sending the data, not a remote computer. For consistency, we swap
1089 * them back here. This does not apply if we are getting stats from the
1090 * vport layer because it always tracks stats from the perspective of the
1092 netdev_linux_update_is_pseudo(netdev_dev);
1093 if (!error && !netdev_dev->have_vport_stats &&
1094 (netdev_dev->is_internal || netdev_dev->is_tap)) {
1095 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1096 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1097 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1098 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1099 stats->rx_length_errors = 0;
1100 stats->rx_over_errors = 0;
1101 stats->rx_crc_errors = 0;
1102 stats->rx_frame_errors = 0;
1103 stats->rx_fifo_errors = 0;
1104 stats->rx_missed_errors = 0;
1105 stats->tx_aborted_errors = 0;
1106 stats->tx_carrier_errors = 0;
1107 stats->tx_fifo_errors = 0;
1108 stats->tx_heartbeat_errors = 0;
1109 stats->tx_window_errors = 0;
1115 /* Stores the features supported by 'netdev' into each of '*current',
1116 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1117 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
1118 * successful, otherwise a positive errno value. */
/* Implementation notes: the device is queried once with ETHTOOL_GSET and the
 * 'supported', 'advertising', 'speed', 'duplex' and 'port' fields of the reply
 * are translated, bit by bit, into OFPPF_* flags. */
1120 netdev_linux_get_features(struct netdev *netdev,
1121 uint32_t *current, uint32_t *advertised,
1122 uint32_t *supported, uint32_t *peer)
1124 struct ethtool_cmd ecmd;
1127 memset(&ecmd, 0, sizeof ecmd);
1128 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1129 ETHTOOL_GSET, "ETHTOOL_GSET");
1134 /* Supported features. */
1136 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1137 *supported |= OFPPF_10MB_HD;
1139 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1140 *supported |= OFPPF_10MB_FD;
1142 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1143 *supported |= OFPPF_100MB_HD;
1145 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1146 *supported |= OFPPF_100MB_FD;
1148 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1149 *supported |= OFPPF_1GB_HD;
1151 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1152 *supported |= OFPPF_1GB_FD;
1154 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1155 *supported |= OFPPF_10GB_FD;
1157 if (ecmd.supported & SUPPORTED_TP) {
1158 *supported |= OFPPF_COPPER;
1160 if (ecmd.supported & SUPPORTED_FIBRE) {
1161 *supported |= OFPPF_FIBER;
1163 if (ecmd.supported & SUPPORTED_Autoneg) {
1164 *supported |= OFPPF_AUTONEG;
1166 if (ecmd.supported & SUPPORTED_Pause) {
1167 *supported |= OFPPF_PAUSE;
1169 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1170 *supported |= OFPPF_PAUSE_ASYM;
1173 /* Advertised features. */
1175 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1176 *advertised |= OFPPF_10MB_HD;
1178 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1179 *advertised |= OFPPF_10MB_FD;
1181 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1182 *advertised |= OFPPF_100MB_HD;
1184 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1185 *advertised |= OFPPF_100MB_FD;
1187 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1188 *advertised |= OFPPF_1GB_HD;
1190 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1191 *advertised |= OFPPF_1GB_FD;
1193 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1194 *advertised |= OFPPF_10GB_FD;
1196 if (ecmd.advertising & ADVERTISED_TP) {
1197 *advertised |= OFPPF_COPPER;
1199 if (ecmd.advertising & ADVERTISED_FIBRE) {
1200 *advertised |= OFPPF_FIBER;
1202 if (ecmd.advertising & ADVERTISED_Autoneg) {
1203 *advertised |= OFPPF_AUTONEG;
1205 if (ecmd.advertising & ADVERTISED_Pause) {
1206 *advertised |= OFPPF_PAUSE;
1208 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1209 *advertised |= OFPPF_PAUSE_ASYM;
1212 /* Current settings. */
/* The link speed selects the OFPPF_* rate bit; a nonzero 'ecmd.duplex'
 * selects the full-duplex variant of that rate. */
1213 if (ecmd.speed == SPEED_10) {
1214 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1215 } else if (ecmd.speed == SPEED_100) {
1216 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1217 } else if (ecmd.speed == SPEED_1000) {
1218 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1219 } else if (ecmd.speed == SPEED_10000) {
/* 10 Gb/s is reported as full duplex regardless of 'ecmd.duplex'. */
1220 *current = OFPPF_10GB_FD;
1225 if (ecmd.port == PORT_TP) {
1226 *current |= OFPPF_COPPER;
1227 } else if (ecmd.port == PORT_FIBRE) {
1228 *current |= OFPPF_FIBER;
1232 *current |= OFPPF_AUTONEG;
1235 /* Peer advertisements. */
1236 *peer = 0; /* XXX */
1241 /* Set the features advertised by 'netdev' to 'advertise'. */
/* Implementation notes: the current ethtool settings are fetched with
 * ETHTOOL_GSET, 'ecmd.advertising' is rebuilt from zero out of the OFPPF_*
 * bits in 'advertise' (the reverse of the ADVERTISED_* to OFPPF_* mapping used
 * when reading features), and the result is written back with ETHTOOL_SSET.
 * The status of that final ethtool call is the return value. */
1243 netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1245 struct ethtool_cmd ecmd;
1248 memset(&ecmd, 0, sizeof ecmd);
1249 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1250 ETHTOOL_GSET, "ETHTOOL_GSET");
1255 ecmd.advertising = 0;
1256 if (advertise & OFPPF_10MB_HD) {
1257 ecmd.advertising |= ADVERTISED_10baseT_Half;
1259 if (advertise & OFPPF_10MB_FD) {
1260 ecmd.advertising |= ADVERTISED_10baseT_Full;
1262 if (advertise & OFPPF_100MB_HD) {
1263 ecmd.advertising |= ADVERTISED_100baseT_Half;
1265 if (advertise & OFPPF_100MB_FD) {
1266 ecmd.advertising |= ADVERTISED_100baseT_Full;
1268 if (advertise & OFPPF_1GB_HD) {
1269 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1271 if (advertise & OFPPF_1GB_FD) {
1272 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1274 if (advertise & OFPPF_10GB_FD) {
1275 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1277 if (advertise & OFPPF_COPPER) {
1278 ecmd.advertising |= ADVERTISED_TP;
1280 if (advertise & OFPPF_FIBER) {
1281 ecmd.advertising |= ADVERTISED_FIBRE;
1283 if (advertise & OFPPF_AUTONEG) {
1284 ecmd.advertising |= ADVERTISED_Autoneg;
1286 if (advertise & OFPPF_PAUSE) {
1287 ecmd.advertising |= ADVERTISED_Pause;
1289 if (advertise & OFPPF_PAUSE_ASYM) {
1290 ecmd.advertising |= ADVERTISED_Asym_Pause;
1292 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1293 ETHTOOL_SSET, "ETHTOOL_SSET");
1296 /* If 'netdev_name' is the name of a VLAN network device (e.g. one created with
1297 * vconfig(8)), sets '*vlan_vid' to the VLAN VID associated with that device
1298 * and returns 0. Otherwise returns an errno value (specifically ENOENT if
1299 * 'netdev_name' is the name of a network device that is not a VLAN device) and
1300 * sets '*vlan_vid' to -1. */
/* Implementation notes: the VID is read from the first line of
 * "/proc/net/vlan/<name>".  Read errors, a premature end of file and parse
 * failures are each logged through VLOG_ERR_RL. */
1302 netdev_linux_get_vlan_vid(const struct netdev *netdev, int *vlan_vid)
1304 const char *netdev_name = netdev_get_name(netdev);
1305 struct ds line = DS_EMPTY_INITIALIZER;
1306 FILE *stream = NULL;
1310 COVERAGE_INC(netdev_get_vlan_vid);
1311 fn = xasprintf("/proc/net/vlan/%s", netdev_name);
1312 stream = fopen(fn, "r");
1318 if (ds_get_line(&line, stream)) {
1319 if (ferror(stream)) {
1321 VLOG_ERR_RL(&rl, "error reading \"%s\": %s", fn, strerror(errno));
1324 VLOG_ERR_RL(&rl, "unexpected end of file reading \"%s\"", fn);
/* "%*s" skips the first whitespace-delimited word of the line; only the
 * integer that follows "VID:" is stored.
 *
 * NOTE(review): sscanf() returns EOF, which is nonzero, when the input ends
 * before the first conversion, so "!sscanf(...)" does not treat that case as
 * a parse error.  Comparing the result against 1 would. */
1329 if (!sscanf(ds_cstr(&line), "%*s VID: %d", vlan_vid)) {
1331 VLOG_ERR_RL(&rl, "parse error reading \"%s\" line 1: \"%s\"",
1332 fn, ds_cstr(&line));
/* printf-style templates for the tc(8) command lines that install ingress
 * policing.  POLICE_ADD_CMD takes the device name ("%s") and adds the ingress
 * qdisc with handle ffff:.  POLICE_CONFIG_CMD takes the device name, the rate
 * ("%dkbit") and the burst ("%dk"), in that order, and attaches a u32 filter
 * with a "police ... drop" action to that qdisc. */
1350 #define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress"
1351 #define POLICE_CONFIG_CMD "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 u32 match ip src 0.0.0.0/0 police rate %dkbit burst %dk mtu 65535 drop flowid :1"
1353 /* Remove ingress policing from 'netdev'. Returns 0 if successful, otherwise a
1354 * positive errno value.
1356 * This function is equivalent to running
1357 * /sbin/tc qdisc del dev %s handle ffff: ingress
1358 * but it is much, much faster.
/* Implementation notes: an RTM_DELQDISC request for the "ingress" qdisc
 * (handle ffff:0, parent TC_H_INGRESS) is built and sent with tc_transact().
 * On the path shown at the end, the cached policing state is reset to rate 0
 * and burst 0 and marked valid. */
1361 netdev_linux_remove_policing(struct netdev *netdev)
1363 struct netdev_dev_linux *netdev_dev =
1364 netdev_dev_linux_cast(netdev_get_dev(netdev));
1365 const char *netdev_name = netdev_get_name(netdev);
1367 struct ofpbuf request;
1368 struct tcmsg *tcmsg;
1371 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
1375 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
1376 tcmsg->tcm_parent = TC_H_INGRESS;
1377 nl_msg_put_string(&request, TCA_KIND, "ingress");
1378 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
1380 error = tc_transact(&request, NULL);
/* ENOENT and EINVAL are excluded from the failure warning; presumably they
 * mean that no ingress qdisc was installed -- TODO confirm. */
1381 if (error && error != ENOENT && error != EINVAL) {
1382 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1383 netdev_name, strerror(error));
1387 netdev_dev->kbits_rate = 0;
1388 netdev_dev->kbits_burst = 0;
1389 netdev_dev->cache_valid |= VALID_POLICING;
1393 /* Attempts to set input rate limiting (policing) policy. */
/* Implementation notes: 'kbits_burst' is first normalized (0 when no rate is
 * given, 1000 when a rate is given without a burst).  If the cached policing
 * settings are valid and already equal to the request, nothing is done.
 * Otherwise the existing policing is removed and the POLICE_ADD_CMD and
 * POLICE_CONFIG_CMD command lines are run through system().  The applied
 * values are then stored in the cache.
 *
 * NOTE(review): 'netdev_name' is interpolated into a shell command line passed
 * to system(); confirm that device names can never contain shell
 * metacharacters.  'kbits_rate' and 'kbits_burst' are uint32_t but are
 * formatted with "%d" in POLICE_CONFIG_CMD. */
1395 netdev_linux_set_policing(struct netdev *netdev,
1396 uint32_t kbits_rate, uint32_t kbits_burst)
1398 struct netdev_dev_linux *netdev_dev =
1399 netdev_dev_linux_cast(netdev_get_dev(netdev));
1400 const char *netdev_name = netdev_get_name(netdev);
1403 COVERAGE_INC(netdev_set_policing);
1405 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1406 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1407 : kbits_burst); /* Stick with user-specified value. */
1409 if (netdev_dev->cache_valid & VALID_POLICING
1410 && netdev_dev->kbits_rate == kbits_rate
1411 && netdev_dev->kbits_burst == kbits_burst) {
1412 /* Assume that settings haven't changed since we last set them. */
/* Existing policing is removed before the new settings are installed; the
 * result of the removal is not examined on this line. */
1416 netdev_linux_remove_policing(netdev);
1418 snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
1419 if (system(command) != 0) {
1420 VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
1424 snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
1425 kbits_rate, kbits_burst);
1426 if (system(command) != 0) {
1427 VLOG_WARN_RL(&rl, "%s: problem configuring policing",
1432 netdev_dev->kbits_rate = kbits_rate;
1433 netdev_dev->kbits_burst = kbits_burst;
1434 netdev_dev->cache_valid |= VALID_POLICING;
/* Adds to 'types' the OVS name of every entry in the NULL-terminated 'tcs'
 * array that has a 'tc_install' callback and a nonempty 'ovs_name'.  'netdev'
 * itself is not consulted. */
1441 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1444 const struct tc_ops **opsp;
1446 for (opsp = tcs; *opsp != NULL; opsp++) {
1447 const struct tc_ops *ops = *opsp;
1448 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1449 svec_add(types, ops->ovs_name);
/* Scans the NULL-terminated 'tcs' array for an entry whose 'ovs_name' is
 * exactly 'name'.  Presumably returns the matching entry, or NULL if there is
 * none (callers test the result against NULL) -- the return statements are
 * not visible here. */
1455 static const struct tc_ops *
1456 tc_lookup_ovs_name(const char *name)
1458 const struct tc_ops **opsp;
1460 for (opsp = tcs; *opsp != NULL; opsp++) {
1461 const struct tc_ops *ops = *opsp;
1462 if (!strcmp(name, ops->ovs_name)) {
/* Scans the NULL-terminated 'tcs' array for an entry whose 'linux_name' is
 * exactly 'name'.  Entries with a null 'linux_name' are skipped.  Presumably
 * returns the matching entry, or NULL if there is none -- the return
 * statements are not visible here. */
1469 static const struct tc_ops *
1470 tc_lookup_linux_name(const char *name)
1472 const struct tc_ops **opsp;
1474 for (opsp = tcs; *opsp != NULL; opsp++) {
1475 const struct tc_ops *ops = *opsp;
1476 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Searches the bucket selected by 'hash' in the device's 'tc->queues' hmap
 * for a queue whose 'queue_id' equals 'queue_id'.  'hash' must be the value
 * the queue was inserted with; the callers in this file use
 * hash_int(queue_id, 0). */
1483 static struct tc_queue *
1484 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1487 struct netdev_dev_linux *netdev_dev =
1488 netdev_dev_linux_cast(netdev_get_dev(netdev));
1489 struct tc_queue *queue;
1491 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1492 if (queue->queue_id == queue_id) {
/* Convenience wrapper around tc_find_queue__() that computes the hash of
 * 'queue_id' itself, as hash_int(queue_id, 0). */
1499 static struct tc_queue *
1500 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1502 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Looks up the traffic-control implementation registered under the OVS name
 * 'type' and reports its 'n_queues' in 'caps->n_queues'.  'netdev' itself is
 * not consulted.  Handling of an unknown 'type' is not visible here. */
1506 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1508 struct netdev_qos_capabilities *caps)
1510 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1514 caps->n_queues = ops->n_queues;
/* Refreshes the device's qdisc state with tc_query_qdisc(), stores the OVS
 * name of the qdisc in use in '*typep', and lets that qdisc's optional
 * 'qdisc_get' callback fill in 'details'.  The value returned when the
 * callback is absent is not visible here. */
1519 netdev_linux_get_qos(const struct netdev *netdev,
1520 const char **typep, struct shash *details)
1522 struct netdev_dev_linux *netdev_dev =
1523 netdev_dev_linux_cast(netdev_get_dev(netdev));
1526 error = tc_query_qdisc(netdev);
1531 *typep = netdev_dev->tc->ops->ovs_name;
1532 return (netdev_dev->tc->ops->qdisc_get
1533 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
/* Switches 'netdev' to the QoS type named 'type', configured by 'details'.
 *
 * 'type' must name an implementation that provides 'tc_install'.  When that
 * implementation is already the one in use, it is reconfigured in place
 * through its optional 'qdisc_set' callback (0 is returned if it has none).
 * Otherwise the existing qdisc is deleted and the new one installed. */
1538 netdev_linux_set_qos(struct netdev *netdev,
1539 const char *type, const struct shash *details)
1541 struct netdev_dev_linux *netdev_dev =
1542 netdev_dev_linux_cast(netdev_get_dev(netdev));
1543 const struct tc_ops *new_ops;
1546 new_ops = tc_lookup_ovs_name(type);
1547 if (!new_ops || !new_ops->tc_install) {
1551 error = tc_query_qdisc(netdev);
1556 if (new_ops == netdev_dev->tc->ops) {
1557 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1559 /* Delete existing qdisc. */
1560 error = tc_del_qdisc(netdev);
/* Deleting the qdisc is expected to leave the device without a 'tc'. */
1564 assert(netdev_dev->tc == NULL);
1566 /* Install new qdisc. */
1567 error = new_ops->tc_install(netdev, details);
/* tc_install() must set 'netdev_dev->tc' exactly when it succeeds. */
1568 assert((error == 0) == (netdev_dev->tc != NULL));
/* Refreshes the qdisc state with tc_query_qdisc(), looks 'queue_id' up with
 * tc_find_queue(), and passes the queue to the qdisc's 'class_get' callback
 * so that it can describe the queue in 'details'.  The error paths (query
 * failure, missing queue or callback) are not visible here. */
1575 netdev_linux_get_queue(const struct netdev *netdev,
1576 unsigned int queue_id, struct shash *details)
1578 struct netdev_dev_linux *netdev_dev =
1579 netdev_dev_linux_cast(netdev_get_dev(netdev));
1582 error = tc_query_qdisc(netdev);
1586 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1588 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
/* Configures queue 'queue_id' on 'netdev' from 'details' by delegating to the
 * qdisc's 'class_set' callback.  A 'queue_id' that is not below the qdisc's
 * 'n_queues', or a qdisc without 'class_set', is handled on a separate branch
 * whose result is not visible here. */
1594 netdev_linux_set_queue(struct netdev *netdev,
1595 unsigned int queue_id, const struct shash *details)
1597 struct netdev_dev_linux *netdev_dev =
1598 netdev_dev_linux_cast(netdev_get_dev(netdev));
1601 error = tc_query_qdisc(netdev);
1604 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1605 || !netdev_dev->tc->ops->class_set) {
1609 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
/* Deletes queue 'queue_id' from 'netdev' by looking it up with
 * tc_find_queue() and passing it to the qdisc's 'class_delete' callback.  A
 * qdisc without 'class_delete' is handled on a separate branch.  The values
 * returned on the error paths are not visible here. */
1613 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1615 struct netdev_dev_linux *netdev_dev =
1616 netdev_dev_linux_cast(netdev_get_dev(netdev));
1619 error = tc_query_qdisc(netdev);
1622 } else if (!netdev_dev->tc->ops->class_delete) {
1625 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1627 ? netdev_dev->tc->ops->class_delete(netdev, queue)
/* Retrieves statistics for queue 'queue_id' on 'netdev' into 'stats' by
 * looking the queue up with tc_find_queue() and passing it to the qdisc's
 * 'class_get_stats' callback.  A qdisc without that callback is handled on a
 * separate branch.  The values returned on the error paths are not visible
 * here. */
1633 netdev_linux_get_queue_stats(const struct netdev *netdev,
1634 unsigned int queue_id,
1635 struct netdev_queue_stats *stats)
1637 struct netdev_dev_linux *netdev_dev =
1638 netdev_dev_linux_cast(netdev_get_dev(netdev));
1641 error = tc_query_qdisc(netdev);
1644 } else if (!netdev_dev->tc->ops->class_get_stats) {
1647 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1649 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
/* Starts a Netlink dump of the traffic classes on 'netdev' into 'dump': an
 * RTM_GETTCLASS request with 'tcm_parent' 0 is built and handed to
 * nl_dump_start() on 'rtnl_sock', after which the request buffer is released.
 * Callers in this file treat the result as a boolean success indicator. */
1655 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1657 struct ofpbuf request;
1658 struct tcmsg *tcmsg;
1660 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1664 tcmsg->tcm_parent = 0;
1665 nl_dump_start(dump, rtnl_sock, &request);
1666 ofpbuf_uninit(&request);
/* Iterates over the queues cached in the device's 'tc->queues' and, for each
 * one, asks the qdisc's 'class_get' callback to describe it and invokes
 * 'cb' with the queue id, the description, and 'aux'.  A qdisc without
 * 'class_get' is handled on a separate branch.
 *
 * A single 'details' shash is reused for all queues: it is cleared before each
 * 'class_get' call and destroyed after the loop.  Whether 'cb' is skipped when
 * 'class_get' fails is not visible here. */
1671 netdev_linux_dump_queues(const struct netdev *netdev,
1672 netdev_dump_queues_cb *cb, void *aux)
1674 struct netdev_dev_linux *netdev_dev =
1675 netdev_dev_linux_cast(netdev_get_dev(netdev));
1676 struct tc_queue *queue;
1677 struct shash details;
1681 error = tc_query_qdisc(netdev);
1684 } else if (!netdev_dev->tc->ops->class_get) {
1689 shash_init(&details);
1690 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
1691 shash_clear(&details);
1693 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
1695 (*cb)(queue->queue_id, &details, aux);
1700 shash_destroy(&details);
/* Dumps the device's traffic classes from the kernel with start_queue_dump()
 * and passes each reply message to the qdisc's 'class_dump_stats' callback,
 * together with 'cb' and 'aux'.  A qdisc without 'class_dump_stats' is handled
 * on a separate branch.
 *
 * The final result is the error from nl_dump_done() if there is one, and
 * otherwise 'last_error' (presumably the most recent callback failure -- its
 * assignment is not visible here). */
1706 netdev_linux_dump_queue_stats(const struct netdev *netdev,
1707 netdev_dump_queue_stats_cb *cb, void *aux)
1709 struct netdev_dev_linux *netdev_dev =
1710 netdev_dev_linux_cast(netdev_get_dev(netdev));
1711 struct nl_dump dump;
1716 error = tc_query_qdisc(netdev);
1719 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1724 if (!start_queue_dump(netdev, &dump)) {
1727 while (nl_dump_next(&dump, &msg)) {
1728 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1734 error = nl_dump_done(&dump);
1735 return error ? error : last_error;
/* Stores the device's IPv4 address in '*address' and its netmask in
 * '*netmask'.
 *
 * The values come from the per-device cache.  When the cache does not hold
 * them (VALID_IN4 clear), they are fetched with the SIOCGIFADDR and
 * SIOCGIFNETMASK ioctls first and the cache is marked valid.
 *
 * Returns EADDRNOTAVAIL if the address is INADDR_ANY, otherwise 0 (ioctl
 * failure handling is not visible here). */
1739 netdev_linux_get_in4(const struct netdev *netdev_,
1740 struct in_addr *address, struct in_addr *netmask)
1742 struct netdev_dev_linux *netdev_dev =
1743 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1745 if (!(netdev_dev->cache_valid & VALID_IN4)) {
1748 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
1749 SIOCGIFADDR, "SIOCGIFADDR");
1754 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
1755 SIOCGIFNETMASK, "SIOCGIFNETMASK");
1760 netdev_dev->cache_valid |= VALID_IN4;
1762 *address = netdev_dev->address;
1763 *netmask = netdev_dev->netmask;
1764 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
/* Assigns IPv4 'address' and 'netmask' to the device.
 *
 * The address is set with SIOCSIFADDR and both values are recorded in the
 * per-device cache (VALID_IN4).  The netmask is pushed to the kernel with
 * SIOCSIFNETMASK only when 'address' is not INADDR_ANY.  The check on the
 * result of the first ioctl is not visible here. */
1768 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
1769 struct in_addr netmask)
1771 struct netdev_dev_linux *netdev_dev =
1772 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1775 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
1777 netdev_dev->cache_valid |= VALID_IN4;
1778 netdev_dev->address = address;
1779 netdev_dev->netmask = netmask;
1780 if (address.s_addr != INADDR_ANY) {
1781 error = do_set_addr(netdev_, SIOCSIFNETMASK,
1782 "SIOCSIFNETMASK", netmask);
/* Parses 'line', one line in the format scanned from /proc/net/if_inet6 by
 * the caller in this file: sixteen two-digit hexadecimal bytes, which are
 * stored in 'in6->s6_addr', followed by four hexadecimal fields that are
 * skipped, followed by an interface name of at most 16 characters, which is
 * stored in 'ifname'.  The caller treats the result as a boolean success
 * indicator; the return expression itself is not visible here. */
1789 parse_if_inet6_line(const char *line,
1790 struct in6_addr *in6, char ifname[16 + 1])
1792 uint8_t *s6 = in6->s6_addr;
/* X8 scans one address byte: at most two hexadecimal digits into a uint8_t. */
1793 #define X8 "%2"SCNx8
1795 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
1796 "%*x %*x %*x %*x %16s\n",
1797 &s6[0], &s6[1], &s6[2], &s6[3],
1798 &s6[4], &s6[5], &s6[6], &s6[7],
1799 &s6[8], &s6[9], &s6[10], &s6[11],
1800 &s6[12], &s6[13], &s6[14], &s6[15],
1804 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
1805 * 'in6' is non-null) and returns true. Otherwise, returns false. */
/* Implementation notes: the address is looked up only when the cache lacks it
 * (VALID_IN6 clear).  The cached value starts out as in6addr_any and is
 * replaced by the address from a line of /proc/net/if_inet6 whose interface
 * name equals the device's name.  The cache is then marked valid, so the file
 * is not rescanned on later calls while VALID_IN6 stays set. */
1807 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
1809 struct netdev_dev_linux *netdev_dev =
1810 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1811 if (!(netdev_dev->cache_valid & VALID_IN6)) {
1815 netdev_dev->in6 = in6addr_any;
1817 file = fopen("/proc/net/if_inet6", "r");
1819 const char *name = netdev_get_name(netdev_);
1820 while (fgets(line, sizeof line, file)) {
1821 struct in6_addr in6_tmp;
1822 char ifname[16 + 1];
1823 if (parse_if_inet6_line(line, &in6_tmp, ifname)
1824 && !strcmp(name, ifname))
1826 netdev_dev->in6 = in6_tmp;
1832 netdev_dev->cache_valid |= VALID_IN6;
1834 *in6 = netdev_dev->in6;
/* Fills '*sa' with an AF_INET socket address for 'addr'.  The address is
 * assembled in a zeroed 'struct sockaddr_in' and then copied over a zeroed
 * '*sa', so any bytes not set explicitly are zero. */
1839 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
1841 struct sockaddr_in sin;
1842 memset(&sin, 0, sizeof sin);
1843 sin.sin_family = AF_INET;
1844 sin.sin_addr = addr;
1847 memset(sa, 0, sizeof *sa);
1848 memcpy(sa, &sin, sizeof sin);
/* Issues the address-setting ioctl 'ioctl_nr' (named 'ioctl_name') for
 * 'netdev', using a request that carries the device's name and 'addr' as an
 * AF_INET socket address.  Returns the result of netdev_linux_do_ioctl().
 *
 * NOTE(review): strncpy() leaves 'ifr.ifr_name' unterminated if the device
 * name fills the whole field; the declaration and any initialization of 'ifr'
 * are not visible here. */
1852 do_set_addr(struct netdev *netdev,
1853 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
1856 strncpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
1857 make_in4_sockaddr(&ifr.ifr_addr, addr);
1859 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
1863 /* Adds 'router' as a default IP gateway. */
/* Implementation notes: the route is added with the SIOCADDRT ioctl on
 * 'af_inet_sock'.  Destination and netmask are both INADDR_ANY, the gateway is
 * 'router', and the flags are RTF_UP | RTF_GATEWAY.  'netdev' is not used.  A
 * failure is logged with VLOG_WARN. */
1865 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
1867 struct in_addr any = { INADDR_ANY };
1871 memset(&rt, 0, sizeof rt);
1872 make_in4_sockaddr(&rt.rt_dst, any);
1873 make_in4_sockaddr(&rt.rt_gateway, router);
1874 make_in4_sockaddr(&rt.rt_genmask, any);
1875 rt.rt_flags = RTF_UP | RTF_GATEWAY;
1876 COVERAGE_INC(netdev_add_router);
1877 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
1879 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
/* Looks up how to reach 'host' by scanning the kernel routing table in
 * /proc/net/route.
 *
 * For a route that is up and whose masked destination matches 'host',
 * '*next_hop' is set to 0 when the host is directly reachable or to the
 * route's gateway otherwise, and '*netdev_name' is set to a copy of the
 * route's interface name made with xstrdup() (presumably the caller must free
 * it -- confirm against callers).  '*netdev_name' is set to NULL before the
 * scan.  Lines that cannot be parsed are logged and routes that are not up are
 * skipped. */
1885 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
1888 static const char fn[] = "/proc/net/route";
1893 *netdev_name = NULL;
1894 stream = fopen(fn, "r");
1895 if (stream == NULL) {
1896 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
1901 while (fgets(line, sizeof line, stream)) {
1904 uint32_t dest, gateway, mask;
1905 int refcnt, metric, mtu;
1906 unsigned int flags, use, window, irtt;
1909 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
1911 iface, &dest, &gateway, &flags, &refcnt,
1912 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
1914 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
1918 if (!(flags & RTF_UP)) {
1919 /* Skip routes that aren't up. */
1923 /* The output of 'dest', 'mask', and 'gateway' were given in
1924 * network byte order, so we don't need any endian
1925 * conversions here. */
1926 if ((dest & mask) == (host->s_addr & mask)) {
1928 /* The host is directly reachable. */
1929 next_hop->s_addr = 0;
1931 /* To reach the host, we must go through a gateway. */
1932 next_hop->s_addr = gateway;
1934 *netdev_name = xstrdup(iface);
1945 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
1946 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
1947 * returns 0. Otherwise, it returns a positive errno value; in particular,
1948 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
/* Implementation notes: the lookup is a SIOCGARP ioctl on 'af_inet_sock' with
 * a zeroed request that names the device and carries 'ip' as an AF_INET
 * protocol address ('ip' is stored in 's_addr' without conversion).  On
 * success, ETH_ADDR_LEN bytes of the returned hardware address are copied to
 * 'mac'.  Failures other than ENXIO are logged. */
1950 netdev_linux_arp_lookup(const struct netdev *netdev,
1951 uint32_t ip, uint8_t mac[ETH_ADDR_LEN])
1954 struct sockaddr_in sin;
1957 memset(&r, 0, sizeof r);
1958 sin.sin_family = AF_INET;
1959 sin.sin_addr.s_addr = ip;
1961 memcpy(&r.arp_pa, &sin, sizeof sin);
1962 r.arp_ha.sa_family = ARPHRD_ETHER;
1964 strncpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
1965 COVERAGE_INC(netdev_arp_lookup);
1966 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
1968 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
1969 } else if (retval != ENXIO) {
1970 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
1971 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
/* Converts the "enum netdev_flags" bits in 'nd' to interface flag bits.  The
 * visible branches handle NETDEV_UP and NETDEV_PROMISC; the bits they set and
 * the return statement are not visible here. */
1977 nd_to_iff_flags(enum netdev_flags nd)
1980 if (nd & NETDEV_UP) {
1983 if (nd & NETDEV_PROMISC) {
/* Converts the interface flag bits in 'iff' to "enum netdev_flags" bits,
 * starting from 0.  IFF_PROMISC maps to NETDEV_PROMISC; other mappings and the
 * return statement are not visible here. */
1990 iff_to_nd_flags(int iff)
1992 enum netdev_flags nd = 0;
1996 if (iff & IFF_PROMISC) {
1997 nd |= NETDEV_PROMISC;
/* Clears the flags in 'off' and sets the flags in 'on' on 'netdev', storing
 * the flags that were in effect beforehand in '*old_flagsp'.
 *
 * The current interface flags are read with get_flags(), and set_flags() is
 * called only when the computed flags differ from the current ones.  Bits in
 * 'on' take precedence over bits in 'off', because they are ORed in after the
 * 'off' bits are masked out. */
2003 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2004 enum netdev_flags on, enum netdev_flags *old_flagsp)
2006 int old_flags, new_flags;
2009 error = get_flags(netdev, &old_flags);
2011 *old_flagsp = iff_to_nd_flags(old_flags);
2012 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2013 if (new_flags != old_flags) {
2014 error = set_flags(netdev, new_flags);
/* Walks every 'struct netdev_linux_notifier' on 'list' and obtains its
 * embedded 'struct netdev_notifier'.  Presumably each notifier's callback is
 * then invoked -- that line is not visible here.
 *
 * NOTE(review): the address-of expression on 'notifier->notifier' below looks
 * mis-encoded (probably an HTML-entity artifact of "&notifier"); confirm
 * against the pristine file. */
2021 poll_notify(struct list *list)
2023 struct netdev_linux_notifier *notifier;
2024 LIST_FOR_EACH (notifier, node, list) {
2025 struct netdev_notifier *n = ¬ifier->notifier;
/* rtnetlink change callback (registered in netdev_linux_poll_add()).  Looks up
 * a notifier list in 'netdev_linux_notifiers'; the visible loop calls
 * poll_notify() on the list of every registered device.  The conditions that
 * choose between the single-list lookup and the loop are not visible here.
 * 'aux' is unused. */
2031 netdev_linux_poll_cb(const struct rtnetlink_change *change,
2032 void *aux OVS_UNUSED)
2035 struct list *list = shash_find_data(&netdev_linux_notifiers,
2041 struct shash_node *node;
2042 SHASH_FOR_EACH (node, &netdev_linux_notifiers) {
2043 poll_notify(node->data);
/* Registers 'cb' (with 'aux') as a change notifier for 'netdev' and stores
 * the new notifier in '*notifierp'.
 *
 * 'netdev_linux_notifiers' maps device names to lists of notifiers.  When it
 * is empty, the shared rtnetlink notifier is registered first.  A device's
 * list is allocated and added to the map when the lookup does not find one
 * (the check itself is not visible here), and the new notifier is appended to
 * it.
 *
 * NOTE(review): the address-of expressions on 'notifier->...' below look
 * mis-encoded (probably "&notifier"); confirm against the pristine file. */
2049 netdev_linux_poll_add(struct netdev *netdev,
2050 void (*cb)(struct netdev_notifier *), void *aux,
2051 struct netdev_notifier **notifierp)
2053 const char *netdev_name = netdev_get_name(netdev);
2054 struct netdev_linux_notifier *notifier;
2057 if (shash_is_empty(&netdev_linux_notifiers)) {
2058 int error = rtnetlink_notifier_register(&netdev_linux_poll_notifier,
2059 netdev_linux_poll_cb, NULL);
2065 list = shash_find_data(&netdev_linux_notifiers, netdev_name);
2067 list = xmalloc(sizeof *list);
2069 shash_add(&netdev_linux_notifiers, netdev_name, list);
2072 notifier = xmalloc(sizeof *notifier);
2073 netdev_notifier_init(¬ifier->notifier, netdev, cb, aux);
2074 list_push_back(list, ¬ifier->node);
2075 *notifierp = ¬ifier->notifier;
/* Unregisters 'notifier_', which must be embedded in a
 * 'struct netdev_linux_notifier' (it is recovered with CONTAINER_OF).
 *
 * The notifier is unlinked from its device's list.  If that leaves the list
 * empty, the device's entry is deleted from 'netdev_linux_notifiers', and if
 * that in turn leaves the map empty, the shared rtnetlink notifier is
 * unregistered.  The statements that free the list and the notifier are not
 * visible here. */
2080 netdev_linux_poll_remove(struct netdev_notifier *notifier_)
2082 struct netdev_linux_notifier *notifier =
2083 CONTAINER_OF(notifier_, struct netdev_linux_notifier, notifier);
2086 /* Remove 'notifier' from its list. */
2087 list = list_remove(¬ifier->node);
2088 if (list_is_empty(list)) {
2089 /* The list is now empty. Remove it from the hash and free it. */
2090 const char *netdev_name = netdev_get_name(notifier->notifier.netdev);
2091 shash_delete(&netdev_linux_notifiers,
2092 shash_find(&netdev_linux_notifiers, netdev_name));
2097 /* If that was the last notifier, unregister. */
2098 if (shash_is_empty(&netdev_linux_notifiers)) {
2099 rtnetlink_notifier_unregister(&netdev_linux_poll_notifier);
/* Provider class whose creation callback is netdev_linux_create_system().
 * The initializer is positional, so the order of the callbacks must follow
 * the member order of 'struct netdev_class'.  Compared with
 * 'netdev_tap_class', this class supplies an 'enumerate' callback and uses
 * netdev_vport_set_stats() where the tap class has "NULL, set_stats". */
2103 const struct netdev_class netdev_linux_class = {
2110 netdev_linux_create_system,
2111 netdev_linux_destroy,
2112 NULL, /* reconfigure */
2117 netdev_linux_enumerate,
2120 netdev_linux_recv_wait,
2124 netdev_linux_send_wait,
2126 netdev_linux_set_etheraddr,
2127 netdev_linux_get_etheraddr,
2128 netdev_linux_get_mtu,
2129 netdev_linux_get_ifindex,
2130 netdev_linux_get_carrier,
2131 netdev_linux_get_stats,
2132 netdev_vport_set_stats,
2134 netdev_linux_get_features,
2135 netdev_linux_set_advertisements,
2136 netdev_linux_get_vlan_vid,
2138 netdev_linux_set_policing,
2139 netdev_linux_get_qos_types,
2140 netdev_linux_get_qos_capabilities,
2141 netdev_linux_get_qos,
2142 netdev_linux_set_qos,
2143 netdev_linux_get_queue,
2144 netdev_linux_set_queue,
2145 netdev_linux_delete_queue,
2146 netdev_linux_get_queue_stats,
2147 netdev_linux_dump_queues,
2148 netdev_linux_dump_queue_stats,
2150 netdev_linux_get_in4,
2151 netdev_linux_set_in4,
2152 netdev_linux_get_in6,
2153 netdev_linux_add_router,
2154 netdev_linux_get_next_hop,
2155 netdev_linux_arp_lookup,
2157 netdev_linux_update_flags,
2159 netdev_linux_poll_add,
2160 netdev_linux_poll_remove,
/* Provider class whose creation callback is netdev_linux_create_tap().  The
 * initializer is positional, so the order of the callbacks must follow the
 * member order of 'struct netdev_class'.  It shares the netdev_linux_*
 * callbacks with 'netdev_linux_class' except that 'enumerate' and 'set_stats'
 * are NULL here. */
2163 const struct netdev_class netdev_tap_class = {
2170 netdev_linux_create_tap,
2171 netdev_linux_destroy,
2172 NULL, /* reconfigure */
2177 NULL, /* enumerate */
2180 netdev_linux_recv_wait,
2184 netdev_linux_send_wait,
2186 netdev_linux_set_etheraddr,
2187 netdev_linux_get_etheraddr,
2188 netdev_linux_get_mtu,
2189 netdev_linux_get_ifindex,
2190 netdev_linux_get_carrier,
2191 netdev_linux_get_stats,
2192 NULL, /* set_stats */
2194 netdev_linux_get_features,
2195 netdev_linux_set_advertisements,
2196 netdev_linux_get_vlan_vid,
2198 netdev_linux_set_policing,
2199 netdev_linux_get_qos_types,
2200 netdev_linux_get_qos_capabilities,
2201 netdev_linux_get_qos,
2202 netdev_linux_set_qos,
2203 netdev_linux_get_queue,
2204 netdev_linux_set_queue,
2205 netdev_linux_delete_queue,
2206 netdev_linux_get_queue_stats,
2207 netdev_linux_dump_queues,
2208 netdev_linux_dump_queue_stats,
2210 netdev_linux_get_in4,
2211 netdev_linux_set_in4,
2212 netdev_linux_get_in6,
2213 netdev_linux_add_router,
2214 netdev_linux_get_next_hop,
2215 netdev_linux_arp_lookup,
2217 netdev_linux_update_flags,
2219 netdev_linux_poll_add,
2220 netdev_linux_poll_remove,
2223 /* HTB traffic control class. */
/* Upper bound on the class minor number that htb_parse_tcmsg__() accepts as a
 * queue: minors 1 through HTB_N_QUEUES map to queue ids 0 through
 * HTB_N_QUEUES - 1. */
2225 #define HTB_N_QUEUES 0xf000
/* The field below precedes the per-queue fields and is presumably the
 * qdisc-wide ceiling kept in 'struct htb' (the struct headers are not visible
 * here). */
2229 unsigned int max_rate; /* In bytes/s. */
/* Per-queue HTB parameters.  'tc_queue' is the embedded generic queue that
 * htb_class_cast__() converts back from. */
2233 struct tc_queue tc_queue;
2234 unsigned int min_rate; /* In bytes/s. */
2235 unsigned int max_rate; /* In bytes/s. */
2236 unsigned int burst; /* In bytes. */
2237 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the 'struct htb' that embeds the 'tc' currently attached to
 * 'netdev'.  Assumes that the attached 'tc' really is the 'tc' member of a
 * 'struct htb'; nothing here checks that. */
2241 htb_get__(const struct netdev *netdev)
2243 struct netdev_dev_linux *netdev_dev =
2244 netdev_dev_linux_cast(netdev_get_dev(netdev));
2245 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
/* Allocates a 'struct htb', initializes its embedded 'tc' with 'tc_ops_htb',
 * records 'max_rate' in it, and attaches it to 'netdev' as the device's 'tc'.
 * htb_tc_load() uses the return value as the new 'struct htb'.
 *
 * NOTE(review): 'max_rate' is a uint64_t here, while the 'max_rate' fields
 * declared for HTB in this file are 'unsigned int', so large values would be
 * truncated on assignment -- confirm the field type of 'struct htb'. */
2249 htb_install__(struct netdev *netdev, uint64_t max_rate)
2251 struct netdev_dev_linux *netdev_dev =
2252 netdev_dev_linux_cast(netdev_get_dev(netdev));
2255 htb = xmalloc(sizeof *htb);
2256 tc_init(&htb->tc, &tc_ops_htb);
2257 htb->max_rate = max_rate;
2259 netdev_dev->tc = &htb->tc;
2264 /* Create an HTB qdisc.
2266 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default
/* Implementation notes: any existing qdisc is deleted first (the result of
 * tc_del_qdisc() is not examined).  An RTM_NEWQDISC request with
 * NLM_F_EXCL | NLM_F_CREATE is then built for an "htb" qdisc with handle 1:0
 * at TC_H_ROOT, whose nested TCA_OPTIONS carry a TCA_HTB_INIT structure with
 * 'rate2quantum' set to 10.  Returns the result of tc_transact(). */
2269 htb_setup_qdisc__(struct netdev *netdev)
2272 struct tc_htb_glob opt;
2273 struct ofpbuf request;
2274 struct tcmsg *tcmsg;
2276 tc_del_qdisc(netdev);
2278 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2279 NLM_F_EXCL | NLM_F_CREATE, &request);
2283 tcmsg->tcm_handle = tc_make_handle(1, 0);
2284 tcmsg->tcm_parent = TC_H_ROOT;
2286 nl_msg_put_string(&request, TCA_KIND, "htb");
2288 memset(&opt, 0, sizeof opt);
2289 opt.rate2quantum = 10;
2293 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2294 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2295 nl_msg_end_nested(&request, opt_offset);
2297 return tc_transact(&request, NULL);
2300 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2301 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
/* Implementation notes: the class is described by a 'struct tc_htb_opt' whose
 * 'rate' and 'ceil' are filled from 'class->min_rate' and 'class->max_rate'
 * with the device MTU, whose 'buffer' and 'cbuffer' are derived from
 * 'class->burst', and whose 'prio' is 'class->priority'.  It is sent in an
 * RTM_NEWTCLASS request (NLM_F_CREATE) for 'handle' under 'parent', together
 * with rate tables for 'rate' and 'ceil'.  A failed transaction is logged
 * with the class parameters. */
2303 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2304 unsigned int parent, struct htb_class *class)
2307 struct tc_htb_opt opt;
2308 struct ofpbuf request;
2309 struct tcmsg *tcmsg;
/* The result of netdev_get_mtu() is not examined on this line. */
2313 netdev_get_mtu(netdev, &mtu);
2315 memset(&opt, 0, sizeof opt);
2316 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2317 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2318 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2319 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2320 opt.prio = class->priority;
2322 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2326 tcmsg->tcm_handle = handle;
2327 tcmsg->tcm_parent = parent;
2329 nl_msg_put_string(&request, TCA_KIND, "htb");
2330 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2331 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2332 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2333 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2334 nl_msg_end_nested(&request, opt_offset);
2336 error = tc_transact(&request, NULL);
2338 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2339 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2340 netdev_get_name(netdev),
2341 tc_get_major(handle), tc_get_minor(handle),
2342 tc_get_major(parent), tc_get_minor(parent),
2343 class->min_rate, class->max_rate,
2344 class->burst, class->priority, strerror(error));
2349 /* Parses Netlink attributes in 'nl_options' for HTB parameters and stores a
2350 * description of them into '*class'. The description complies with the
2351 * specification given in the vswitch database documentation for linux-htb
/* Extracts the HTB class parameters from the nested attributes in
 * 'nl_options' into '*class'.
 *
 * The policy requires a TCA_HTB_PARMS attribute at least as large as
 * 'struct tc_htb_opt'.  From it, 'min_rate' is taken from 'rate.rate',
 * 'max_rate' from 'ceil.rate', 'priority' from 'prio', and 'burst' is
 * converted from the 'buffer' tick count with tc_ticks_to_bytes().  A parse
 * failure is logged; the values returned are not visible here. */
2354 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2356 static const struct nl_policy tca_htb_policy[] = {
2357 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2358 .min_len = sizeof(struct tc_htb_opt) },
2361 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2362 const struct tc_htb_opt *htb;
2364 if (!nl_parse_nested(nl_options, tca_htb_policy,
2365 attrs, ARRAY_SIZE(tca_htb_policy))) {
2366 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2370 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2371 class->min_rate = htb->rate.rate;
2372 class->max_rate = htb->ceil.rate;
2373 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2374 class->priority = htb->prio;
/* Parses the traffic-class message 'tcmsg' with tc_parse_class(), which also
 * receives 'stats'.
 *
 * If 'queue_id' is nonnull, the class handle is translated to a queue id:
 * handles 1:1 through 1:HTB_N_QUEUES map to queue ids 0 through
 * HTB_N_QUEUES - 1 (the handling of other handles is not visible here).  If
 * 'options' is nonnull, the HTB parameters are parsed into it.  Each step runs
 * only if no error has occurred so far. */
2379 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2380 struct htb_class *options,
2381 struct netdev_queue_stats *stats)
2383 struct nlattr *nl_options;
2384 unsigned int handle;
2387 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2388 if (!error && queue_id) {
2389 unsigned int major = tc_get_major(handle);
2390 unsigned int minor = tc_get_minor(handle);
2391 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2392 *queue_id = minor - 1;
2397 if (!error && options) {
2398 error = htb_parse_tca_options__(nl_options, options);
/* Derives the qdisc-wide HTB parameters from 'details' into '*hc'.
 *
 * "max-rate" is parsed as a decimal number and divided by 8 (the result is
 * kept in bytes/s, so the input is presumably bits/s).  If it is absent or
 * yields 0, the rate falls back to the device's current link speed as
 * reported by netdev_get_features() and netdev_features_to_bps(), again
 * divided by 8.  'min_rate' is set equal to 'max_rate'.
 *
 * NOTE(review): the second argument of netdev_get_features() below looks
 * mis-encoded (probably "&current"); confirm against the pristine file. */
2404 htb_parse_qdisc_details__(struct netdev *netdev,
2405 const struct shash *details, struct htb_class *hc)
2407 const char *max_rate_s;
2409 max_rate_s = shash_find_data(details, "max-rate");
2410 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2411 if (!hc->max_rate) {
2414 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2415 hc->max_rate = netdev_features_to_bps(current) / 8;
2417 hc->min_rate = hc->max_rate;
/* Derives the per-queue HTB parameters from 'details' into '*hc'.
 *
 * The keys read are "min-rate", "max-rate", "burst" and "priority".  Rates
 * and burst are parsed as decimal numbers and divided by 8.  'min_rate' is
 * clamped to the range [1500, qdisc max_rate], 'max_rate' to the range
 * [min_rate, qdisc max_rate], and 'burst' is raised to at least MTU + 64.
 * 'priority' defaults to 0 when absent.
 *
 * NOTE(review): the strtoull() results are stored in 'unsigned int' fields,
 * so very large inputs are truncated; no parse errors are detected. */
2423 htb_parse_class_details__(struct netdev *netdev,
2424 const struct shash *details, struct htb_class *hc)
2426 const struct htb *htb = htb_get__(netdev);
2427 const char *min_rate_s = shash_find_data(details, "min-rate");
2428 const char *max_rate_s = shash_find_data(details, "max-rate");
2429 const char *burst_s = shash_find_data(details, "burst");
2430 const char *priority_s = shash_find_data(details, "priority");
2433 /* min-rate. Don't allow a min-rate below 1500 bytes/s. */
2435 /* min-rate is required. */
2438 hc->min_rate = strtoull(min_rate_s, NULL, 10) / 8;
2439 hc->min_rate = MAX(hc->min_rate, 1500);
2440 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2443 hc->max_rate = (max_rate_s
2444 ? strtoull(max_rate_s, NULL, 10) / 8
2446 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2447 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2451 * According to hints in the documentation that I've read, it is important
2452 * that 'burst' be at least as big as the largest frame that might be
2453 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2454 * but having it a bit too small is a problem. Since netdev_get_mtu()
2455 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2456 * the MTU. We actually add 64, instead of 14, as a guard against
2457 * additional headers get tacked on somewhere that we're not aware of. */
2458 netdev_get_mtu(netdev, &mtu);
2459 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2460 hc->burst = MAX(hc->burst, mtu + 64);
2463 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for the class 'handle' under 'parent' on 'netdev' with
 * tc_query_class() and parses the reply into 'options' and 'stats' with
 * htb_parse_tcmsg__() (no queue id is extracted).  The reply buffer is
 * deleted after parsing.  The check between the query and the parse is not
 * visible here. */
2469 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2470 unsigned int parent, struct htb_class *options,
2471 struct netdev_queue_stats *stats)
2473 struct ofpbuf *reply;
2476 error = tc_query_class(netdev, handle, parent, &reply);
2478 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2479 ofpbuf_delete(reply);
/* Installs HTB on 'netdev' as configured by 'details': creates the root
 * qdisc, then creates class 1:fffe under 1:0 with the rates parsed from
 * 'details', then records the in-memory state with htb_install__().  The
 * error checks between these steps are not visible here. */
2485 htb_tc_install(struct netdev *netdev, const struct shash *details)
2489 error = htb_setup_qdisc__(netdev);
2491 struct htb_class hc;
2493 htb_parse_qdisc_details__(netdev, details, &hc);
2494 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2495 tc_make_handle(1, 0), &hc);
2497 htb_install__(netdev, hc.max_rate);
/* Returns the 'struct htb_class' that embeds 'queue' as its 'tc_queue'
 * member.  'queue' must really be part of a 'struct htb_class'; nothing here
 * checks that. */
2503 static struct htb_class *
2504 htb_class_cast__(const struct tc_queue *queue)
2506 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Records the parameters in '*hc' for queue 'queue_id' in the in-memory HTB
 * state of 'netdev'.  An existing entry is looked up with tc_find_queue__();
 * otherwise a new 'struct htb_class' is allocated and inserted into
 * 'htb->tc.queues' under hash_int(queue_id, 0).  The kernel is not touched
 * here. */
2510 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2511 const struct htb_class *hc)
2513 struct htb *htb = htb_get__(netdev);
2514 size_t hash = hash_int(queue_id, 0);
2515 struct tc_queue *queue;
2516 struct htb_class *hcp;
2518 queue = tc_find_queue__(netdev, queue_id, hash);
2520 hcp = htb_class_cast__(queue);
2522 hcp = xmalloc(sizeof *hcp);
2523 queue = &hcp->tc_queue;
2524 queue->queue_id = queue_id;
2525 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2528 hcp->min_rate = hc->min_rate;
2529 hcp->max_rate = hc->max_rate;
2530 hcp->burst = hc->burst;
2531 hcp->priority = hc->priority;
/* Rebuilds the in-memory HTB state of 'netdev' from the kernel.  Class 1:fffe
 * supplies the qdisc-wide 'max_rate', then every class in the class dump that
 * htb_parse_tcmsg__() accepts is recorded as a queue.  'nlmsg' is unused.
 *
 * NOTE(review): the result of htb_query_class__() is not examined on its
 * line; confirm that 'hc.max_rate' is initialized beforehand (the preceding
 * line is not visible here). */
2535 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2538 struct nl_dump dump;
2539 struct htb_class hc;
2542 /* Get qdisc options. */
2544 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2545 htb = htb_install__(netdev, hc.max_rate);
2548 if (!start_queue_dump(netdev, &dump)) {
2551 while (nl_dump_next(&dump, &msg)) {
2552 unsigned int queue_id;
2554 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2555 htb_update_queue__(netdev, queue_id, &hc);
2558 nl_dump_done(&dump);
/* Destroys the 'struct htb' that contains 'tc'.  The visible code walks
 * 'htb->tc.queues' with the removal-safe iterator and unlinks every class.
 *
 * NOTE(review): the statements that release each class and 'htb' itself are
 * not visible in this listing. */
2564 htb_tc_destroy(struct tc *tc)
2566 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2567 struct htb_class *hc, *next;
2569 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2570 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Adds a "max-rate" entry to 'details' whose value is 8 times 'netdev''s
 * stored HTB 'max_rate', formatted as a decimal string.  (The factor of 8
 * presumably converts bytes/s to bits/s -- confirm units.) */
2578 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2580 const struct htb *htb = htb_get__(netdev);
2581 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
/* Reconfigures 'netdev''s HTB qdisc-level settings from 'details': the parsed
 * class is applied to class 1:fffe under parent 1:0, and the cached
 * 'max_rate' is updated to match.
 *
 * NOTE(review): the error check guarding the cache update is not visible. */
2586 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2588 struct htb_class hc;
2591 htb_parse_qdisc_details__(netdev, details, &hc);
2592 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2593 tc_make_handle(1, 0), &hc);
2595 htb_get__(netdev)->max_rate = hc.max_rate;
/* Reports the configuration of 'queue' into 'details' as decimal strings.
 * Rates and burst are multiplied by 8 before formatting.  "max-rate" is only
 * added when it differs from "min-rate".  'netdev' is unused.
 *
 * NOTE(review): lines between the "burst" and "priority" entries are missing
 * from this listing, so those entries may also be conditional. */
2601 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2602 const struct tc_queue *queue, struct shash *details)
2604 const struct htb_class *hc = htb_class_cast__(queue);
2606 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2607 if (hc->min_rate != hc->max_rate) {
2608 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2610 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2612 shash_add(details, "priority", xasprintf("%u", hc->priority));
/* Configures queue 'queue_id' on 'netdev' from 'details'.  The details are
 * parsed with htb_parse_class_details__(), applied in the kernel as class
 * 1:(queue_id + 1) under parent 1:fffe, and then mirrored into the local queue
 * map with htb_update_queue__().
 *
 * NOTE(review): the error checks between these steps are not visible here. */
2618 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2619 const struct shash *details)
2621 struct htb_class hc;
2624 error = htb_parse_class_details__(netdev, details, &hc);
2629 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2630 tc_make_handle(1, 0xfffe), &hc);
2635 htb_update_queue__(netdev, queue_id, &hc);
/* Deletes 'queue' from 'netdev': asks the kernel to delete class
 * 1:(queue_id + 1) and unlinks the queue from 'htb->tc.queues'.
 *
 * NOTE(review): the error check and the release of 'hc' are not visible in
 * this listing. */
2640 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2642 struct htb_class *hc = htb_class_cast__(queue);
2643 struct htb *htb = htb_get__(netdev);
2646 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2648 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches statistics for 'queue' into '*stats' by querying class
 * 1:(queue_id + 1) under parent 1:fffe.  Only statistics are requested: the
 * 'options' argument of htb_query_class__() is NULL.  Returns that function's
 * result. */
2655 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2656 struct netdev_queue_stats *stats)
2658 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2659 tc_make_handle(1, 0xfffe), NULL, stats);
/* Parses the class handle and statistics out of 'nlmsg' and, when the handle
 * has major number 1 and a minor number in the range 1...HTB_N_QUEUES, invokes
 * 'cb' with queue ID (minor - 1), the parsed statistics, and 'aux'.  'netdev'
 * is unused.
 *
 * NOTE(review): the handling of a tc_parse_class() failure is not visible. */
2663 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2664 const struct ofpbuf *nlmsg,
2665 netdev_dump_queue_stats_cb *cb, void *aux)
2667 struct netdev_queue_stats stats;
2668 unsigned int handle, major, minor;
2671 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2676 major = tc_get_major(handle);
2677 minor = tc_get_minor(handle);
/* Class minor numbers are queue IDs offset by one. */
2678 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2679 (*cb)(minor - 1, &stats, aux);
/* Operations table for the Linux "htb" qdisc, exposed under the OVS name
 * "linux-htb" with HTB_N_QUEUES queues.
 *
 * NOTE(review): several initializer lines between 'n_queues' and
 * 'htb_class_get_stats' are missing from this listing. */
2684 static const struct tc_ops tc_ops_htb = {
2685 "htb", /* linux_name */
2686 "linux-htb", /* ovs_name */
2687 HTB_N_QUEUES, /* n_queues */
2696 htb_class_get_stats,
2697 htb_class_dump_stats
2700 /* "linux-default" traffic control class.
2702 * This class represents the default, unnamed Linux qdisc. It corresponds to
2703 * the "" (empty string) QoS type in the OVS database. */
/* Points 'netdev''s device at a 'struct tc' initialized with 'tc_ops_default'.
 * The 'tc' pointer is a function-local static, so its value persists across
 * calls.
 *
 * NOTE(review): the condition guarding the allocation is not visible in this
 * listing. */
2706 default_install__(struct netdev *netdev)
2708 struct netdev_dev_linux *netdev_dev =
2709 netdev_dev_linux_cast(netdev_get_dev(netdev));
2710 static struct tc *tc;
2713 tc = xmalloc(sizeof *tc);
2714 tc_init(tc, &tc_ops_default);
2716 netdev_dev->tc = tc;
/* 'tc_install' hook for the default qdisc: delegates to default_install__().
 * 'details' is unused. */
2720 default_tc_install(struct netdev *netdev,
2721 const struct shash *details OVS_UNUSED)
2723 default_install__(netdev);
/* 'tc_load' hook for the default qdisc: delegates to default_install__().
 * 'nlmsg' is unused. */
2728 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2730 default_install__(netdev);
/* Operations table for the default Linux qdisc.  Every qdisc, class, and
 * statistics hook visible here is NULL.
 *
 * NOTE(review): the initializer lines between 'linux_name' and 'tc_destroy'
 * are missing from this listing. */
2734 static const struct tc_ops tc_ops_default = {
2735 NULL, /* linux_name */
2740 NULL, /* tc_destroy */
2741 NULL, /* qdisc_get */
2742 NULL, /* qdisc_set */
2743 NULL, /* class_get */
2744 NULL, /* class_set */
2745 NULL, /* class_delete */
2746 NULL, /* class_get_stats */
2747 NULL /* class_dump_stats */
/* other_tc_load(): points 'netdev''s device at a 'struct tc' initialized with
 * 'tc_ops_other'.  The 'tc' pointer is a function-local static, so its value
 * persists across calls.  'nlmsg' is unused.
 *
 * NOTE(review): the end of the section comment below and the condition
 * guarding the allocation are missing from this listing. */
2750 /* "linux-other" traffic control class.
2755 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2757 struct netdev_dev_linux *netdev_dev =
2758 netdev_dev_linux_cast(netdev_get_dev(netdev));
2759 static struct tc *tc;
2762 tc = xmalloc(sizeof *tc);
2763 tc_init(tc, &tc_ops_other);
2765 netdev_dev->tc = tc;
/* Operations table for qdiscs exposed under the OVS name "linux-other".  It
 * has no Linux name and no 'tc_install' hook, and every qdisc, class, and
 * statistics hook visible here is NULL.
 *
 * NOTE(review): two initializer lines (after 'ovs_name' and after
 * 'tc_install') are missing from this listing. */
2769 static const struct tc_ops tc_ops_other = {
2770 NULL, /* linux_name */
2771 "linux-other", /* ovs_name */
2773 NULL, /* tc_install */
2775 NULL, /* tc_destroy */
2776 NULL, /* qdisc_get */
2777 NULL, /* qdisc_set */
2778 NULL, /* class_get */
2779 NULL, /* class_set */
2780 NULL, /* class_delete */
2781 NULL, /* class_get_stats */
2782 NULL /* class_dump_stats */
2785 /* Traffic control. */
2787 /* Number of kernel "tc" ticks per second. */
/* The conversion helpers in this file divide by or multiply with this value
 * when translating between byte counts and tick counts. */
2788 static double ticks_per_s;
2790 /* Number of kernel "jiffies" per second. This is used for the purpose of
2791 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
2792 * one jiffy's worth of data.
2794 * There are two possibilities here:
2796 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
2797 * approximate range of 100 to 1024. That means that we really need to
2798 * make sure that the qdisc can buffer that much data.
2800 * - 'buffer_hz' is an absurdly large number. That means that the kernel
2801 * has finely granular timers and there's no need to fudge additional room
2802 * for buffers. (There's no extra effort needed to implement that: the
2803 * large 'buffer_hz' is used as a divisor, so practically any number will
2804 * come out as 0 in the division. Small integer results in the case of
2805 * really high dividends won't have any real effect anyhow.)
2807 static unsigned int buffer_hz;
2809 /* Returns tc handle 'major':'minor'. */
/* 'major' is shifted left by 16 bits and combined with 'minor' by
 * TC_H_MAKE. */
2811 tc_make_handle(unsigned int major, unsigned int minor)
2813 return TC_H_MAKE(major << 16, minor);
2816 /* Returns the major number from 'handle'. */
/* This is the TC_H_MAJ part of 'handle' shifted right by 16 bits, i.e. the
 * inverse of the shift applied by tc_make_handle(). */
2818 tc_get_major(unsigned int handle)
2820 return TC_H_MAJ(handle) >> 16;
2823 /* Returns the minor number from 'handle'. */
/* This is the TC_H_MIN part of 'handle', returned without any shift. */
2825 tc_get_minor(unsigned int handle)
2827 return TC_H_MIN(handle);
/* Starts a traffic-control Netlink request for 'netdev' in 'request'.
 *
 * Looks up 'netdev''s ifindex, initializes 'request', appends a Netlink header
 * of the given 'type' with flags NLM_F_REQUEST | 'flags', and appends a zeroed
 * 'struct tcmsg' whose family is AF_UNSPEC and whose ifindex is 'netdev''s.
 * The caller is expected to fill in 'tcm_handle' and 'tcm_parent' afterward.
 *
 * NOTE(review): the return statements, including the one taken when
 * get_ifindex() fails, are not visible in this listing.  Callers in this file
 * use the result as the 'struct tcmsg' inside 'request'. */
2830 static struct tcmsg *
2831 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
2832 struct ofpbuf *request)
2834 struct tcmsg *tcmsg;
2838 error = get_ifindex(netdev, &ifindex);
2843 ofpbuf_init(request, 512);
2844 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
2845 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
2846 tcmsg->tcm_family = AF_UNSPEC;
2847 tcmsg->tcm_ifindex = ifindex;
2848 /* Caller should fill in tcmsg->tcm_handle. */
2849 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' over 'rtnl_sock' with nl_sock_transact(), passing 'replyp'
 * through for the reply, and then uninitializes 'request' whatever the
 * outcome.  Callers in this file pass NULL for 'replyp' when they do not need
 * the reply.
 *
 * NOTE(review): the return statement is not visible; presumably 'error' is
 * returned. */
2855 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
2857 int error = nl_sock_transact(rtnl_sock, request, replyp);
2858 ofpbuf_uninit(request);
/* Body of the routine that reads the scheduler parameters from
 * /proc/net/psched.  Four hexadecimal values a, b, c, d are read from the file
 * and 'ticks_per_s' is computed as a * c / b.  Failures to open or read the
 * file, and invalid or unexpected parameters, are logged as warnings.
 *
 * NOTE(review): this routine's signature, the end of the comment below, the
 * validity checks, the assignment of 'buffer_hz', and the stream cleanup are
 * all missing from this listing. */
2865 /* The values in psched are not individually very meaningful, but they are
2866 * important. The tables below show some values seen in the wild.
2870 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
2871 * (Before that, there are hints that it was 1000000000.)
2873 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
2877 * -----------------------------------
2878 * [1] 000c8000 000f4240 000f4240 00000064
2879 * [2] 000003e8 00000400 000f4240 3b9aca00
2880 * [3] 000003e8 00000400 000f4240 3b9aca00
2881 * [4] 000003e8 00000400 000f4240 00000064
2882 * [5] 000003e8 00000040 000f4240 3b9aca00
2883 * [6] 000003e8 00000040 000f4240 000000f9
2885 * a b c d ticks_per_s buffer_hz
2886 * ------- --------- ---------- ------------- ----------- -------------
2887 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
2888 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
2889 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
2890 * [4] 1,000 1,024 1,000,000 100 976,562 100
2891 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
2892 * [6] 1,000 64 1,000,000 249 15,625,000 249
2894 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
2895 * [2] 2.6.26-1-686-bigmem from Debian lenny
2896 * [3] 2.6.26-2-sparc64 from Debian lenny
2897 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
2898 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
2899 * [6] 2.6.34 from kernel.org on KVM
2901 static const char fn[] = "/proc/net/psched";
2902 unsigned int a, b, c, d;
2908 stream = fopen(fn, "r");
2910 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
2914 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
2915 VLOG_WARN("%s: read failed", fn);
2919 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
2923 VLOG_WARN("%s: invalid scheduler parameters", fn);
2927 ticks_per_s = (double) a * c / b;
2931 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
2934 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
2937 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
2938 * rate of 'rate' bytes per second. */
/* NOTE(review): 'rate * ticks' is computed in unsigned int before the
 * division by the double 'ticks_per_s', so a product that does not fit in
 * unsigned int wraps around.  Lines between the signature and the return
 * statement are missing from this listing. */
2940 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
2945 return (rate * ticks) / ticks_per_s;
2948 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
2949 * rate of 'rate' bytes per second. */
/* Returns 0 when 'rate' is 0.  'ticks_per_s' is converted to unsigned long
 * long before the multiplication, so the product is computed in integer
 * arithmetic.  Lines between the signature and the return statement are
 * missing from this listing. */
2951 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
2956 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
2959 /* Returns the number of bytes that need to be reserved for qdisc buffering at
2960 * a transmission rate of 'rate' bytes per second. */
/* Computed as 'rate' / 'buffer_hz' in integer arithmetic.
 * NOTE(review): this assumes 'buffer_hz' is nonzero when the division runs;
 * lines between the signature and the return statement are missing here. */
2962 tc_buffer_per_jiffy(unsigned int rate)
2967 return rate / buffer_hz;
2970 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
2971 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
2972 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
2973 * stores NULL into it if it is absent.
2975 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
2978 * Returns 0 if successful, otherwise a positive errno value. */
2980 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
2981 struct nlattr **options)
/* TCA_KIND is a required string attribute; TCA_OPTIONS is an optional nested
 * attribute. */
2983 static const struct nl_policy tca_policy[] = {
2984 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
2985 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
2987 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
/* Attributes start right after the Netlink header and the tcmsg. */
2989 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
2990 tca_policy, ta, ARRAY_SIZE(ta))) {
2991 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
2996 *kind = nl_attr_get_string(ta[TCA_KIND]);
3000 *options = ta[TCA_OPTIONS];
3015 /* Given Netlink 'msg' that describes a class, extracts the class handle (whose
3016 * minor number callers use to derive a queue ID) into '*handlep', its TCA_OPTIONS attribute
3017 * into '*options', and its queue statistics into '*stats'. Any of the output
3018 * arguments may be null.
3020 * Returns 0 if successful, otherwise a positive errno value. */
3022 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3023 struct nlattr **options, struct netdev_queue_stats *stats)
/* Both TCA_OPTIONS and TCA_STATS2 are required nested attributes. */
3025 static const struct nl_policy tca_policy[] = {
3026 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3027 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3029 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3031 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3032 tca_policy, ta, ARRAY_SIZE(ta))) {
3033 VLOG_WARN_RL(&rl, "failed to parse class message");
/* The handle comes from the tcmsg that follows the Netlink header. */
3038 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3039 *handlep = tc->tcm_handle;
3043 *options = ta[TCA_OPTIONS];
3047 const struct gnet_stats_queue *gsq;
3048 struct gnet_stats_basic gsb;
/* Within TCA_STATS2, both the basic and the queue statistics are required. */
3050 static const struct nl_policy stats_policy[] = {
3051 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3052 .min_len = sizeof gsb },
3053 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3054 .min_len = sizeof *gsq },
3056 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3058 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3059 sa, ARRAY_SIZE(sa))) {
3060 VLOG_WARN_RL(&rl, "failed to parse class stats");
3064 /* Alignment issues screw up the length of struct gnet_stats_basic on
3065 * some arch/bitsize combinations. Newer versions of Linux have a
3066 * struct gnet_stats_basic_packed, but we can't depend on that. The
3067 * easiest thing to do is just to make a copy. */
3068 memset(&gsb, 0, sizeof gsb);
3069 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3070 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3071 stats->tx_bytes = gsb.bytes;
3072 stats->tx_packets = gsb.packets;
/* Drops reported by the queue statistics are exposed as 'tx_errors'. */
3074 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3075 stats->tx_errors = gsq->drops;
/* NOTE(review): the lines leading to this memset are missing from this
 * listing; presumably it zeroes '*stats' on a failure path. */
3085 memset(stats, 0, sizeof *stats);
/* tc_query_class(): builds an RTM_GETTCLASS request (with NLM_F_ECHO) for
 * 'netdev', sets the tcmsg handle and parent to 'handle' and 'parent', and
 * sends it with tc_transact(), which receives 'replyp' for the reply.  A
 * rate-limited warning naming the device, class, and parent is logged on
 * failure.
 *
 * NOTE(review): the end of the comment below, the return type, and the error
 * checks are missing from this listing. */
3090 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3093 tc_query_class(const struct netdev *netdev,
3094 unsigned int handle, unsigned int parent,
3095 struct ofpbuf **replyp)
3097 struct ofpbuf request;
3098 struct tcmsg *tcmsg;
3101 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3105 tcmsg->tcm_handle = handle;
3106 tcmsg->tcm_parent = parent;
3108 error = tc_transact(&request, replyp);
3110 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3111 netdev_get_name(netdev),
3112 tc_get_major(handle), tc_get_minor(handle),
3113 tc_get_major(parent), tc_get_minor(parent),
3119 /* Equivalent to "tc class del dev <name> handle <handle>". */
/* Builds an RTM_DELTCLASS request for 'netdev' with the tcmsg handle set to
 * 'handle' and the parent set to 0, then sends it without asking for a reply.
 * A rate-limited warning naming the device and class is logged on failure.
 *
 * NOTE(review): the return type and error checks are not visible here. */
3121 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3123 struct ofpbuf request;
3124 struct tcmsg *tcmsg;
3127 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3131 tcmsg->tcm_handle = handle;
3132 tcmsg->tcm_parent = 0;
3134 error = tc_transact(&request, NULL);
3136 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3137 netdev_get_name(netdev),
3138 tc_get_major(handle), tc_get_minor(handle),
3144 /* Equivalent to "tc qdisc del dev <name> root". */
/* Sends an RTM_DELQDISC request for handle 1:0 with parent TC_H_ROOT.  When
 * the request succeeds and the device has cached tc state, that state's
 * 'tc_destroy' hook is called if it is non-null and the cached pointer is
 * cleared.
 *
 * NOTE(review): the statement executed for EINVAL is not visible; the
 * existing comment implies the error is treated as success. */
3146 tc_del_qdisc(struct netdev *netdev)
3148 struct netdev_dev_linux *netdev_dev =
3149 netdev_dev_linux_cast(netdev_get_dev(netdev));
3150 struct ofpbuf request;
3151 struct tcmsg *tcmsg;
3154 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
3158 tcmsg->tcm_handle = tc_make_handle(1, 0);
3159 tcmsg->tcm_parent = TC_H_ROOT;
3161 error = tc_transact(&request, NULL);
3162 if (error == EINVAL) {
3163 /* EINVAL probably means that the default qdisc was in use, in which
3164 * case we've accomplished our purpose. */
/* Drop the cached tc state now that the kernel qdisc is gone. */
3167 if (!error && netdev_dev->tc) {
3168 if (netdev_dev->tc->ops->tc_destroy) {
3169 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3171 netdev_dev->tc = NULL;
3176 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3177 * kernel to determine what they are. Returns 0 if successful, otherwise a
3178 * positive errno value. */
/* The tc_ops to instantiate are chosen from the outcome of an RTM_GETQDISC
 * request for handle 1:0: a qdisc whose kind is recognized gets the matching
 * ops, ENOENT selects 'tc_ops_default', and the other visible paths select
 * 'tc_ops_other'.  The chosen ops' 'tc_load' hook is then called, and the
 * function returns the query error if there was one, otherwise the load
 * error.
 *
 * NOTE(review): several declarations, branch conditions, and the end of the
 * long comment below are missing from this listing. */
3180 tc_query_qdisc(const struct netdev *netdev)
3182 struct netdev_dev_linux *netdev_dev =
3183 netdev_dev_linux_cast(netdev_get_dev(netdev));
3184 struct ofpbuf request, *qdisc;
3185 const struct tc_ops *ops;
3186 struct tcmsg *tcmsg;
/* A non-null cached 'tc' means the qdisc is already known. */
3190 if (netdev_dev->tc) {
3194 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3195 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3196 * 2.6.35 without that fix backported to it.
3198 * To avoid the OOPS, we must not make a request that would attempt to dump
3199 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3200 * few others. There are a few ways that I can see to do this, but most of
3201 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3202 * technique chosen here is to assume that any non-default qdisc that we
3203 * create will have a class with handle 1:0. The built-in qdiscs only have
3204 * a class with handle 0:0.
3206 * We could check for Linux 2.6.35+ and use a more straightforward method
3208 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
3212 tcmsg->tcm_handle = tc_make_handle(1, 0);
3213 tcmsg->tcm_parent = 0;
3215 /* Figure out what tc class to instantiate. */
3216 error = tc_transact(&request, &qdisc);
3220 error = tc_parse_qdisc(qdisc, &kind, NULL);
3222 ops = &tc_ops_other;
3224 ops = tc_lookup_linux_name(kind);
/* This message has its own rate limiter, separate from 'rl'. */
3226 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3227 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3229 ops = &tc_ops_other;
3232 } else if (error == ENOENT) {
3233 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3234 * other entity that doesn't have a handle 1:0. We will assume
3235 * that it's the system default qdisc. */
3236 ops = &tc_ops_default;
3239 /* Who knows? Maybe the device got deleted. */
3240 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3241 netdev_get_name(netdev), strerror(error));
3242 ops = &tc_ops_other;
3245 /* Instantiate it. */
/* 'netdev' is cast to non-const because 'tc_load' takes a mutable netdev.
 * The assertion checks that 'tc_load' succeeded exactly when it left cached
 * tc state behind. */
3246 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
3247 assert((load_error == 0) == (netdev_dev->tc != NULL));
3248 ofpbuf_delete(qdisc);
3250 return error ? error : load_error;
3253 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3254 approximate the time to transmit packets of various lengths. For an MTU of
3255 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3256 represents two possible packet lengths; for an MTU of 513 through 1024, four
3257 possible lengths; and so on.
3259 Returns, for the specified 'mtu', the number of bits that packet lengths
3260 need to be shifted right to fit within such a 256-entry table. */
/* The Ethernet and VLAN header lengths are added to 'mtu' before the shift
 * count is computed.
 *
 * NOTE(review): the condition under which 'mtu' is replaced by
 * ETH_PAYLOAD_MAX, the loop body, and the return statement are missing from
 * this listing. */
3262 tc_calc_cell_log(unsigned int mtu)
3267 mtu = ETH_PAYLOAD_MAX;
3269 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
3271 for (cell_log = 0; mtu >= 256; cell_log++) {
/* tc_fill_rate(): zeroes '*rate', then sets 'cell_log' from
 * tc_calc_cell_log('mtu') and 'mpu' to ETH_TOTAL_MIN.  The 'overhead' and
 * 'cell_align' fields are left at zero by the memset; their assignments are
 * commented out below.
 *
 * NOTE(review): the end of the comment below and the assignment of the rate
 * field itself are missing from this listing. */
3278 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
3281 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
3283 memset(rate, 0, sizeof *rate);
3284 rate->cell_log = tc_calc_cell_log(mtu);
3285 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3286 /* rate->cell_align = 0; */ /* distro headers. */
3287 rate->mpu = ETH_TOTAL_MIN;
3291 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
3292 * attribute of the specified "type".
3294 * See tc_calc_cell_log() above for a description of "rtab"s. */
/* The attribute is TC_RTAB_SIZE bytes long.  Entry 'i' holds the number of
 * ticks needed to send a packet of (i + 1) << rate->cell_log bytes at
 * 'rate->rate', with the packet size raised to 'rate->mpu' when it is
 * smaller. */
3296 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
3301 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
3302 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
3303 unsigned packet_size = (i + 1) << rate->cell_log;
3304 if (packet_size < rate->mpu) {
3305 packet_size = rate->mpu;
3307 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* tc_calc_buffer(): the minimum burst is tc_buffer_per_jiffy('Bps') + 'mtu'
 * bytes; the larger of that and 'burst_bytes' is converted to ticks at rate
 * 'Bps' with tc_bytes_to_ticks() and returned.
 *
 * NOTE(review): the end of the comment below and the return type are missing
 * from this listing. */
3311 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
3312 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
3313 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
3316 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
3318 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
3319 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
3323 /* Utility functions. */
/* Retrieves statistics for the device with index 'ifindex' into '*stats' by
 * sending an RTM_GETLINK request over 'rtnl_sock' and copying the counters
 * out of the reply's IFLA_STATS attribute.  The reply is deleted on every
 * visible path after it has been received.
 *
 * NOTE(review): the return type, the check of the transaction result, and the
 * return statements are not visible in this listing. */
3326 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
3328 /* Policy for RTNLGRP_LINK messages.
3330 * There are *many* more fields in these messages, but currently we only
3331 * care about these fields. */
3332 static const struct nl_policy rtnlgrp_link_policy[] = {
3333 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
3334 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
3335 .min_len = sizeof(struct rtnl_link_stats) },
3338 struct ofpbuf request;
3339 struct ofpbuf *reply;
3340 struct ifinfomsg *ifi;
3341 const struct rtnl_link_stats *rtnl_stats;
3342 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
/* Build and send the RTM_GETLINK request; 'request' is released as soon as
 * the transaction returns. */
3345 ofpbuf_init(&request, 0);
3346 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
3347 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
3348 ifi->ifi_family = PF_UNSPEC;
3349 ifi->ifi_index = ifindex;
3350 error = nl_sock_transact(rtnl_sock, &request, &reply);
3351 ofpbuf_uninit(&request);
3356 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
3357 rtnlgrp_link_policy,
3358 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
3359 ofpbuf_delete(reply);
/* IFLA_STATS is optional in the policy, so its absence is checked here. */
3363 if (!attrs[IFLA_STATS]) {
3364 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
3365 ofpbuf_delete(reply);
/* Copy each counter field by field into the netdev_stats layout. */
3369 rtnl_stats = nl_attr_get(attrs[IFLA_STATS]);
3370 stats->rx_packets = rtnl_stats->rx_packets;
3371 stats->tx_packets = rtnl_stats->tx_packets;
3372 stats->rx_bytes = rtnl_stats->rx_bytes;
3373 stats->tx_bytes = rtnl_stats->tx_bytes;
3374 stats->rx_errors = rtnl_stats->rx_errors;
3375 stats->tx_errors = rtnl_stats->tx_errors;
3376 stats->rx_dropped = rtnl_stats->rx_dropped;
3377 stats->tx_dropped = rtnl_stats->tx_dropped;
3378 stats->multicast = rtnl_stats->multicast;
3379 stats->collisions = rtnl_stats->collisions;
3380 stats->rx_length_errors = rtnl_stats->rx_length_errors;
3381 stats->rx_over_errors = rtnl_stats->rx_over_errors;
3382 stats->rx_crc_errors = rtnl_stats->rx_crc_errors;
3383 stats->rx_frame_errors = rtnl_stats->rx_frame_errors;
3384 stats->rx_fifo_errors = rtnl_stats->rx_fifo_errors;
3385 stats->rx_missed_errors = rtnl_stats->rx_missed_errors;
3386 stats->tx_aborted_errors = rtnl_stats->tx_aborted_errors;
3387 stats->tx_carrier_errors = rtnl_stats->tx_carrier_errors;
3388 stats->tx_fifo_errors = rtnl_stats->tx_fifo_errors;
3389 stats->tx_heartbeat_errors = rtnl_stats->tx_heartbeat_errors;
3390 stats->tx_window_errors = rtnl_stats->tx_window_errors;
3392 ofpbuf_delete(reply);
/* Retrieves statistics for the device named 'netdev_name' into '*stats' by
 * scanning /proc/net/dev line by line.  Lines that do not yield the expected
 * 15 fields are reported as parse errors.  For the matching device, the
 * counters assigned below rather than parsed are set to UINT64_MAX
 * (presumably meaning "not available" -- confirm against the netdev_stats
 * contract).  A warning is logged when no stats are found for the device.
 *
 * NOTE(review): the return type, most of the sscanf() call, and the stream
 * cleanup are missing from this listing. */
3398 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
3400 static const char fn[] = "/proc/net/dev";
3405 stream = fopen(fn, "r");
3407 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
3412 while (fgets(line, sizeof line, stream)) {
3415 #define X64 "%"SCNu64
3418 X64 X64 X64 X64 X64 X64 X64 "%*u"
3419 X64 X64 X64 X64 X64 X64 X64 "%*u",
3425 &stats->rx_fifo_errors,
3426 &stats->rx_frame_errors,
3432 &stats->tx_fifo_errors,
3434 &stats->tx_carrier_errors) != 15) {
3435 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
3436 } else if (!strcmp(devname, netdev_name)) {
3437 stats->rx_length_errors = UINT64_MAX;
3438 stats->rx_over_errors = UINT64_MAX;
3439 stats->rx_crc_errors = UINT64_MAX;
3440 stats->rx_missed_errors = UINT64_MAX;
3441 stats->tx_aborted_errors = UINT64_MAX;
3442 stats->tx_heartbeat_errors = UINT64_MAX;
3443 stats->tx_window_errors = UINT64_MAX;
3449 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Reads 'netdev''s interface flags with the SIOCGIFFLAGS ioctl and stores
 * 'ifr.ifr_flags' into '*flags'.
 *
 * NOTE(review): the declaration of 'ifr', the end of the ioctl call, and the
 * return statement are not visible in this listing. */
3455 get_flags(const struct netdev *netdev, int *flags)
3460 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
3462 *flags = ifr.ifr_flags;
/* Sets 'netdev''s interface flags to 'flags' with the SIOCSIFFLAGS ioctl and
 * returns the result of netdev_linux_do_ioctl().
 *
 * NOTE(review): the declaration of 'ifr' and the end of the call are not
 * visible in this listing. */
3467 set_flags(struct netdev *netdev, int flags)
3471 ifr.ifr_flags = flags;
3472 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
/* Looks up the interface index of 'netdev_name' with the SIOCGIFINDEX ioctl
 * on 'af_inet_sock' and returns 'ifr.ifr_ifindex' on success.  A rate-limited
 * warning is logged when the ioctl fails.
 *
 * NOTE(review): strncpy() leaves 'ifr_name' without a terminating null byte
 * when 'netdev_name' is at least sizeof ifr.ifr_name bytes long, and no
 * zeroing of 'ifr' is visible here.  The value returned on failure is also
 * not visible in this listing. */
3477 do_get_ifindex(const char *netdev_name)
3481 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
3482 COVERAGE_INC(netdev_get_ifindex);
3483 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
3484 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
3485 netdev_name, strerror(errno));
3488 return ifr.ifr_ifindex;
/* Stores the interface index of 'netdev_' into '*ifindexp', using the value
 * cached in the device when VALID_IFINDEX is set in 'cache_valid' and
 * otherwise looking it up with do_get_ifindex() and caching the result.
 *
 * NOTE(review): the handling of a failed lookup and the return statements are
 * not visible in this listing. */
3492 get_ifindex(const struct netdev *netdev_, int *ifindexp)
3494 struct netdev_dev_linux *netdev_dev =
3495 netdev_dev_linux_cast(netdev_get_dev(netdev_));
3497 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
3498 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
3502 netdev_dev->cache_valid |= VALID_IFINDEX;
3503 netdev_dev->ifindex = ifindex;
3505 *ifindexp = netdev_dev->ifindex;
/* Reads the hardware address of 'netdev_name' with the SIOCGIFHWADDR ioctl on
 * 'af_inet_sock' and copies ETH_ADDR_LEN bytes of it into 'ea'.  An error is
 * logged when the ioctl fails, and a warning is logged when the address
 * family is neither AF_UNSPEC nor ARPHRD_ETHER.
 *
 * NOTE(review): the return type and return statements are not visible in
 * this listing. */
3510 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
3515 memset(&ifr, 0, sizeof ifr);
3516 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
3517 COVERAGE_INC(netdev_get_hwaddr);
3518 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
3519 VLOG_ERR("ioctl(SIOCGIFHWADDR) on %s device failed: %s",
3520 netdev_name, strerror(errno));
3523 hwaddr_family = ifr.ifr_hwaddr.sa_family;
3524 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
3525 VLOG_WARN("%s device has unknown hardware address family %d",
3526 netdev_name, hwaddr_family);
3528 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware address of 'netdev_name' to the ETH_ADDR_LEN bytes in
 * 'mac', with address family 'hwaddr_family', using the SIOCSIFHWADDR ioctl
 * on 'af_inet_sock'.  An error is logged when the ioctl fails.
 *
 * NOTE(review): the return type and return statements are not visible in
 * this listing. */
3533 set_etheraddr(const char *netdev_name, int hwaddr_family,
3534 const uint8_t mac[ETH_ADDR_LEN])
3538 memset(&ifr, 0, sizeof ifr);
3539 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
3540 ifr.ifr_hwaddr.sa_family = hwaddr_family;
3541 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
3542 COVERAGE_INC(netdev_set_hwaddr);
3543 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
3544 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
3545 netdev_name, strerror(errno));
/* Issues an ethtool request for the device named 'name' through the
 * SIOCETHTOOL ioctl on 'af_inet_sock', with 'ecmd' attached as the request
 * data.  Failures other than EOPNOTSUPP are logged with 'cmd_name';
 * EOPNOTSUPP is deliberately not logged.
 *
 * NOTE(review): the use of 'cmd' and the return statements are not visible in
 * this listing. */
3552 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
3553 int cmd, const char *cmd_name)
3557 memset(&ifr, 0, sizeof ifr);
3558 strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
3559 ifr.ifr_data = (caddr_t) ecmd;
3562 COVERAGE_INC(netdev_ethtool);
3563 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
3566 if (errno != EOPNOTSUPP) {
3567 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
3568 "failed: %s", cmd_name, name, strerror(errno));
3570 /* The device doesn't support this operation. That's pretty
3571 * common, so there's no point in logging anything. */
/* Copies 'name' into 'ifr->ifr_name' and issues ioctl 'cmd' with 'ifr' on
 * 'af_inet_sock'.  A failure is logged at debug level using 'cmd_name'.
 *
 * NOTE(review): the rest of the log call and the return statements are not
 * visible in this listing. */
3578 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
3579 const char *cmd_name)
3581 strncpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
3582 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
3583 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
3591 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
3592 int cmd, const char *cmd_name)
3597 ifr.ifr_addr.sa_family = AF_INET;
3598 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
3600 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
3601 *ip = sin->sin_addr;