2 * Copyright (c) 2009, 2010 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
21 #include <arpa/inet.h>
23 #include <linux/gen_stats.h>
24 #include <linux/if_tun.h>
26 #include <linux/types.h>
27 #include <linux/ethtool.h>
28 #include <linux/pkt_sched.h>
29 #include <linux/rtnetlink.h>
30 #include <linux/sockios.h>
31 #include <linux/version.h>
32 #include <sys/types.h>
33 #include <sys/ioctl.h>
34 #include <sys/socket.h>
35 #include <netpacket/packet.h>
36 #include <net/ethernet.h>
38 #include <linux/if_tunnel.h>
39 #include <net/if_arp.h>
40 #include <net/if_packet.h>
41 #include <net/route.h>
42 #include <netinet/in.h>
49 #include "dynamic-string.h"
50 #include "fatal-signal.h"
51 #include "netdev-provider.h"
52 #include "netdev-vport.h"
55 #include "openflow/openflow.h"
56 #include "openvswitch/gre.h"
58 #include "poll-loop.h"
59 #include "port-array.h"
60 #include "rtnetlink.h"
61 #include "socket-util.h"
66 VLOG_DEFINE_THIS_MODULE(netdev_linux)
68 /* These were introduced in Linux 2.6.14, so they might be missing if we have
70 #ifndef ADVERTISED_Pause
71 #define ADVERTISED_Pause (1 << 13)
73 #ifndef ADVERTISED_Asym_Pause
74 #define ADVERTISED_Asym_Pause (1 << 14)
77 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
80 #define TC_RTAB_SIZE 1024
83 static struct rtnetlink_notifier netdev_linux_cache_notifier;
84 static int cache_notifier_refcount;
87 VALID_IFINDEX = 1 << 0,
88 VALID_ETHERADDR = 1 << 1,
92 VALID_CARRIER = 1 << 5,
93 VALID_IS_PSEUDO = 1 << 6, /* Represents is_internal and is_tap. */
94 VALID_POLICING = 1 << 7,
95 VALID_HAVE_VPORT_STATS = 1 << 8
103 /* Traffic control. */
105 /* An instance of a traffic control class. Always associated with a particular
108 const struct tc_ops *ops;
110 /* Maps from queue ID to tc-specific data.
112 * The generic netdev TC layer uses this to the following extent: if an
113 * entry is nonnull, then the queue whose ID is the index is assumed to
114 * exist; if an entry is null, then that queue is assumed not to exist.
115 * Implementations must adhere to this scheme, although they may store
116 * whatever they like as data.
118 struct port_array queues;
121 /* A particular kind of traffic control. Each implementation generally maps to
122 * one particular Linux qdisc class.
124 * The functions below return 0 if successful or a positive errno value on
125 * failure, except where otherwise noted. All of them must be provided, except
126 * where otherwise noted. */
128 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
129 * This is null for tc_ops_default and tc_ops_other, for which there are no
130 * appropriate values. */
131 const char *linux_name;
133 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
134 const char *ovs_name;
136 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
137 * queues. The queues are numbered 0 through n_queues - 1. */
138 unsigned int n_queues;
140 /* Called to install this TC class on 'netdev'. The implementation should
141 * make the Netlink calls required to set up 'netdev' with the right qdisc
142 * and configure it according to 'details'. The implementation may assume
143 * that the current qdisc is the default; that is, there is no need for it
144 * to delete the current qdisc before installing itself.
146 * The contents of 'details' should be documented as valid for 'ovs_name'
147 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
148 * (which is built as ovs-vswitchd.conf.db(8)).
150 * This function must return 0 if and only if it sets 'netdev->tc' to an
151 * initialized 'struct tc'.
153 * (This function is null for tc_ops_other, which cannot be installed. For
154 * other TC classes it should always be nonnull.) */
155 int (*tc_install)(struct netdev *netdev, const struct shash *details);
157 /* Called when the netdev code determines (through a Netlink query) that
158 * this TC class's qdisc is installed on 'netdev', but we didn't install
159 * it ourselves and so don't know any of the details.
161 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
162 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
163 * implementation should parse the other attributes of 'nlmsg' as
164 * necessary to determine its configuration. If necessary it should also
165 * use Netlink queries to determine the configuration of queues on
168 * This function must return 0 if and only if it sets 'netdev->tc' to an
169 * initialized 'struct tc'. */
170 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
172 /* Destroys the data structures allocated by the implementation as part of
173 * 'tc'. (This includes destroying 'tc->queues' by calling
176 * The implementation should not need to perform any Netlink calls. If
177 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
178 * (But it may not be desirable.)
180 * This function may be null if 'tc' is trivial. */
181 void (*tc_destroy)(struct tc *tc);
183 /* Retrieves details of 'netdev->tc' configuration into 'details'.
185 * The implementation should not need to perform any Netlink calls, because
186 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
187 * cached the configuration.
189 * The contents of 'details' should be documented as valid for 'ovs_name'
190 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
191 * (which is built as ovs-vswitchd.conf.db(8)).
193 * This function may be null if 'tc' is not configurable.
195 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
197 /* Reconfigures 'netdev->tc' according to 'details', performing any
198 * required Netlink calls to complete the reconfiguration.
200 * The contents of 'details' should be documented as valid for 'ovs_name'
201 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
202 * (which is built as ovs-vswitchd.conf.db(8)).
204 * This function may be null if 'tc' is not configurable.
206 int (*qdisc_set)(struct netdev *, const struct shash *details);
208 /* Retrieves details of 'queue_id' on 'netdev->tc' into 'details'. The
209 * caller ensures that 'queues' has a nonnull value for index 'queue_id'.
211 * The contents of 'details' should be documented as valid for 'ovs_name'
212 * in the "other_config" column in the "Queue" table in
213 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
215 * The implementation should not need to perform any Netlink calls, because
216 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
217 * cached the queue configuration.
219 * This function may be null if 'tc' does not have queues ('n_queues' is
221 int (*class_get)(const struct netdev *netdev, unsigned int queue_id,
222 struct shash *details);
224 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
225 * 'details', performing any required Netlink calls to complete the
226 * reconfiguration. The caller ensures that 'queue_id' is less than
229 * The contents of 'details' should be documented as valid for 'ovs_name'
230 * in the "other_config" column in the "Queue" table in
231 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
233 * This function may be null if 'tc' does not have queues or its queues are
234 * not configurable. */
235 int (*class_set)(struct netdev *, unsigned int queue_id,
236 const struct shash *details);
238 /* Deletes 'queue_id' from 'netdev->tc'. The caller ensures that 'queues'
239 * has a nonnull value for index 'queue_id'.
241 * This function may be null if 'tc' does not have queues or its queues
242 * cannot be deleted. */
243 int (*class_delete)(struct netdev *, unsigned int queue_id);
245 /* Obtains stats for 'queue_id' from 'netdev->tc'. The caller ensures that
246 * 'queues' has a nonnull value for index 'queue_id'.
248 * On success, initializes '*stats'.
250 * This function may be null if 'tc' does not have queues or if it cannot
251 * report queue statistics. */
252 int (*class_get_stats)(const struct netdev *netdev, unsigned int queue_id,
253 struct netdev_queue_stats *stats);
255 /* Extracts queue stats from 'nlmsg', which is a response to a
256 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
258 * This function may be null if 'tc' does not have queues or if it cannot
259 * report queue statistics. */
260 int (*class_dump_stats)(const struct netdev *netdev,
261 const struct ofpbuf *nlmsg,
262 netdev_dump_queue_stats_cb *cb, void *aux);
266 tc_init(struct tc *tc, const struct tc_ops *ops)
269 port_array_init(&tc->queues);
273 tc_destroy(struct tc *tc)
275 port_array_destroy(&tc->queues);
278 static const struct tc_ops tc_ops_htb;
279 static const struct tc_ops tc_ops_default;
280 static const struct tc_ops tc_ops_other;
282 static const struct tc_ops *tcs[] = {
283 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
284 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
285 &tc_ops_other, /* Some other qdisc. */
289 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
290 static unsigned int tc_get_major(unsigned int handle);
291 static unsigned int tc_get_minor(unsigned int handle);
293 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
294 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
295 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
297 static struct tcmsg *tc_make_request(const struct netdev *, int type,
298 unsigned int flags, struct ofpbuf *);
299 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
301 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
302 struct nlattr **options);
303 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
304 struct nlattr **options,
305 struct netdev_queue_stats *);
306 static int tc_query_class(const struct netdev *,
307 unsigned int handle, unsigned int parent,
308 struct ofpbuf **replyp);
309 static int tc_delete_class(const struct netdev *, unsigned int handle);
311 static int tc_del_qdisc(struct netdev *netdev);
312 static int tc_query_qdisc(const struct netdev *netdev);
314 static int tc_calc_cell_log(unsigned int mtu);
315 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
316 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
317 const struct tc_ratespec *rate);
318 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
320 struct netdev_dev_linux {
321 struct netdev_dev netdev_dev;
323 struct shash_node *shash_node;
324 unsigned int cache_valid;
326 /* The following are figured out "on demand" only. They are only valid
327 * when the corresponding VALID_* bit in 'cache_valid' is set. */
329 uint8_t etheraddr[ETH_ADDR_LEN];
330 struct in_addr address, netmask;
334 bool is_internal; /* Is this an openvswitch internal device? */
335 bool is_tap; /* Is this a tuntap device? */
336 uint32_t kbits_rate; /* Policing data. */
337 uint32_t kbits_burst;
338 bool have_vport_stats;
342 struct tap_state tap;
346 struct netdev_linux {
347 struct netdev netdev;
351 /* An AF_INET socket (used for ioctl operations). */
352 static int af_inet_sock = -1;
354 /* A Netlink routing socket that is not subscribed to any multicast groups. */
355 static struct nl_sock *rtnl_sock;
357 struct netdev_linux_notifier {
358 struct netdev_notifier notifier;
362 static struct shash netdev_linux_notifiers =
363 SHASH_INITIALIZER(&netdev_linux_notifiers);
364 static struct rtnetlink_notifier netdev_linux_poll_notifier;
366 /* This is set pretty low because we probably won't learn anything from the
367 * additional log messages. */
368 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
370 static int netdev_linux_init(void);
372 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
373 int cmd, const char *cmd_name);
374 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
375 const char *cmd_name);
376 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
377 int cmd, const char *cmd_name);
378 static int get_flags(const struct netdev *, int *flagsp);
379 static int set_flags(struct netdev *, int flags);
380 static int do_get_ifindex(const char *netdev_name);
381 static int get_ifindex(const struct netdev *, int *ifindexp);
382 static int do_set_addr(struct netdev *netdev,
383 int ioctl_nr, const char *ioctl_name,
384 struct in_addr addr);
385 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
386 static int set_etheraddr(const char *netdev_name, int hwaddr_family,
387 const uint8_t[ETH_ADDR_LEN]);
388 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
389 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
392 is_netdev_linux_class(const struct netdev_class *netdev_class)
394 return netdev_class->init == netdev_linux_init;
397 static struct netdev_dev_linux *
398 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
400 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
401 assert(is_netdev_linux_class(netdev_class));
403 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
406 static struct netdev_linux *
407 netdev_linux_cast(const struct netdev *netdev)
409 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
410 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
411 assert(is_netdev_linux_class(netdev_class));
413 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
417 netdev_linux_init(void)
419 static int status = -1;
421 /* Create AF_INET socket. */
422 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
423 status = af_inet_sock >= 0 ? 0 : errno;
425 VLOG_ERR("failed to create inet socket: %s", strerror(status));
428 /* Create rtnetlink socket. */
430 status = nl_sock_create(NETLINK_ROUTE, 0, 0, 0, &rtnl_sock);
432 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
441 netdev_linux_run(void)
443 rtnetlink_notifier_run();
447 netdev_linux_wait(void)
449 rtnetlink_notifier_wait();
453 netdev_linux_cache_cb(const struct rtnetlink_change *change,
454 void *aux OVS_UNUSED)
456 struct netdev_dev_linux *dev;
458 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
460 const struct netdev_class *netdev_class =
461 netdev_dev_get_class(base_dev);
463 if (is_netdev_linux_class(netdev_class)) {
464 dev = netdev_dev_linux_cast(base_dev);
465 dev->cache_valid = 0;
469 struct shash device_shash;
470 struct shash_node *node;
472 shash_init(&device_shash);
473 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
474 SHASH_FOR_EACH (node, &device_shash) {
476 dev->cache_valid = 0;
478 shash_destroy(&device_shash);
482 /* Creates the netdev device of 'type' with 'name'. */
484 netdev_linux_create_system(const char *name, const char *type OVS_UNUSED,
485 const struct shash *args, struct netdev_dev **netdev_devp)
487 struct netdev_dev_linux *netdev_dev;
490 if (!shash_is_empty(args)) {
491 VLOG_WARN("%s: arguments for system devices should be empty", name);
494 if (!cache_notifier_refcount) {
495 error = rtnetlink_notifier_register(&netdev_linux_cache_notifier,
496 netdev_linux_cache_cb, NULL);
501 cache_notifier_refcount++;
503 netdev_dev = xzalloc(sizeof *netdev_dev);
504 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_linux_class);
506 *netdev_devp = &netdev_dev->netdev_dev;
510 /* For most types of netdevs we open the device for each call of
511 * netdev_open(). However, this is not the case with tap devices,
512 * since it is only possible to open the device once. In this
513 * situation we share a single file descriptor, and consequently
514 * buffers, across all readers. Therefore once data is read it will
515 * be unavailable to other reads for tap devices. */
517 netdev_linux_create_tap(const char *name, const char *type OVS_UNUSED,
518 const struct shash *args, struct netdev_dev **netdev_devp)
520 struct netdev_dev_linux *netdev_dev;
521 struct tap_state *state;
522 static const char tap_dev[] = "/dev/net/tun";
526 if (!shash_is_empty(args)) {
527 VLOG_WARN("%s: arguments for TAP devices should be empty", name);
530 netdev_dev = xzalloc(sizeof *netdev_dev);
531 state = &netdev_dev->state.tap;
533 /* Open tap device. */
534 state->fd = open(tap_dev, O_RDWR);
537 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
541 /* Create tap device. */
542 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
543 strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
544 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
545 VLOG_WARN("%s: creating tap device failed: %s", name,
551 /* Make non-blocking. */
552 error = set_nonblocking(state->fd);
557 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
558 *netdev_devp = &netdev_dev->netdev_dev;
567 destroy_tap(struct netdev_dev_linux *netdev_dev)
569 struct tap_state *state = &netdev_dev->state.tap;
571 if (state->fd >= 0) {
576 /* Destroys the netdev device 'netdev_dev_'. */
578 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
580 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
581 const char *type = netdev_dev_get_type(netdev_dev_);
583 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
584 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
587 if (!strcmp(type, "system")) {
588 cache_notifier_refcount--;
590 if (!cache_notifier_refcount) {
591 rtnetlink_notifier_unregister(&netdev_linux_cache_notifier);
593 } else if (!strcmp(type, "tap")) {
594 destroy_tap(netdev_dev);
601 netdev_linux_open(struct netdev_dev *netdev_dev_, int ethertype,
602 struct netdev **netdevp)
604 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
605 struct netdev_linux *netdev;
606 enum netdev_flags flags;
609 /* Allocate network device. */
610 netdev = xzalloc(sizeof *netdev);
612 netdev_init(&netdev->netdev, netdev_dev_);
614 error = netdev_get_flags(&netdev->netdev, &flags);
615 if (error == ENODEV) {
619 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
620 !netdev_dev->state.tap.opened) {
622 /* We assume that the first user of the tap device is the primary user
623 * and give them the tap FD. Subsequent users probably just expect
624 * this to be a system device so open it normally to avoid send/receive
625 * directions appearing to be reversed. */
626 netdev->fd = netdev_dev->state.tap.fd;
627 netdev_dev->state.tap.opened = true;
628 } else if (ethertype != NETDEV_ETH_TYPE_NONE) {
629 struct sockaddr_ll sll;
633 /* Create file descriptor. */
634 protocol = (ethertype == NETDEV_ETH_TYPE_ANY ? ETH_P_ALL
635 : ethertype == NETDEV_ETH_TYPE_802_2 ? ETH_P_802_2
637 netdev->fd = socket(PF_PACKET, SOCK_RAW, htons(protocol));
638 if (netdev->fd < 0) {
643 /* Set non-blocking mode. */
644 error = set_nonblocking(netdev->fd);
649 /* Get ethernet device index. */
650 error = get_ifindex(&netdev->netdev, &ifindex);
655 /* Bind to specific ethernet device. */
656 memset(&sll, 0, sizeof sll);
657 sll.sll_family = AF_PACKET;
658 sll.sll_ifindex = ifindex;
660 (struct sockaddr *) &sll, sizeof sll) < 0) {
662 VLOG_ERR("bind to %s failed: %s", netdev_dev_get_name(netdev_dev_),
667 /* Between the socket() and bind() calls above, the socket receives all
668 * packets of the requested type on all system interfaces. We do not
669 * want to receive that data, but there is no way to avoid it. So we
670 * must now drain out the receive queue. */
671 error = drain_rcvbuf(netdev->fd);
677 *netdevp = &netdev->netdev;
681 netdev_uninit(&netdev->netdev, true);
685 /* Closes and destroys 'netdev'. */
687 netdev_linux_close(struct netdev *netdev_)
689 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
691 if (netdev->fd >= 0 && strcmp(netdev_get_type(netdev_), "tap")) {
697 /* Initializes 'svec' with a list of the names of all known network devices. */
699 netdev_linux_enumerate(struct svec *svec)
701 struct if_nameindex *names;
703 names = if_nameindex();
707 for (i = 0; names[i].if_name != NULL; i++) {
708 svec_add(svec, names[i].if_name);
710 if_freenameindex(names);
713 VLOG_WARN("could not obtain list of network device names: %s",
720 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
722 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
724 if (netdev->fd < 0) {
725 /* Device was opened with NETDEV_ETH_TYPE_NONE. */
730 ssize_t retval = read(netdev->fd, data, size);
733 } else if (errno != EINTR) {
734 if (errno != EAGAIN) {
735 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
736 netdev_get_name(netdev_), strerror(errno));
743 /* Registers with the poll loop to wake up from the next call to poll_block()
744 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
746 netdev_linux_recv_wait(struct netdev *netdev_)
748 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
749 if (netdev->fd >= 0) {
750 poll_fd_wait(netdev->fd, POLLIN);
754 /* Discards all packets waiting to be received from 'netdev'. */
756 netdev_linux_drain(struct netdev *netdev_)
758 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
759 if (netdev->fd < 0) {
761 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
763 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
764 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
768 drain_fd(netdev->fd, ifr.ifr_qlen);
771 return drain_rcvbuf(netdev->fd);
775 /* Sends the 'size' bytes at 'data' on 'netdev'. Returns 0 if successful,
776 * otherwise a positive errno value. Returns EAGAIN without blocking if the
777 * packet cannot be queued immediately. Returns EMSGSIZE if a partial packet was
778 * transmitted or if the packet is too big or too small to transmit on the device.
780 * The caller retains ownership of 'data' in all cases.
782 * The kernel maintains a packet transmission queue, so the caller is not
783 * expected to do additional queuing of packets. */
785 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
787 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
789 /* XXX should support sending even if 'ethertype' was NETDEV_ETH_TYPE_NONE.
791 if (netdev->fd < 0) {
796 ssize_t retval = write(netdev->fd, data, size);
798 /* The Linux AF_PACKET implementation never blocks waiting for room
799 * for packets, instead returning ENOBUFS. Translate this into
800 * EAGAIN for the caller. */
801 if (errno == ENOBUFS) {
803 } else if (errno == EINTR) {
805 } else if (errno != EAGAIN) {
806 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
807 netdev_get_name(netdev_), strerror(errno));
810 } else if (retval != size) {
811 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
812 "%zu) on %s", retval, size, netdev_get_name(netdev_));
820 /* Registers with the poll loop to wake up from the next call to poll_block()
821 * when the packet transmission queue has sufficient room to transmit a packet
822 * with netdev_send().
824 * The kernel maintains a packet transmission queue, so the client is not
825 * expected to do additional queuing of packets. Thus, this function is
826 * unlikely to ever be used. It is included for completeness. */
828 netdev_linux_send_wait(struct netdev *netdev_)
830 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
831 if (netdev->fd < 0) {
833 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
834 poll_fd_wait(netdev->fd, POLLOUT);
836 /* TAP device always accepts packets.*/
837 poll_immediate_wake();
841 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
842 * otherwise a positive errno value. */
844 netdev_linux_set_etheraddr(struct netdev *netdev_,
845 const uint8_t mac[ETH_ADDR_LEN])
847 struct netdev_dev_linux *netdev_dev =
848 netdev_dev_linux_cast(netdev_get_dev(netdev_));
851 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
852 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
853 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
855 netdev_dev->cache_valid |= VALID_ETHERADDR;
856 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
864 /* Copies 'netdev''s MAC address into 'mac', a buffer of ETH_ADDR_LEN bytes
865 * supplied by the caller. */
867 netdev_linux_get_etheraddr(const struct netdev *netdev_,
868 uint8_t mac[ETH_ADDR_LEN])
870 struct netdev_dev_linux *netdev_dev =
871 netdev_dev_linux_cast(netdev_get_dev(netdev_));
872 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
873 int error = get_etheraddr(netdev_get_name(netdev_),
874 netdev_dev->etheraddr);
878 netdev_dev->cache_valid |= VALID_ETHERADDR;
880 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
884 /* Stores in '*mtup' the maximum size of transmitted (and received) packets on
885 * 'netdev', in bytes, not including the hardware header; thus, this is
886 * typically 1500 bytes for Ethernet devices. */
888 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
890 struct netdev_dev_linux *netdev_dev =
891 netdev_dev_linux_cast(netdev_get_dev(netdev_));
892 if (!(netdev_dev->cache_valid & VALID_MTU)) {
896 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
897 SIOCGIFMTU, "SIOCGIFMTU");
901 netdev_dev->mtu = ifr.ifr_mtu;
902 netdev_dev->cache_valid |= VALID_MTU;
904 *mtup = netdev_dev->mtu;
908 /* Returns the ifindex of 'netdev', if successful, as a positive number.
909 * On failure, returns a negative errno value. */
911 netdev_linux_get_ifindex(const struct netdev *netdev)
915 error = get_ifindex(netdev, &ifindex);
916 return error ? -error : ifindex;
920 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
922 struct netdev_dev_linux *netdev_dev =
923 netdev_dev_linux_cast(netdev_get_dev(netdev_));
928 if (!(netdev_dev->cache_valid & VALID_CARRIER)) {
932 fn = xasprintf("/sys/class/net/%s/carrier",
933 netdev_get_name(netdev_));
934 fd = open(fn, O_RDONLY);
937 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
941 retval = read(fd, line, sizeof line);
944 if (error == EINVAL) {
945 /* This is the normal return value when we try to check carrier
946 * if the network device is not up. */
948 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
951 } else if (retval == 0) {
953 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
957 if (line[0] != '0' && line[0] != '1') {
959 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)",
963 netdev_dev->carrier = line[0] != '0';
964 netdev_dev->cache_valid |= VALID_CARRIER;
966 *carrier = netdev_dev->carrier;
977 /* Check whether we can use RTM_GETLINK to get network device statistics.
978 * In pre-2.6.19 kernels, this was only available if wireless extensions were
981 check_for_working_netlink_stats(void)
983 /* Decide on the netdev_get_stats() implementation to use. Netlink is
984 * preferable, so if that works, we'll use it. */
985 int ifindex = do_get_ifindex("lo");
987 VLOG_WARN("failed to get ifindex for lo, "
988 "obtaining netdev stats from proc");
991 struct netdev_stats stats;
992 int error = get_stats_via_netlink(ifindex, &stats);
994 VLOG_DBG("obtaining netdev stats via rtnetlink");
997 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
998 "via proc (you are probably running a pre-2.6.19 "
999 "kernel)", strerror(error));
1005 /* Brings the 'is_internal' and 'is_tap' members of 'netdev_dev' up-to-date. */
1007 netdev_linux_update_is_pseudo(struct netdev_dev_linux *netdev_dev)
1009 if (!(netdev_dev->cache_valid & VALID_IS_PSEUDO)) {
1010 const char *name = netdev_dev_get_name(&netdev_dev->netdev_dev);
1011 const char *type = netdev_dev_get_type(&netdev_dev->netdev_dev);
1013 netdev_dev->is_tap = !strcmp(type, "tap");
1014 netdev_dev->is_internal = false;
1015 if (!netdev_dev->is_tap) {
1016 struct ethtool_drvinfo drvinfo;
1019 memset(&drvinfo, 0, sizeof drvinfo);
1020 error = netdev_linux_do_ethtool(name,
1021 (struct ethtool_cmd *)&drvinfo,
1023 "ETHTOOL_GDRVINFO");
1025 if (!error && !strcmp(drvinfo.driver, "openvswitch")) {
1026 netdev_dev->is_internal = true;
1030 netdev_dev->cache_valid |= VALID_IS_PSEUDO;
1035 swap_uint64(uint64_t *a, uint64_t *b)
1042 /* Retrieves current device stats for 'netdev'. */
1044 netdev_linux_get_stats(const struct netdev *netdev_,
1045 struct netdev_stats *stats)
1047 struct netdev_dev_linux *netdev_dev =
1048 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1049 static int use_netlink_stats = -1;
1052 COVERAGE_INC(netdev_get_stats);
1054 if (netdev_dev->have_vport_stats ||
1055 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1057 error = netdev_vport_get_stats(netdev_, stats);
1058 netdev_dev->have_vport_stats = !error;
1059 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
1062 if (!netdev_dev->have_vport_stats) {
1063 if (use_netlink_stats < 0) {
1064 use_netlink_stats = check_for_working_netlink_stats();
1066 if (use_netlink_stats) {
1069 error = get_ifindex(netdev_, &ifindex);
1071 error = get_stats_via_netlink(ifindex, stats);
1074 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1078 /* If this port is an internal port then the transmit and receive stats
1079 * will appear to be swapped relative to the other ports since we are the
1080 * one sending the data, not a remote computer. For consistency, we swap
1081 * them back here. This does not apply if we are getting stats from the
1082 * vport layer because it always tracks stats from the perspective of the
1084 netdev_linux_update_is_pseudo(netdev_dev);
1085 if (!error && !netdev_dev->have_vport_stats &&
1086 (netdev_dev->is_internal || netdev_dev->is_tap)) {
1087 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1088 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1089 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1090 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1091 stats->rx_length_errors = 0;
1092 stats->rx_over_errors = 0;
1093 stats->rx_crc_errors = 0;
1094 stats->rx_frame_errors = 0;
1095 stats->rx_fifo_errors = 0;
1096 stats->rx_missed_errors = 0;
1097 stats->tx_aborted_errors = 0;
1098 stats->tx_carrier_errors = 0;
1099 stats->tx_fifo_errors = 0;
1100 stats->tx_heartbeat_errors = 0;
1101 stats->tx_window_errors = 0;
1107 /* Stores the features supported by 'netdev' into each of '*current',
1108 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1109 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
1110 * successful, otherwise a positive errno value. */
1112 netdev_linux_get_features(struct netdev *netdev,
1113 uint32_t *current, uint32_t *advertised,
1114 uint32_t *supported, uint32_t *peer)
1116 struct ethtool_cmd ecmd;
1119 memset(&ecmd, 0, sizeof ecmd);
1120 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1121 ETHTOOL_GSET, "ETHTOOL_GSET");
1126 /* Supported features. */
1128 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1129 *supported |= OFPPF_10MB_HD;
1131 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1132 *supported |= OFPPF_10MB_FD;
1134 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1135 *supported |= OFPPF_100MB_HD;
1137 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1138 *supported |= OFPPF_100MB_FD;
1140 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1141 *supported |= OFPPF_1GB_HD;
1143 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1144 *supported |= OFPPF_1GB_FD;
1146 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1147 *supported |= OFPPF_10GB_FD;
1149 if (ecmd.supported & SUPPORTED_TP) {
1150 *supported |= OFPPF_COPPER;
1152 if (ecmd.supported & SUPPORTED_FIBRE) {
1153 *supported |= OFPPF_FIBER;
1155 if (ecmd.supported & SUPPORTED_Autoneg) {
1156 *supported |= OFPPF_AUTONEG;
1158 if (ecmd.supported & SUPPORTED_Pause) {
1159 *supported |= OFPPF_PAUSE;
1161 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1162 *supported |= OFPPF_PAUSE_ASYM;
1165 /* Advertised features. */
1167 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1168 *advertised |= OFPPF_10MB_HD;
1170 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1171 *advertised |= OFPPF_10MB_FD;
1173 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1174 *advertised |= OFPPF_100MB_HD;
1176 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1177 *advertised |= OFPPF_100MB_FD;
1179 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1180 *advertised |= OFPPF_1GB_HD;
1182 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1183 *advertised |= OFPPF_1GB_FD;
1185 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1186 *advertised |= OFPPF_10GB_FD;
1188 if (ecmd.advertising & ADVERTISED_TP) {
1189 *advertised |= OFPPF_COPPER;
1191 if (ecmd.advertising & ADVERTISED_FIBRE) {
1192 *advertised |= OFPPF_FIBER;
1194 if (ecmd.advertising & ADVERTISED_Autoneg) {
1195 *advertised |= OFPPF_AUTONEG;
1197 if (ecmd.advertising & ADVERTISED_Pause) {
1198 *advertised |= OFPPF_PAUSE;
1200 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1201 *advertised |= OFPPF_PAUSE_ASYM;
1204 /* Current settings. */
1205 if (ecmd.speed == SPEED_10) {
1206 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1207 } else if (ecmd.speed == SPEED_100) {
1208 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1209 } else if (ecmd.speed == SPEED_1000) {
1210 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1211 } else if (ecmd.speed == SPEED_10000) {
1212 *current = OFPPF_10GB_FD;
1217 if (ecmd.port == PORT_TP) {
1218 *current |= OFPPF_COPPER;
1219 } else if (ecmd.port == PORT_FIBRE) {
1220 *current |= OFPPF_FIBER;
1224 *current |= OFPPF_AUTONEG;
1227 /* Peer advertisements. */
1228 *peer = 0; /* XXX */
1233 /* Set the features advertised by 'netdev' to 'advertise'. */
1235 netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1237 struct ethtool_cmd ecmd;
1240 memset(&ecmd, 0, sizeof ecmd);
1241 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1242 ETHTOOL_GSET, "ETHTOOL_GSET");
1247 ecmd.advertising = 0;
1248 if (advertise & OFPPF_10MB_HD) {
1249 ecmd.advertising |= ADVERTISED_10baseT_Half;
1251 if (advertise & OFPPF_10MB_FD) {
1252 ecmd.advertising |= ADVERTISED_10baseT_Full;
1254 if (advertise & OFPPF_100MB_HD) {
1255 ecmd.advertising |= ADVERTISED_100baseT_Half;
1257 if (advertise & OFPPF_100MB_FD) {
1258 ecmd.advertising |= ADVERTISED_100baseT_Full;
1260 if (advertise & OFPPF_1GB_HD) {
1261 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1263 if (advertise & OFPPF_1GB_FD) {
1264 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1266 if (advertise & OFPPF_10GB_FD) {
1267 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1269 if (advertise & OFPPF_COPPER) {
1270 ecmd.advertising |= ADVERTISED_TP;
1272 if (advertise & OFPPF_FIBER) {
1273 ecmd.advertising |= ADVERTISED_FIBRE;
1275 if (advertise & OFPPF_AUTONEG) {
1276 ecmd.advertising |= ADVERTISED_Autoneg;
1278 if (advertise & OFPPF_PAUSE) {
1279 ecmd.advertising |= ADVERTISED_Pause;
1281 if (advertise & OFPPF_PAUSE_ASYM) {
1282 ecmd.advertising |= ADVERTISED_Asym_Pause;
1284 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1285 ETHTOOL_SSET, "ETHTOOL_SSET");
1288 /* If 'netdev_name' is the name of a VLAN network device (e.g. one created with
1289 * vconfig(8)), sets '*vlan_vid' to the VLAN VID associated with that device
1290 * and returns 0. Otherwise returns a errno value (specifically ENOENT if
1291 * 'netdev_name' is the name of a network device that is not a VLAN device) and
1292 * sets '*vlan_vid' to -1. */
1294 netdev_linux_get_vlan_vid(const struct netdev *netdev, int *vlan_vid)
1296 const char *netdev_name = netdev_get_name(netdev);
1297 struct ds line = DS_EMPTY_INITIALIZER;
1298 FILE *stream = NULL;
1302 COVERAGE_INC(netdev_get_vlan_vid);
1303 fn = xasprintf("/proc/net/vlan/%s", netdev_name);
1304 stream = fopen(fn, "r");
1310 if (ds_get_line(&line, stream)) {
1311 if (ferror(stream)) {
1313 VLOG_ERR_RL(&rl, "error reading \"%s\": %s", fn, strerror(errno));
1316 VLOG_ERR_RL(&rl, "unexpected end of file reading \"%s\"", fn);
1321 if (!sscanf(ds_cstr(&line), "%*s VID: %d", vlan_vid)) {
1323 VLOG_ERR_RL(&rl, "parse error reading \"%s\" line 1: \"%s\"",
1324 fn, ds_cstr(&line));
/* printf-style templates for the tc(8) commands that install ingress
 * policing.  POLICE_ADD_CMD takes the device name.  POLICE_CONFIG_CMD takes
 * the device name, the rate in kbit/s, and the burst size, the latter two as
 * unsigned values (hence %u). */
#define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress"
#define POLICE_CONFIG_CMD \
    "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 " \
    "u32 match ip src 0.0.0.0/0 police rate %ukbit burst %uk " \
    "mtu 65535 drop flowid :1"
1345 /* Remove ingress policing from 'netdev'. Returns 0 if successful, otherwise a
1346 * positive errno value.
1348 * This function is equivalent to running
1349 * /sbin/tc qdisc del dev %s handle ffff: ingress
1350 * but it is much, much faster.
1353 netdev_linux_remove_policing(struct netdev *netdev)
1355 struct netdev_dev_linux *netdev_dev =
1356 netdev_dev_linux_cast(netdev_get_dev(netdev));
1357 const char *netdev_name = netdev_get_name(netdev);
1359 struct ofpbuf request;
1360 struct tcmsg *tcmsg;
1363 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
1364 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
1365 tcmsg->tcm_parent = TC_H_INGRESS;
1366 nl_msg_put_string(&request, TCA_KIND, "ingress");
1367 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
1369 error = tc_transact(&request, NULL);
1370 if (error && error != ENOENT && error != EINVAL) {
1371 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1372 netdev_name, strerror(error));
1376 netdev_dev->kbits_rate = 0;
1377 netdev_dev->kbits_burst = 0;
1378 netdev_dev->cache_valid |= VALID_POLICING;
1382 /* Attempts to set input rate limiting (policing) policy. */
1384 netdev_linux_set_policing(struct netdev *netdev,
1385 uint32_t kbits_rate, uint32_t kbits_burst)
1387 struct netdev_dev_linux *netdev_dev =
1388 netdev_dev_linux_cast(netdev_get_dev(netdev));
1389 const char *netdev_name = netdev_get_name(netdev);
1392 COVERAGE_INC(netdev_set_policing);
1394 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1395 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1396 : kbits_burst); /* Stick with user-specified value. */
1398 if (netdev_dev->cache_valid & VALID_POLICING
1399 && netdev_dev->kbits_rate == kbits_rate
1400 && netdev_dev->kbits_burst == kbits_burst) {
1401 /* Assume that settings haven't changed since we last set them. */
1405 netdev_linux_remove_policing(netdev);
1407 snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
1408 if (system(command) != 0) {
1409 VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
1413 snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
1414 kbits_rate, kbits_burst);
1415 if (system(command) != 0) {
1416 VLOG_WARN_RL(&rl, "%s: problem configuring policing",
1421 netdev_dev->kbits_rate = kbits_rate;
1422 netdev_dev->kbits_burst = kbits_burst;
1423 netdev_dev->cache_valid |= VALID_POLICING;
1430 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1433 const struct tc_ops **opsp;
1435 for (opsp = tcs; *opsp != NULL; opsp++) {
1436 const struct tc_ops *ops = *opsp;
1437 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1438 svec_add(types, ops->ovs_name);
1444 static const struct tc_ops *
1445 tc_lookup_ovs_name(const char *name)
1447 const struct tc_ops **opsp;
1449 for (opsp = tcs; *opsp != NULL; opsp++) {
1450 const struct tc_ops *ops = *opsp;
1451 if (!strcmp(name, ops->ovs_name)) {
1458 static const struct tc_ops *
1459 tc_lookup_linux_name(const char *name)
1461 const struct tc_ops **opsp;
1463 for (opsp = tcs; *opsp != NULL; opsp++) {
1464 const struct tc_ops *ops = *opsp;
1465 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1473 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1475 struct netdev_qos_capabilities *caps)
1477 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1481 caps->n_queues = ops->n_queues;
1486 netdev_linux_get_qos(const struct netdev *netdev,
1487 const char **typep, struct shash *details)
1489 struct netdev_dev_linux *netdev_dev =
1490 netdev_dev_linux_cast(netdev_get_dev(netdev));
1493 error = tc_query_qdisc(netdev);
1498 *typep = netdev_dev->tc->ops->ovs_name;
1499 return (netdev_dev->tc->ops->qdisc_get
1500 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
1505 netdev_linux_set_qos(struct netdev *netdev,
1506 const char *type, const struct shash *details)
1508 struct netdev_dev_linux *netdev_dev =
1509 netdev_dev_linux_cast(netdev_get_dev(netdev));
1510 const struct tc_ops *new_ops;
1513 new_ops = tc_lookup_ovs_name(type);
1514 if (!new_ops || !new_ops->tc_install) {
1518 error = tc_query_qdisc(netdev);
1523 if (new_ops == netdev_dev->tc->ops) {
1524 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1526 /* Delete existing qdisc. */
1527 error = tc_del_qdisc(netdev);
1531 assert(netdev_dev->tc == NULL);
1533 /* Install new qdisc. */
1534 error = new_ops->tc_install(netdev, details);
1535 assert((error == 0) == (netdev_dev->tc != NULL));
1542 netdev_linux_get_queue(const struct netdev *netdev,
1543 unsigned int queue_id, struct shash *details)
1545 struct netdev_dev_linux *netdev_dev =
1546 netdev_dev_linux_cast(netdev_get_dev(netdev));
1549 error = tc_query_qdisc(netdev);
1552 } else if (queue_id > UINT16_MAX
1553 || !port_array_get(&netdev_dev->tc->queues, queue_id)) {
1557 return netdev_dev->tc->ops->class_get(netdev, queue_id, details);
1561 netdev_linux_set_queue(struct netdev *netdev,
1562 unsigned int queue_id, const struct shash *details)
1564 struct netdev_dev_linux *netdev_dev =
1565 netdev_dev_linux_cast(netdev_get_dev(netdev));
1568 error = tc_query_qdisc(netdev);
1571 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1572 || !netdev_dev->tc->ops->class_set) {
1576 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
1580 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1582 struct netdev_dev_linux *netdev_dev =
1583 netdev_dev_linux_cast(netdev_get_dev(netdev));
1586 error = tc_query_qdisc(netdev);
1589 } else if (!netdev_dev->tc->ops->class_delete) {
1591 } else if (queue_id > UINT16_MAX
1592 || !port_array_get(&netdev_dev->tc->queues, queue_id)) {
1596 return netdev_dev->tc->ops->class_delete(netdev, queue_id);
1600 netdev_linux_get_queue_stats(const struct netdev *netdev,
1601 unsigned int queue_id,
1602 struct netdev_queue_stats *stats)
1604 struct netdev_dev_linux *netdev_dev =
1605 netdev_dev_linux_cast(netdev_get_dev(netdev));
1608 error = tc_query_qdisc(netdev);
1611 } else if (queue_id > UINT16_MAX
1612 || !port_array_get(&netdev_dev->tc->queues, queue_id)) {
1614 } else if (!netdev_dev->tc->ops->class_get_stats) {
1618 return netdev_dev->tc->ops->class_get_stats(netdev, queue_id, stats);
1622 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1624 struct ofpbuf request;
1625 struct tcmsg *tcmsg;
1627 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1628 tcmsg->tcm_parent = 0;
1629 nl_dump_start(dump, rtnl_sock, &request);
1630 ofpbuf_uninit(&request);
1634 netdev_linux_dump_queues(const struct netdev *netdev,
1635 netdev_dump_queues_cb *cb, void *aux)
1637 struct netdev_dev_linux *netdev_dev =
1638 netdev_dev_linux_cast(netdev_get_dev(netdev));
1639 unsigned int queue_id;
1640 struct shash details;
1645 error = tc_query_qdisc(netdev);
1648 } else if (!netdev_dev->tc->ops->class_get) {
1653 shash_init(&details);
1654 PORT_ARRAY_FOR_EACH (queue, &netdev_dev->tc->queues, queue_id) {
1655 shash_clear(&details);
1657 error = netdev_dev->tc->ops->class_get(netdev, queue_id, &details);
1659 (*cb)(queue_id, &details, aux);
1664 shash_destroy(&details);
1670 netdev_linux_dump_queue_stats(const struct netdev *netdev,
1671 netdev_dump_queue_stats_cb *cb, void *aux)
1673 struct netdev_dev_linux *netdev_dev =
1674 netdev_dev_linux_cast(netdev_get_dev(netdev));
1675 struct nl_dump dump;
1680 error = tc_query_qdisc(netdev);
1683 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1688 start_queue_dump(netdev, &dump);
1689 while (nl_dump_next(&dump, &msg)) {
1690 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1696 error = nl_dump_done(&dump);
1697 return error ? error : last_error;
1701 netdev_linux_get_in4(const struct netdev *netdev_,
1702 struct in_addr *address, struct in_addr *netmask)
1704 struct netdev_dev_linux *netdev_dev =
1705 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1707 if (!(netdev_dev->cache_valid & VALID_IN4)) {
1710 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
1711 SIOCGIFADDR, "SIOCGIFADDR");
1716 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
1717 SIOCGIFNETMASK, "SIOCGIFNETMASK");
1722 netdev_dev->cache_valid |= VALID_IN4;
1724 *address = netdev_dev->address;
1725 *netmask = netdev_dev->netmask;
1726 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
1730 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
1731 struct in_addr netmask)
1733 struct netdev_dev_linux *netdev_dev =
1734 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1737 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
1739 netdev_dev->cache_valid |= VALID_IN4;
1740 netdev_dev->address = address;
1741 netdev_dev->netmask = netmask;
1742 if (address.s_addr != INADDR_ANY) {
1743 error = do_set_addr(netdev_, SIOCSIFNETMASK,
1744 "SIOCSIFNETMASK", netmask);
/* Parses 'line', one line in the format of /proc/net/if_inet6: 32 hex digits
 * of IPv6 address, four hex fields that are skipped, and an interface name of
 * at most 16 characters.  On success stores the address in '*in6' and the
 * name in 'ifname' and returns true.  Returns false if the line does not
 * match, in which case '*in6' and 'ifname' may be partially written. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *octet = in6->s6_addr;
    int n_converted;

#define X8 "%2" SCNx8
    n_converted = sscanf(line,
                         " " X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                         "%*x %*x %*x %*x %16s\n",
                         &octet[0], &octet[1], &octet[2], &octet[3],
                         &octet[4], &octet[5], &octet[6], &octet[7],
                         &octet[8], &octet[9], &octet[10], &octet[11],
                         &octet[12], &octet[13], &octet[14], &octet[15],
                         ifname);

    /* 16 address octets plus the interface name. */
    return n_converted == 17;
}
1766 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
1767 * 'in6' is non-null) and returns true. Otherwise, returns false. */
1769 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
1771 struct netdev_dev_linux *netdev_dev =
1772 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1773 if (!(netdev_dev->cache_valid & VALID_IN6)) {
1777 netdev_dev->in6 = in6addr_any;
1779 file = fopen("/proc/net/if_inet6", "r");
1781 const char *name = netdev_get_name(netdev_);
1782 while (fgets(line, sizeof line, file)) {
1783 struct in6_addr in6;
1784 char ifname[16 + 1];
1785 if (parse_if_inet6_line(line, &in6, ifname)
1786 && !strcmp(name, ifname))
1788 netdev_dev->in6 = in6;
1794 netdev_dev->cache_valid |= VALID_IN6;
1796 *in6 = netdev_dev->in6;
/* Fills '*sa' with an AF_INET socket address for 'addr' with port 0.  All
 * remaining bytes of '*sa' are zeroed. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    memset(&in4, 0, sizeof in4);
    in4.sin_family = AF_INET;
    in4.sin_port = 0;
    in4.sin_addr = addr;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &in4, sizeof in4);
}
1814 do_set_addr(struct netdev *netdev,
1815 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
1818 strncpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
1819 make_in4_sockaddr(&ifr.ifr_addr, addr);
1821 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
1825 /* Adds 'router' as a default IP gateway. */
1827 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
1829 struct in_addr any = { INADDR_ANY };
1833 memset(&rt, 0, sizeof rt);
1834 make_in4_sockaddr(&rt.rt_dst, any);
1835 make_in4_sockaddr(&rt.rt_gateway, router);
1836 make_in4_sockaddr(&rt.rt_genmask, any);
1837 rt.rt_flags = RTF_UP | RTF_GATEWAY;
1838 COVERAGE_INC(netdev_add_router);
1839 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
1841 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
1847 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
1850 static const char fn[] = "/proc/net/route";
1855 *netdev_name = NULL;
1856 stream = fopen(fn, "r");
1857 if (stream == NULL) {
1858 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
1863 while (fgets(line, sizeof line, stream)) {
1866 uint32_t dest, gateway, mask;
1867 int refcnt, metric, mtu;
1868 unsigned int flags, use, window, irtt;
1871 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
1873 iface, &dest, &gateway, &flags, &refcnt,
1874 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
1876 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
1880 if (!(flags & RTF_UP)) {
1881 /* Skip routes that aren't up. */
1885 /* The output of 'dest', 'mask', and 'gateway' were given in
1886 * network byte order, so we don't need need any endian
1887 * conversions here. */
1888 if ((dest & mask) == (host->s_addr & mask)) {
1890 /* The host is directly reachable. */
1891 next_hop->s_addr = 0;
1893 /* To reach the host, we must go through a gateway. */
1894 next_hop->s_addr = gateway;
1896 *netdev_name = xstrdup(iface);
1907 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
1908 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
1909 * returns 0. Otherwise, it returns a positive errno value; in particular,
1910 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
1912 netdev_linux_arp_lookup(const struct netdev *netdev,
1913 uint32_t ip, uint8_t mac[ETH_ADDR_LEN])
1916 struct sockaddr_in sin;
1919 memset(&r, 0, sizeof r);
1920 sin.sin_family = AF_INET;
1921 sin.sin_addr.s_addr = ip;
1923 memcpy(&r.arp_pa, &sin, sizeof sin);
1924 r.arp_ha.sa_family = ARPHRD_ETHER;
1926 strncpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
1927 COVERAGE_INC(netdev_arp_lookup);
1928 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
1930 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
1931 } else if (retval != ENXIO) {
1932 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
1933 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
1939 nd_to_iff_flags(enum netdev_flags nd)
1942 if (nd & NETDEV_UP) {
1945 if (nd & NETDEV_PROMISC) {
1952 iff_to_nd_flags(int iff)
1954 enum netdev_flags nd = 0;
1958 if (iff & IFF_PROMISC) {
1959 nd |= NETDEV_PROMISC;
1965 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
1966 enum netdev_flags on, enum netdev_flags *old_flagsp)
1968 int old_flags, new_flags;
1971 error = get_flags(netdev, &old_flags);
1973 *old_flagsp = iff_to_nd_flags(old_flags);
1974 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
1975 if (new_flags != old_flags) {
1976 error = set_flags(netdev, new_flags);
1983 poll_notify(struct list *list)
1985 struct netdev_linux_notifier *notifier;
1986 LIST_FOR_EACH (notifier, struct netdev_linux_notifier, node, list) {
1987 struct netdev_notifier *n = ¬ifier->notifier;
1993 netdev_linux_poll_cb(const struct rtnetlink_change *change,
1994 void *aux OVS_UNUSED)
1997 struct list *list = shash_find_data(&netdev_linux_notifiers,
2003 struct shash_node *node;
2004 SHASH_FOR_EACH (node, &netdev_linux_notifiers) {
2005 poll_notify(node->data);
2011 netdev_linux_poll_add(struct netdev *netdev,
2012 void (*cb)(struct netdev_notifier *), void *aux,
2013 struct netdev_notifier **notifierp)
2015 const char *netdev_name = netdev_get_name(netdev);
2016 struct netdev_linux_notifier *notifier;
2019 if (shash_is_empty(&netdev_linux_notifiers)) {
2020 int error = rtnetlink_notifier_register(&netdev_linux_poll_notifier,
2021 netdev_linux_poll_cb, NULL);
2027 list = shash_find_data(&netdev_linux_notifiers, netdev_name);
2029 list = xmalloc(sizeof *list);
2031 shash_add(&netdev_linux_notifiers, netdev_name, list);
2034 notifier = xmalloc(sizeof *notifier);
2035 netdev_notifier_init(¬ifier->notifier, netdev, cb, aux);
2036 list_push_back(list, ¬ifier->node);
2037 *notifierp = ¬ifier->notifier;
2042 netdev_linux_poll_remove(struct netdev_notifier *notifier_)
2044 struct netdev_linux_notifier *notifier =
2045 CONTAINER_OF(notifier_, struct netdev_linux_notifier, notifier);
2048 /* Remove 'notifier' from its list. */
2049 list = list_remove(¬ifier->node);
2050 if (list_is_empty(list)) {
2051 /* The list is now empty. Remove it from the hash and free it. */
2052 const char *netdev_name = netdev_get_name(notifier->notifier.netdev);
2053 shash_delete(&netdev_linux_notifiers,
2054 shash_find(&netdev_linux_notifiers, netdev_name));
2059 /* If that was the last notifier, unregister. */
2060 if (shash_is_empty(&netdev_linux_notifiers)) {
2061 rtnetlink_notifier_unregister(&netdev_linux_poll_notifier);
2065 const struct netdev_class netdev_linux_class = {
2072 netdev_linux_create_system,
2073 netdev_linux_destroy,
2074 NULL, /* reconfigure */
2079 netdev_linux_enumerate,
2082 netdev_linux_recv_wait,
2086 netdev_linux_send_wait,
2088 netdev_linux_set_etheraddr,
2089 netdev_linux_get_etheraddr,
2090 netdev_linux_get_mtu,
2091 netdev_linux_get_ifindex,
2092 netdev_linux_get_carrier,
2093 netdev_linux_get_stats,
2094 netdev_vport_set_stats,
2096 netdev_linux_get_features,
2097 netdev_linux_set_advertisements,
2098 netdev_linux_get_vlan_vid,
2100 netdev_linux_set_policing,
2101 netdev_linux_get_qos_types,
2102 netdev_linux_get_qos_capabilities,
2103 netdev_linux_get_qos,
2104 netdev_linux_set_qos,
2105 netdev_linux_get_queue,
2106 netdev_linux_set_queue,
2107 netdev_linux_delete_queue,
2108 netdev_linux_get_queue_stats,
2109 netdev_linux_dump_queues,
2110 netdev_linux_dump_queue_stats,
2112 netdev_linux_get_in4,
2113 netdev_linux_set_in4,
2114 netdev_linux_get_in6,
2115 netdev_linux_add_router,
2116 netdev_linux_get_next_hop,
2117 netdev_linux_arp_lookup,
2119 netdev_linux_update_flags,
2121 netdev_linux_poll_add,
2122 netdev_linux_poll_remove,
2125 const struct netdev_class netdev_tap_class = {
2132 netdev_linux_create_tap,
2133 netdev_linux_destroy,
2134 NULL, /* reconfigure */
2139 NULL, /* enumerate */
2142 netdev_linux_recv_wait,
2146 netdev_linux_send_wait,
2148 netdev_linux_set_etheraddr,
2149 netdev_linux_get_etheraddr,
2150 netdev_linux_get_mtu,
2151 netdev_linux_get_ifindex,
2152 netdev_linux_get_carrier,
2153 netdev_linux_get_stats,
2154 NULL, /* set_stats */
2156 netdev_linux_get_features,
2157 netdev_linux_set_advertisements,
2158 netdev_linux_get_vlan_vid,
2160 netdev_linux_set_policing,
2161 netdev_linux_get_qos_types,
2162 netdev_linux_get_qos_capabilities,
2163 netdev_linux_get_qos,
2164 netdev_linux_set_qos,
2165 netdev_linux_get_queue,
2166 netdev_linux_set_queue,
2167 netdev_linux_delete_queue,
2168 netdev_linux_get_queue_stats,
2169 netdev_linux_dump_queues,
2170 netdev_linux_dump_queue_stats,
2172 netdev_linux_get_in4,
2173 netdev_linux_set_in4,
2174 netdev_linux_get_in6,
2175 netdev_linux_add_router,
2176 netdev_linux_get_next_hop,
2177 netdev_linux_arp_lookup,
2179 netdev_linux_update_flags,
2181 netdev_linux_poll_add,
2182 netdev_linux_poll_remove,
2185 /* HTB traffic control class. */
2187 #define HTB_N_QUEUES 0xf000
2191 unsigned int max_rate; /* In bytes/s. */
2195 unsigned int min_rate; /* In bytes/s. */
2196 unsigned int max_rate; /* In bytes/s. */
2197 unsigned int burst; /* In bytes. */
2198 unsigned int priority; /* Lower values are higher priorities. */
2202 htb_get__(const struct netdev *netdev)
2204 struct netdev_dev_linux *netdev_dev =
2205 netdev_dev_linux_cast(netdev_get_dev(netdev));
2206 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
2210 htb_install__(struct netdev *netdev, uint64_t max_rate)
2212 struct netdev_dev_linux *netdev_dev =
2213 netdev_dev_linux_cast(netdev_get_dev(netdev));
2216 htb = xmalloc(sizeof *htb);
2217 tc_init(&htb->tc, &tc_ops_htb);
2218 htb->max_rate = max_rate;
2220 netdev_dev->tc = &htb->tc;
2225 /* Create an HTB qdisc.
2227 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default
/* Builds and sends the netlink request that creates the HTB qdisc on 'netdev'
 * and returns the result of tc_transact().
 * NOTE(review): this listing omits some short source lines, so other fields
 * of 'opt' may also be set -- confirm against the complete file. */
2230 htb_setup_qdisc__(struct netdev *netdev)
2233 struct tc_htb_glob opt;
2234 struct ofpbuf request;
2235 struct tcmsg *tcmsg;
/* Any existing qdisc is deleted first; the result of that is not checked. */
2237 tc_del_qdisc(netdev);
/* NLM_F_EXCL | NLM_F_CREATE: create the qdisc, failing if it exists. */
2239 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2240 NLM_F_EXCL | NLM_F_CREATE, &request);
/* The new qdisc gets handle 1:0 and is attached at the root. */
2241 tcmsg->tcm_handle = tc_make_handle(1, 0);
2242 tcmsg->tcm_parent = TC_H_ROOT;
2244 nl_msg_put_string(&request, TCA_KIND, "htb");
/* Start from all-zero global HTB options. */
2246 memset(&opt, 0, sizeof opt);
2247 opt.rate2quantum = 10;
/* The options travel as TCA_HTB_INIT nested inside TCA_OPTIONS. */
2251 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2252 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2253 nl_msg_end_nested(&request, opt_offset);
2255 return tc_transact(&request, NULL);
2258 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2259 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2261 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2262 unsigned int parent, struct htb_class *class)
2265 struct tc_htb_opt opt;
2266 struct ofpbuf request;
2267 struct tcmsg *tcmsg;
2271 netdev_get_mtu(netdev, &mtu);
2273 memset(&opt, 0, sizeof opt);
2274 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2275 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2276 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2277 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2278 opt.prio = class->priority;
2280 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2281 tcmsg->tcm_handle = handle;
2282 tcmsg->tcm_parent = parent;
2284 nl_msg_put_string(&request, TCA_KIND, "htb");
2285 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2286 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2287 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2288 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2289 nl_msg_end_nested(&request, opt_offset);
2291 error = tc_transact(&request, NULL);
2293 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2294 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2295 netdev_get_name(netdev),
2296 tc_get_major(handle), tc_get_minor(handle),
2297 tc_get_major(parent), tc_get_minor(parent),
2298 class->min_rate, class->max_rate,
2299 class->burst, class->priority, strerror(error));
2304 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2305 * description of them into 'details'. The description complies with the
2306 * specification given in the vswitch database documentation for linux-htb
2309 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2311 static const struct nl_policy tca_htb_policy[] = {
2312 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2313 .min_len = sizeof(struct tc_htb_opt) },
2316 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2317 const struct tc_htb_opt *htb;
2319 if (!nl_parse_nested(nl_options, tca_htb_policy,
2320 attrs, ARRAY_SIZE(tca_htb_policy))) {
2321 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2325 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2326 class->min_rate = htb->rate.rate;
2327 class->max_rate = htb->ceil.rate;
2328 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2329 class->priority = htb->prio;
2334 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2335 struct htb_class *options,
2336 struct netdev_queue_stats *stats)
2338 struct nlattr *nl_options;
2339 unsigned int handle;
2342 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2343 if (!error && queue_id) {
2344 unsigned int major = tc_get_major(handle);
2345 unsigned int minor = tc_get_minor(handle);
2346 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2347 *queue_id = minor - 1;
2352 if (!error && options) {
2353 error = htb_parse_tca_options__(nl_options, options);
2359 htb_parse_qdisc_details__(struct netdev *netdev,
2360 const struct shash *details, struct htb_class *hc)
2362 const char *max_rate_s;
2364 max_rate_s = shash_find_data(details, "max-rate");
2365 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2366 if (!hc->max_rate) {
2369 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2370 hc->max_rate = netdev_features_to_bps(current) / 8;
2372 hc->min_rate = hc->max_rate;
2378 htb_parse_class_details__(struct netdev *netdev,
2379 const struct shash *details, struct htb_class *hc)
2381 const struct htb *htb = htb_get__(netdev);
2382 const char *min_rate_s = shash_find_data(details, "min-rate");
2383 const char *max_rate_s = shash_find_data(details, "max-rate");
2384 const char *burst_s = shash_find_data(details, "burst");
2385 const char *priority_s = shash_find_data(details, "priority");
2390 /* min-rate is required. */
2393 hc->min_rate = strtoull(min_rate_s, NULL, 10) / 8;
2394 hc->min_rate = MAX(hc->min_rate, 0);
2395 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2398 hc->max_rate = (max_rate_s
2399 ? strtoull(max_rate_s, NULL, 10) / 8
2401 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2402 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2406 * According to hints in the documentation that I've read, it is important
2407 * that 'burst' be at least as big as the largest frame that might be
2408 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2409 * but having it a bit too small is a problem. Since netdev_get_mtu()
2410 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2411 * the MTU. We actually add 64, instead of 14, as a guard against
2412 * additional headers get tacked on somewhere that we're not aware of. */
2413 netdev_get_mtu(netdev, &mtu);
2414 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2415 hc->burst = MAX(hc->burst, mtu + 64);
2418 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for class 'handle' under 'parent' on 'netdev' and parses
 * the reply into '*options' and '*stats', either of which may be null.
 * Returns 0 if successful, otherwise the error from the query or from parsing
 * the reply. */
static int
htb_query_class__(const struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error;

    error = tc_query_class(netdev, handle, parent, &reply);
    if (error) {
        return error;
    }

    error = htb_parse_tcmsg__(reply, NULL, options, stats);
    ofpbuf_delete(reply);
    return error;
}
2440 htb_tc_install(struct netdev *netdev, const struct shash *details)
2444 error = htb_setup_qdisc__(netdev);
2446 struct htb_class hc;
2448 htb_parse_qdisc_details__(netdev, details, &hc);
2449 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2450 tc_make_handle(1, 0), &hc);
2452 htb_install__(netdev, hc.max_rate);
2459 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2460 const struct htb_class *hc)
2462 struct htb *htb = htb_get__(netdev);
2463 struct htb_class *hcp;
2465 hcp = port_array_get(&htb->tc.queues, queue_id);
2467 hcp = xmalloc(sizeof *hcp);
2468 port_array_set(&htb->tc.queues, queue_id, hcp);
2474 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2476 struct shash details = SHASH_INITIALIZER(&details);
2478 struct nl_dump dump;
2479 struct htb_class hc;
2482 /* Get qdisc options. */
2484 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2485 htb = htb_install__(netdev, hc.max_rate);
2488 start_queue_dump(netdev, &dump);
2489 shash_init(&details);
2490 while (nl_dump_next(&dump, &msg)) {
2491 unsigned int queue_id;
2493 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2494 htb_update_queue__(netdev, queue_id, &hc);
2497 nl_dump_done(&dump);
2503 htb_tc_destroy(struct tc *tc)
2505 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2506 unsigned int queue_id;
2507 struct htb_class *hc;
2509 PORT_ARRAY_FOR_EACH (hc, &htb->tc.queues, queue_id) {
2517 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2519 const struct htb *htb = htb_get__(netdev);
2520 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
2525 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2527 struct htb_class hc;
2530 htb_parse_qdisc_details__(netdev, details, &hc);
2531 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2532 tc_make_handle(1, 0), &hc);
2534 htb_get__(netdev)->max_rate = hc.max_rate;
2540 htb_class_get(const struct netdev *netdev, unsigned int queue_id,
2541 struct shash *details)
2543 const struct htb *htb = htb_get__(netdev);
2544 const struct htb_class *hc;
2546 hc = port_array_get(&htb->tc.queues, queue_id);
2549 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2550 if (hc->min_rate != hc->max_rate) {
2551 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2553 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2555 shash_add(details, "priority", xasprintf("%u", hc->priority));
2561 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2562 const struct shash *details)
2564 struct htb_class hc;
2567 error = htb_parse_class_details__(netdev, details, &hc);
2572 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2573 tc_make_handle(1, 0xfffe), &hc);
2578 htb_update_queue__(netdev, queue_id, &hc);
2583 htb_class_delete(struct netdev *netdev, unsigned int queue_id)
2585 struct htb *htb = htb_get__(netdev);
2586 struct htb_class *hc;
2589 hc = port_array_get(&htb->tc.queues, queue_id);
2592 error = tc_delete_class(netdev, tc_make_handle(1, queue_id + 1));
2595 port_array_delete(&htb->tc.queues, queue_id);
2601 htb_class_get_stats(const struct netdev *netdev, unsigned int queue_id,
2602 struct netdev_queue_stats *stats)
2604 return htb_query_class__(netdev, tc_make_handle(1, queue_id + 1),
2605 tc_make_handle(1, 0xfffe), NULL, stats);
2609 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2610 const struct ofpbuf *nlmsg,
2611 netdev_dump_queue_stats_cb *cb, void *aux)
2613 struct netdev_queue_stats stats;
2614 unsigned int handle, major, minor;
2617 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2622 major = tc_get_major(handle);
2623 minor = tc_get_minor(handle);
2624 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2625 (*cb)(tc_get_minor(handle), &stats, aux);
2630 static const struct tc_ops tc_ops_htb = {
2631 "htb", /* linux_name */
2632 "linux-htb", /* ovs_name */
2633 HTB_N_QUEUES, /* n_queues */
2642 htb_class_get_stats,
2643 htb_class_dump_stats
2646 /* "linux-default" traffic control class.
2648 * This class represents the default, unnamed Linux qdisc. It corresponds to
2649 * the "" (empty string) QoS type in the OVS database. */
2652 default_install__(struct netdev *netdev)
2654 struct netdev_dev_linux *netdev_dev =
2655 netdev_dev_linux_cast(netdev_get_dev(netdev));
2656 static struct tc *tc;
2659 tc = xmalloc(sizeof *tc);
2660 tc_init(tc, &tc_ops_default);
2662 netdev_dev->tc = tc;
2666 default_tc_install(struct netdev *netdev,
2667 const struct shash *details OVS_UNUSED)
2669 default_install__(netdev);
2674 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2676 default_install__(netdev);
2680 static const struct tc_ops tc_ops_default = {
2681 NULL, /* linux_name */
2686 NULL, /* tc_destroy */
2687 NULL, /* qdisc_get */
2688 NULL, /* qdisc_set */
2689 NULL, /* class_get */
2690 NULL, /* class_set */
2691 NULL, /* class_delete */
2692 NULL, /* class_get_stats */
2693 NULL /* class_dump_stats */
2696 /* "linux-other" traffic control class.
2701 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2703 struct netdev_dev_linux *netdev_dev =
2704 netdev_dev_linux_cast(netdev_get_dev(netdev));
2705 static struct tc *tc;
2708 tc = xmalloc(sizeof *tc);
2709 tc_init(tc, &tc_ops_other);
2711 netdev_dev->tc = tc;
2715 static const struct tc_ops tc_ops_other = {
2716 NULL, /* linux_name */
2717 "linux-other", /* ovs_name */
2719 NULL, /* tc_install */
2721 NULL, /* tc_destroy */
2722 NULL, /* qdisc_get */
2723 NULL, /* qdisc_set */
2724 NULL, /* class_get */
2725 NULL, /* class_set */
2726 NULL, /* class_delete */
2727 NULL, /* class_get_stats */
2728 NULL /* class_dump_stats */
2731 /* Traffic control. */
2733 /* Number of kernel "tc" ticks per second. */
2734 static double ticks_per_s;
2736 /* Number of kernel "jiffies" per second. This is used for the purpose of
2737 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
2738 * one jiffy's worth of data.
2740 * There are two possibilities here:
2742 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
2743 * approximate range of 100 to 1024. That means that we really need to
2744 * make sure that the qdisc can buffer that much data.
2746 * - 'buffer_hz' is an absurdly large number. That means that the kernel
2747 * has finely granular timers and there's no need to fudge additional room
2748 * for buffers. (There's no extra effort needed to implement that: the
2749 * large 'buffer_hz' is used as a divisor, so practically any number will
2750 * come out as 0 in the division. Small integer results in the case of
2751 * really high dividends won't have any real effect anyhow.)
2753 static unsigned int buffer_hz;
/* Combines 'major' and 'minor' into the tc handle 'major':'minor'. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    const unsigned int major_bits = major << 16;

    return TC_H_MAKE(major_bits, minor);
}
/* Extracts the major number from tc handle 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    const unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
/* Extracts the minor number from tc handle 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    const unsigned int minor = TC_H_MIN(handle);

    return minor;
}
2776 static struct tcmsg *
2777 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
2778 struct ofpbuf *request)
2780 struct tcmsg *tcmsg;
2784 error = get_ifindex(netdev, &ifindex);
2789 ofpbuf_init(request, 512);
2790 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
2791 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
2792 tcmsg->tcm_family = AF_UNSPEC;
2793 tcmsg->tcm_ifindex = ifindex;
2794 /* Caller should fill in tcmsg->tcm_handle. */
2795 /* Caller should fill in tcmsg->tcm_parent. */
2801 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
2803 int error = nl_sock_transact(rtnl_sock, request, replyp);
2804 ofpbuf_uninit(request);
2811 /* The values in psched are not individually very meaningful, but they are
2812 * important. The tables below show some values seen in the wild.
2816 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
2817 * (Before that, there are hints that it was 1000000000.)
2819 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
2823 * -----------------------------------
2824 * [1] 000c8000 000f4240 000f4240 00000064
2825 * [2] 000003e8 00000400 000f4240 3b9aca00
2826 * [3] 000003e8 00000400 000f4240 3b9aca00
2827 * [4] 000003e8 00000400 000f4240 00000064
2828 * [5] 000003e8 00000040 000f4240 3b9aca00
2829 * [6] 000003e8 00000040 000f4240 000000f9
2831 * a b c d ticks_per_s buffer_hz
2832 * ------- --------- ---------- ------------- ----------- -------------
2833 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
2834 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
2835 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
2836 * [4] 1,000 1,024 1,000,000 100 976,562 100
2837 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
2838 * [6] 1,000 64 1,000,000 249 15,625,000 249
2840 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
2841 * [2] 2.6.26-1-686-bigmem from Debian lenny
2842 * [3] 2.6.26-2-sparc64 from Debian lenny
2843 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
2844 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
2845 * [6] 2.6.34 from kernel.org on KVM
2847 static const char fn[] = "/proc/net/psched";
2848 unsigned int a, b, c, d;
2854 stream = fopen(fn, "r");
2856 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
2860 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
2861 VLOG_WARN("%s: read failed", fn);
2865 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
2869 VLOG_WARN("%s: invalid scheduler parameters", fn);
2873 ticks_per_s = (double) a * c / b;
2877 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
2880 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
2883 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
2884 * rate of 'rate' bytes per second. */
2886 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
2891 return (rate * ticks) / ticks_per_s;
2894 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
2895 * rate of 'rate' bytes per second. */
2897 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
2902 return ((unsigned long long int) ticks_per_s * size) / rate;
2905 /* Returns the number of bytes that need to be reserved for qdisc buffering at
2906 * a transmission rate of 'rate' bytes per second. */
2908 tc_buffer_per_jiffy(unsigned int rate)
2913 return rate / buffer_hz;
2916 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
2917 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
2918 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
2919 * stores NULL into it if it is absent.
2921 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
2924 * Returns 0 if successful, otherwise a positive errno value. */
2926 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
2927 struct nlattr **options)
2929 static const struct nl_policy tca_policy[] = {
2930 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
2931 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
2933 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
2935 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
2936 tca_policy, ta, ARRAY_SIZE(ta))) {
2937 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
2942 *kind = nl_attr_get_string(ta[TCA_KIND]);
2946 *options = ta[TCA_OPTIONS];
2961 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
2962 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
2963 * into '*options', and its queue statistics into '*stats'. Any of the output
2964 * arguments may be null.
2966 * Returns 0 if successful, otherwise a positive errno value. */
2968 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
2969 struct nlattr **options, struct netdev_queue_stats *stats)
2971 static const struct nl_policy tca_policy[] = {
2972 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
2973 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
2975 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
2977 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
2978 tca_policy, ta, ARRAY_SIZE(ta))) {
2979 VLOG_WARN_RL(&rl, "failed to parse class message");
2984 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
2985 *handlep = tc->tcm_handle;
2989 *options = ta[TCA_OPTIONS];
2993 const struct gnet_stats_queue *gsq;
2994 struct gnet_stats_basic gsb;
2996 static const struct nl_policy stats_policy[] = {
2997 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
2998 .min_len = sizeof gsb },
2999 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3000 .min_len = sizeof *gsq },
3002 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3004 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3005 sa, ARRAY_SIZE(sa))) {
3006 VLOG_WARN_RL(&rl, "failed to parse class stats");
3010 /* Alignment issues screw up the length of struct gnet_stats_basic on
3011 * some arch/bitsize combinations. Newer versions of Linux have a
3012 * struct gnet_stats_basic_packed, but we can't depend on that. The
3013 * easiest thing to do is just to make a copy. */
3014 memset(&gsb, 0, sizeof gsb);
3015 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3016 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3017 stats->tx_bytes = gsb.bytes;
3018 stats->tx_packets = gsb.packets;
3020 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3021 stats->tx_errors = gsq->drops;
3031 memset(stats, 0, sizeof *stats);
3036 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
3039 tc_query_class(const struct netdev *netdev,
3040 unsigned int handle, unsigned int parent,
3041 struct ofpbuf **replyp)
3043 struct ofpbuf request;
3044 struct tcmsg *tcmsg;
3047 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3048 tcmsg->tcm_handle = handle;
3049 tcmsg->tcm_parent = parent;
3051 error = tc_transact(&request, replyp);
3053 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3054 netdev_get_name(netdev),
3055 tc_get_major(handle), tc_get_minor(handle),
3056 tc_get_major(parent), tc_get_minor(parent),
3062 /* Equivalent to "tc class del dev <name> handle <handle>". */
3064 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3066 struct ofpbuf request;
3067 struct tcmsg *tcmsg;
3070 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3071 tcmsg->tcm_handle = handle;
3072 tcmsg->tcm_parent = 0;
3074 error = tc_transact(&request, NULL);
3076 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3077 netdev_get_name(netdev),
3078 tc_get_major(handle), tc_get_minor(handle),
3084 /* Equivalent to "tc qdisc del dev <name> root". */
3086 tc_del_qdisc(struct netdev *netdev)
3088 struct netdev_dev_linux *netdev_dev =
3089 netdev_dev_linux_cast(netdev_get_dev(netdev));
3090 struct ofpbuf request;
3091 struct tcmsg *tcmsg;
3094 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
3095 tcmsg->tcm_handle = tc_make_handle(1, 0);
3096 tcmsg->tcm_parent = TC_H_ROOT;
3098 error = tc_transact(&request, NULL);
3099 if (error == EINVAL) {
3100 /* EINVAL probably means that the default qdisc was in use, in which
3101 * case we've accomplished our purpose. */
3104 if (!error && netdev_dev->tc) {
3105 if (netdev_dev->tc->ops->tc_destroy) {
3106 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3108 netdev_dev->tc = NULL;
3113 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3114 * kernel to determine what they are. Returns 0 if successful, otherwise a
3115 * positive errno value. */
3117 tc_query_qdisc(const struct netdev *netdev)
3119 struct netdev_dev_linux *netdev_dev =
3120 netdev_dev_linux_cast(netdev_get_dev(netdev));
3121 struct ofpbuf request, *qdisc;
3122 const struct tc_ops *ops;
3123 struct tcmsg *tcmsg;
3127 if (netdev_dev->tc) {
3131 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3132 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3133 * 2.6.35 without that fix backported to it.
3135 * To avoid the OOPS, we must not make a request that would attempt to dump
3136 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3137 * few others. There are a few ways that I can see to do this, but most of
3138 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3139 * technique chosen here is to assume that any non-default qdisc that we
3140 * create will have a class with handle 1:0. The built-in qdiscs only have
3141 * a class with handle 0:0.
3143 * We could check for Linux 2.6.35+ and use a more straightforward method
3145 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
3146 tcmsg->tcm_handle = tc_make_handle(1, 0);
3147 tcmsg->tcm_parent = 0;
3149 /* Figure out what tc class to instantiate. */
3150 error = tc_transact(&request, &qdisc);
3154 error = tc_parse_qdisc(qdisc, &kind, NULL);
3156 ops = &tc_ops_other;
3158 ops = tc_lookup_linux_name(kind);
3160 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3161 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3163 ops = &tc_ops_other;
3166 } else if (error == ENOENT) {
3167 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3168 * other entity that doesn't have a handle 1:0. We will assume
3169 * that it's the system default qdisc. */
3170 ops = &tc_ops_default;
3173 /* Who knows? Maybe the device got deleted. */
3174 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3175 netdev_get_name(netdev), strerror(error));
3176 ops = &tc_ops_other;
3179 /* Instantiate it. */
3180 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
3181 assert((load_error == 0) == (netdev_dev->tc != NULL));
3182 ofpbuf_delete(qdisc);
3184 return error ? error : load_error;
3187 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3188 approximate the time to transmit packets of various lengths. For an MTU of
3189 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3190 represents two possible packet lengths; for a MTU of 513 through 1024, four
3191 possible lengths; and so on.
3193 Returns, for the specified 'mtu', the number of bits that packet lengths
3194 need to be shifted right to fit within such a 256-entry table. */
3196 tc_calc_cell_log(unsigned int mtu)
3201 mtu = ETH_PAYLOAD_MAX;
3203 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
3205 for (cell_log = 0; mtu >= 256; cell_log++) {
3212 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
3215 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
3217 memset(rate, 0, sizeof *rate);
3218 rate->cell_log = tc_calc_cell_log(mtu);
3219 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3220 /* rate->cell_align = 0; */ /* distro headers. */
3221 rate->mpu = ETH_TOTAL_MIN;
3225 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
3226 * attribute of the specified "type".
3228 * See tc_calc_cell_log() above for a description of "rtab"s. */
3230 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
3235 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
3236 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
3237 unsigned packet_size = (i + 1) << rate->cell_log;
3238 if (packet_size < rate->mpu) {
3239 packet_size = rate->mpu;
3241 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.)
 *
 * The burst actually used is never smaller than one jiffy's worth of data at
 * 'Bps' plus 'mtu'.  The result is expressed in ticks. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    const unsigned int floor_bytes = tc_buffer_per_jiffy(Bps) + mtu;
    const uint64_t effective_burst = MAX(burst_bytes, floor_bytes);

    return tc_bytes_to_ticks(Bps, effective_burst);
}
3259 /* Utility functions. */
3262 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
3264 /* Policy for RTNLGRP_LINK messages.
3266 * There are *many* more fields in these messages, but currently we only
3267 * care about these fields. */
3268 static const struct nl_policy rtnlgrp_link_policy[] = {
3269 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
3270 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
3271 .min_len = sizeof(struct rtnl_link_stats) },
3274 struct ofpbuf request;
3275 struct ofpbuf *reply;
3276 struct ifinfomsg *ifi;
3277 const struct rtnl_link_stats *rtnl_stats;
3278 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
3281 ofpbuf_init(&request, 0);
3282 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
3283 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
3284 ifi->ifi_family = PF_UNSPEC;
3285 ifi->ifi_index = ifindex;
3286 error = nl_sock_transact(rtnl_sock, &request, &reply);
3287 ofpbuf_uninit(&request);
3292 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
3293 rtnlgrp_link_policy,
3294 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
3295 ofpbuf_delete(reply);
3299 if (!attrs[IFLA_STATS]) {
3300 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
3301 ofpbuf_delete(reply);
3305 rtnl_stats = nl_attr_get(attrs[IFLA_STATS]);
3306 stats->rx_packets = rtnl_stats->rx_packets;
3307 stats->tx_packets = rtnl_stats->tx_packets;
3308 stats->rx_bytes = rtnl_stats->rx_bytes;
3309 stats->tx_bytes = rtnl_stats->tx_bytes;
3310 stats->rx_errors = rtnl_stats->rx_errors;
3311 stats->tx_errors = rtnl_stats->tx_errors;
3312 stats->rx_dropped = rtnl_stats->rx_dropped;
3313 stats->tx_dropped = rtnl_stats->tx_dropped;
3314 stats->multicast = rtnl_stats->multicast;
3315 stats->collisions = rtnl_stats->collisions;
3316 stats->rx_length_errors = rtnl_stats->rx_length_errors;
3317 stats->rx_over_errors = rtnl_stats->rx_over_errors;
3318 stats->rx_crc_errors = rtnl_stats->rx_crc_errors;
3319 stats->rx_frame_errors = rtnl_stats->rx_frame_errors;
3320 stats->rx_fifo_errors = rtnl_stats->rx_fifo_errors;
3321 stats->rx_missed_errors = rtnl_stats->rx_missed_errors;
3322 stats->tx_aborted_errors = rtnl_stats->tx_aborted_errors;
3323 stats->tx_carrier_errors = rtnl_stats->tx_carrier_errors;
3324 stats->tx_fifo_errors = rtnl_stats->tx_fifo_errors;
3325 stats->tx_heartbeat_errors = rtnl_stats->tx_heartbeat_errors;
3326 stats->tx_window_errors = rtnl_stats->tx_window_errors;
3328 ofpbuf_delete(reply);
3334 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
3336 static const char fn[] = "/proc/net/dev";
3341 stream = fopen(fn, "r");
3343 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
3348 while (fgets(line, sizeof line, stream)) {
3351 #define X64 "%"SCNu64
3354 X64 X64 X64 X64 X64 X64 X64 "%*u"
3355 X64 X64 X64 X64 X64 X64 X64 "%*u",
3361 &stats->rx_fifo_errors,
3362 &stats->rx_frame_errors,
3368 &stats->tx_fifo_errors,
3370 &stats->tx_carrier_errors) != 15) {
3371 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
3372 } else if (!strcmp(devname, netdev_name)) {
3373 stats->rx_length_errors = UINT64_MAX;
3374 stats->rx_over_errors = UINT64_MAX;
3375 stats->rx_crc_errors = UINT64_MAX;
3376 stats->rx_missed_errors = UINT64_MAX;
3377 stats->tx_aborted_errors = UINT64_MAX;
3378 stats->tx_heartbeat_errors = UINT64_MAX;
3379 stats->tx_window_errors = UINT64_MAX;
3385 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
3391 get_flags(const struct netdev *netdev, int *flags)
3396 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
3398 *flags = ifr.ifr_flags;
3403 set_flags(struct netdev *netdev, int flags)
3407 ifr.ifr_flags = flags;
3408 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
3413 do_get_ifindex(const char *netdev_name)
3417 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
3418 COVERAGE_INC(netdev_get_ifindex);
3419 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
3420 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
3421 netdev_name, strerror(errno));
3424 return ifr.ifr_ifindex;
3428 get_ifindex(const struct netdev *netdev_, int *ifindexp)
3430 struct netdev_dev_linux *netdev_dev =
3431 netdev_dev_linux_cast(netdev_get_dev(netdev_));
3433 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
3434 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
3438 netdev_dev->cache_valid |= VALID_IFINDEX;
3439 netdev_dev->ifindex = ifindex;
3441 *ifindexp = netdev_dev->ifindex;
3446 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
3451 memset(&ifr, 0, sizeof ifr);
3452 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
3453 COVERAGE_INC(netdev_get_hwaddr);
3454 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
3455 VLOG_ERR("ioctl(SIOCGIFHWADDR) on %s device failed: %s",
3456 netdev_name, strerror(errno));
3459 hwaddr_family = ifr.ifr_hwaddr.sa_family;
3460 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
3461 VLOG_WARN("%s device has unknown hardware address family %d",
3462 netdev_name, hwaddr_family);
3464 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
3469 set_etheraddr(const char *netdev_name, int hwaddr_family,
3470 const uint8_t mac[ETH_ADDR_LEN])
3474 memset(&ifr, 0, sizeof ifr);
3475 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
3476 ifr.ifr_hwaddr.sa_family = hwaddr_family;
3477 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
3478 COVERAGE_INC(netdev_set_hwaddr);
3479 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
3480 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
3481 netdev_name, strerror(errno));
3488 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
3489 int cmd, const char *cmd_name)
3493 memset(&ifr, 0, sizeof ifr);
3494 strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
3495 ifr.ifr_data = (caddr_t) ecmd;
3498 COVERAGE_INC(netdev_ethtool);
3499 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
3502 if (errno != EOPNOTSUPP) {
3503 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
3504 "failed: %s", cmd_name, name, strerror(errno));
3506 /* The device doesn't support this operation. That's pretty
3507 * common, so there's no point in logging anything. */
3514 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
3515 const char *cmd_name)
3517 strncpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
3518 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
3519 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
3527 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
3528 int cmd, const char *cmd_name)
3533 ifr.ifr_addr.sa_family = AF_INET;
3534 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
3536 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
3537 *ip = sin->sin_addr;