2 * Copyright (c) 2009, 2010 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
21 #include <arpa/inet.h>
23 #include <linux/gen_stats.h>
24 #include <linux/if_tun.h>
26 #include <linux/types.h>
27 #include <linux/ethtool.h>
28 #include <linux/pkt_sched.h>
29 #include <linux/rtnetlink.h>
30 #include <linux/sockios.h>
31 #include <linux/version.h>
32 #include <sys/types.h>
33 #include <sys/ioctl.h>
34 #include <sys/socket.h>
35 #include <netpacket/packet.h>
36 #include <net/ethernet.h>
38 #include <linux/if_tunnel.h>
39 #include <net/if_arp.h>
40 #include <net/if_packet.h>
41 #include <net/route.h>
42 #include <netinet/in.h>
49 #include "dynamic-string.h"
50 #include "fatal-signal.h"
53 #include "netdev-provider.h"
54 #include "netdev-vport.h"
57 #include "openflow/openflow.h"
59 #include "poll-loop.h"
60 #include "rtnetlink.h"
61 #include "socket-util.h"
66 VLOG_DEFINE_THIS_MODULE(netdev_linux);
68 /* These were introduced in Linux 2.6.14, so they might be missing if we have
70 #ifndef ADVERTISED_Pause
71 #define ADVERTISED_Pause (1 << 13)
73 #ifndef ADVERTISED_Asym_Pause
74 #define ADVERTISED_Asym_Pause (1 << 14)
77 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
80 #define TC_RTAB_SIZE 1024
83 static struct rtnetlink_notifier netdev_linux_cache_notifier;
84 static int cache_notifier_refcount;
87 VALID_IFINDEX = 1 << 0,
88 VALID_ETHERADDR = 1 << 1,
92 VALID_CARRIER = 1 << 5,
93 VALID_IS_PSEUDO = 1 << 6, /* Represents is_internal and is_tap. */
94 VALID_POLICING = 1 << 7,
95 VALID_HAVE_VPORT_STATS = 1 << 8
103 /* Traffic control. */
105 /* An instance of a traffic control class. Always associated with a particular
108 * Each TC implementation subclasses this with whatever additional data it
111 const struct tc_ops *ops;
112 struct hmap queues; /* Contains "struct tc_queue"s.
113 * Read by generic TC layer.
114 * Written only by TC implementation. */
117 /* One traffic control queue.
119 * Each TC implementation subclasses this with whatever additional data it
122 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
123 unsigned int queue_id; /* OpenFlow queue ID. */
126 /* A particular kind of traffic control. Each implementation generally maps to
127 * one particular Linux qdisc class.
129 * The functions below return 0 if successful or a positive errno value on
130 * failure, except where otherwise noted. All of them must be provided, except
131 * where otherwise noted. */
133 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
134 * This is null for tc_ops_default and tc_ops_other, for which there are no
135 * appropriate values. */
136 const char *linux_name;
138 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
139 const char *ovs_name;
141 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
142 * queues. The queues are numbered 0 through n_queues - 1. */
143 unsigned int n_queues;
145 /* Called to install this TC class on 'netdev'. The implementation should
146 * make the Netlink calls required to set up 'netdev' with the right qdisc
147 * and configure it according to 'details'. The implementation may assume
148 * that the current qdisc is the default; that is, there is no need for it
149 * to delete the current qdisc before installing itself.
151 * The contents of 'details' should be documented as valid for 'ovs_name'
152 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
153 * (which is built as ovs-vswitchd.conf.db(8)).
155 * This function must return 0 if and only if it sets 'netdev->tc' to an
156 * initialized 'struct tc'.
158 * (This function is null for tc_ops_other, which cannot be installed. For
159 * other TC classes it should always be nonnull.) */
160 int (*tc_install)(struct netdev *netdev, const struct shash *details);
162 /* Called when the netdev code determines (through a Netlink query) that
163 * this TC class's qdisc is installed on 'netdev', but we didn't install
164 * it ourselves and so don't know any of the details.
166 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
167 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
168 * implementation should parse the other attributes of 'nlmsg' as
169 * necessary to determine its configuration. If necessary it should also
170 * use Netlink queries to determine the configuration of queues on
173 * This function must return 0 if and only if it sets 'netdev->tc' to an
174 * initialized 'struct tc'. */
175 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
177 /* Destroys the data structures allocated by the implementation as part of
178 * 'tc'. (This includes destroying 'tc->queues' by calling
181 * The implementation should not need to perform any Netlink calls. If
182 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
183 * (But it may not be desirable.)
185 * This function may be null if 'tc' is trivial. */
186 void (*tc_destroy)(struct tc *tc);
188 /* Retrieves details of 'netdev->tc' configuration into 'details'.
190 * The implementation should not need to perform any Netlink calls, because
191 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
192 * cached the configuration.
194 * The contents of 'details' should be documented as valid for 'ovs_name'
195 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
196 * (which is built as ovs-vswitchd.conf.db(8)).
198 * This function may be null if 'tc' is not configurable.
200 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
202 /* Reconfigures 'netdev->tc' according to 'details', performing any
203 * required Netlink calls to complete the reconfiguration.
205 * The contents of 'details' should be documented as valid for 'ovs_name'
206 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
207 * (which is built as ovs-vswitchd.conf.db(8)).
209 * This function may be null if 'tc' is not configurable.
211 int (*qdisc_set)(struct netdev *, const struct shash *details);
213 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
214 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
216 * The contents of 'details' should be documented as valid for 'ovs_name'
217 * in the "other_config" column in the "Queue" table in
218 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
220 * The implementation should not need to perform any Netlink calls, because
221 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
222 * cached the queue configuration.
224 * This function may be null if 'tc' does not have queues ('n_queues' is
226 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
227 struct shash *details);
229 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
230 * 'details', performing any required Netlink calls to complete the
231 * reconfiguration. The caller ensures that 'queue_id' is less than
234 * The contents of 'details' should be documented as valid for 'ovs_name'
235 * in the "other_config" column in the "Queue" table in
236 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
238 * This function may be null if 'tc' does not have queues or its queues are
239 * not configurable. */
240 int (*class_set)(struct netdev *, unsigned int queue_id,
241 const struct shash *details);
243 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
244 * tc_queue's within 'netdev->tc->queues'.
246 * This function may be null if 'tc' does not have queues or its queues
247 * cannot be deleted. */
248 int (*class_delete)(struct netdev *, struct tc_queue *queue);
250 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
251 * 'struct tc_queue's within 'netdev->tc->queues'.
253 * On success, initializes '*stats'.
255 * This function may be null if 'tc' does not have queues or if it cannot
256 * report queue statistics. */
257 int (*class_get_stats)(const struct netdev *netdev,
258 const struct tc_queue *queue,
259 struct netdev_queue_stats *stats);
261 /* Extracts queue stats from 'nlmsg', which is a response to a
262 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
264 * This function may be null if 'tc' does not have queues or if it cannot
265 * report queue statistics. */
266 int (*class_dump_stats)(const struct netdev *netdev,
267 const struct ofpbuf *nlmsg,
268 netdev_dump_queue_stats_cb *cb, void *aux);
272 tc_init(struct tc *tc, const struct tc_ops *ops)
275 hmap_init(&tc->queues);
279 tc_destroy(struct tc *tc)
281 hmap_destroy(&tc->queues);
284 static const struct tc_ops tc_ops_htb;
285 static const struct tc_ops tc_ops_hfsc;
286 static const struct tc_ops tc_ops_default;
287 static const struct tc_ops tc_ops_other;
289 static const struct tc_ops *tcs[] = {
290 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
291 &tc_ops_hfsc, /* Hierarchical fair service curve. */
292 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
293 &tc_ops_other, /* Some other qdisc. */
297 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
298 static unsigned int tc_get_major(unsigned int handle);
299 static unsigned int tc_get_minor(unsigned int handle);
301 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
302 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
303 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
305 static struct tcmsg *tc_make_request(const struct netdev *, int type,
306 unsigned int flags, struct ofpbuf *);
307 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
309 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
310 struct nlattr **options);
311 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
312 struct nlattr **options,
313 struct netdev_queue_stats *);
314 static int tc_query_class(const struct netdev *,
315 unsigned int handle, unsigned int parent,
316 struct ofpbuf **replyp);
317 static int tc_delete_class(const struct netdev *, unsigned int handle);
319 static int tc_del_qdisc(struct netdev *netdev);
320 static int tc_query_qdisc(const struct netdev *netdev);
322 static int tc_calc_cell_log(unsigned int mtu);
323 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
324 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
325 const struct tc_ratespec *rate);
326 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
328 struct netdev_dev_linux {
329 struct netdev_dev netdev_dev;
331 struct shash_node *shash_node;
332 unsigned int cache_valid;
334 /* The following are figured out "on demand" only. They are only valid
335 * when the corresponding VALID_* bit in 'cache_valid' is set. */
337 uint8_t etheraddr[ETH_ADDR_LEN];
338 struct in_addr address, netmask;
342 bool is_internal; /* Is this an openvswitch internal device? */
343 bool is_tap; /* Is this a tuntap device? */
344 uint32_t kbits_rate; /* Policing data. */
345 uint32_t kbits_burst;
346 bool have_vport_stats;
350 struct tap_state tap;
354 struct netdev_linux {
355 struct netdev netdev;
359 /* An AF_INET socket (used for ioctl operations). */
360 static int af_inet_sock = -1;
362 /* A Netlink routing socket that is not subscribed to any multicast groups. */
363 static struct nl_sock *rtnl_sock;
365 struct netdev_linux_notifier {
366 struct netdev_notifier notifier;
370 static struct shash netdev_linux_notifiers =
371 SHASH_INITIALIZER(&netdev_linux_notifiers);
372 static struct rtnetlink_notifier netdev_linux_poll_notifier;
374 /* This is set pretty low because we probably won't learn anything from the
375 * additional log messages. */
376 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
378 static int netdev_linux_init(void);
380 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
381 int cmd, const char *cmd_name);
382 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
383 const char *cmd_name);
384 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
385 int cmd, const char *cmd_name);
386 static int get_flags(const struct netdev *, int *flagsp);
387 static int set_flags(struct netdev *, int flags);
388 static int do_get_ifindex(const char *netdev_name);
389 static int get_ifindex(const struct netdev *, int *ifindexp);
390 static int do_set_addr(struct netdev *netdev,
391 int ioctl_nr, const char *ioctl_name,
392 struct in_addr addr);
393 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
394 static int set_etheraddr(const char *netdev_name, int hwaddr_family,
395 const uint8_t[ETH_ADDR_LEN]);
396 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
397 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
400 is_netdev_linux_class(const struct netdev_class *netdev_class)
402 return netdev_class->init == netdev_linux_init;
405 static struct netdev_dev_linux *
406 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
408 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
409 assert(is_netdev_linux_class(netdev_class));
411 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
414 static struct netdev_linux *
415 netdev_linux_cast(const struct netdev *netdev)
417 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
418 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
419 assert(is_netdev_linux_class(netdev_class));
421 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
425 netdev_linux_init(void)
427 static int status = -1;
429 /* Create AF_INET socket. */
430 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
431 status = af_inet_sock >= 0 ? 0 : errno;
433 VLOG_ERR("failed to create inet socket: %s", strerror(status));
436 /* Create rtnetlink socket. */
438 status = nl_sock_create(NETLINK_ROUTE, 0, 0, 0, &rtnl_sock);
440 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
/* Periodic work for this netdev provider: lets the rtnetlink notifier do its
 * processing. */
static void
netdev_linux_run(void)
{
    rtnetlink_notifier_run();
}
/* Counterpart of netdev_linux_run(): lets the rtnetlink notifier register
 * what it needs to be woken up for. */
static void
netdev_linux_wait(void)
{
    rtnetlink_notifier_wait();
}
461 netdev_linux_cache_cb(const struct rtnetlink_change *change,
462 void *aux OVS_UNUSED)
464 struct netdev_dev_linux *dev;
466 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
468 const struct netdev_class *netdev_class =
469 netdev_dev_get_class(base_dev);
471 if (is_netdev_linux_class(netdev_class)) {
472 dev = netdev_dev_linux_cast(base_dev);
473 dev->cache_valid = 0;
477 struct shash device_shash;
478 struct shash_node *node;
480 shash_init(&device_shash);
481 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
482 SHASH_FOR_EACH (node, &device_shash) {
484 dev->cache_valid = 0;
486 shash_destroy(&device_shash);
490 /* Creates the netdev device of 'type' with 'name'. */
492 netdev_linux_create_system(const struct netdev_class *class OVS_UNUSED,
493 const char *name, const struct shash *args,
494 struct netdev_dev **netdev_devp)
496 struct netdev_dev_linux *netdev_dev;
499 if (!shash_is_empty(args)) {
500 VLOG_WARN("%s: arguments for system devices should be empty", name);
503 if (!cache_notifier_refcount) {
504 error = rtnetlink_notifier_register(&netdev_linux_cache_notifier,
505 netdev_linux_cache_cb, NULL);
510 cache_notifier_refcount++;
512 netdev_dev = xzalloc(sizeof *netdev_dev);
513 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_linux_class);
515 *netdev_devp = &netdev_dev->netdev_dev;
519 /* For most types of netdevs we open the device for each call of
520 * netdev_open(). However, this is not the case with tap devices,
521 * since it is only possible to open the device once. In this
522 * situation we share a single file descriptor, and consequently
523 * buffers, across all readers. Therefore once data is read it will
524 * be unavailable to other reads for tap devices. */
526 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
527 const char *name, const struct shash *args,
528 struct netdev_dev **netdev_devp)
530 struct netdev_dev_linux *netdev_dev;
531 struct tap_state *state;
532 static const char tap_dev[] = "/dev/net/tun";
536 if (!shash_is_empty(args)) {
537 VLOG_WARN("%s: arguments for TAP devices should be empty", name);
540 netdev_dev = xzalloc(sizeof *netdev_dev);
541 state = &netdev_dev->state.tap;
543 /* Open tap device. */
544 state->fd = open(tap_dev, O_RDWR);
547 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
551 /* Create tap device. */
552 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
553 strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
554 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
555 VLOG_WARN("%s: creating tap device failed: %s", name,
561 /* Make non-blocking. */
562 error = set_nonblocking(state->fd);
567 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
568 *netdev_devp = &netdev_dev->netdev_dev;
577 destroy_tap(struct netdev_dev_linux *netdev_dev)
579 struct tap_state *state = &netdev_dev->state.tap;
581 if (state->fd >= 0) {
586 /* Destroys the netdev device 'netdev_dev_'. */
588 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
590 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
591 const char *type = netdev_dev_get_type(netdev_dev_);
593 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
594 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
597 if (!strcmp(type, "system")) {
598 cache_notifier_refcount--;
600 if (!cache_notifier_refcount) {
601 rtnetlink_notifier_unregister(&netdev_linux_cache_notifier);
603 } else if (!strcmp(type, "tap")) {
604 destroy_tap(netdev_dev);
611 netdev_linux_open(struct netdev_dev *netdev_dev_, int ethertype,
612 struct netdev **netdevp)
614 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
615 struct netdev_linux *netdev;
616 enum netdev_flags flags;
619 /* Allocate network device. */
620 netdev = xzalloc(sizeof *netdev);
622 netdev_init(&netdev->netdev, netdev_dev_);
624 error = netdev_get_flags(&netdev->netdev, &flags);
625 if (error == ENODEV) {
629 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
630 !netdev_dev->state.tap.opened) {
632 /* We assume that the first user of the tap device is the primary user
633 * and give them the tap FD. Subsequent users probably just expect
634 * this to be a system device so open it normally to avoid send/receive
635 * directions appearing to be reversed. */
636 netdev->fd = netdev_dev->state.tap.fd;
637 netdev_dev->state.tap.opened = true;
638 } else if (ethertype != NETDEV_ETH_TYPE_NONE) {
639 struct sockaddr_ll sll;
643 /* Create file descriptor. */
644 protocol = (ethertype == NETDEV_ETH_TYPE_ANY ? ETH_P_ALL
645 : ethertype == NETDEV_ETH_TYPE_802_2 ? ETH_P_802_2
647 netdev->fd = socket(PF_PACKET, SOCK_RAW, htons(protocol));
648 if (netdev->fd < 0) {
653 /* Set non-blocking mode. */
654 error = set_nonblocking(netdev->fd);
659 /* Get ethernet device index. */
660 error = get_ifindex(&netdev->netdev, &ifindex);
665 /* Bind to specific ethernet device. */
666 memset(&sll, 0, sizeof sll);
667 sll.sll_family = AF_PACKET;
668 sll.sll_ifindex = ifindex;
670 (struct sockaddr *) &sll, sizeof sll) < 0) {
672 VLOG_ERR("bind to %s failed: %s", netdev_dev_get_name(netdev_dev_),
677 /* Between the socket() and bind() calls above, the socket receives all
678 * packets of the requested type on all system interfaces. We do not
679 * want to receive that data, but there is no way to avoid it. So we
680 * must now drain out the receive queue. */
681 error = drain_rcvbuf(netdev->fd);
687 *netdevp = &netdev->netdev;
691 netdev_uninit(&netdev->netdev, true);
695 /* Closes and destroys 'netdev'. */
697 netdev_linux_close(struct netdev *netdev_)
699 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
701 if (netdev->fd > 0 && strcmp(netdev_get_type(netdev_), "tap")) {
707 /* Initializes 'svec' with a list of the names of all known network devices. */
709 netdev_linux_enumerate(struct svec *svec)
711 struct if_nameindex *names;
713 names = if_nameindex();
717 for (i = 0; names[i].if_name != NULL; i++) {
718 svec_add(svec, names[i].if_name);
720 if_freenameindex(names);
723 VLOG_WARN("could not obtain list of network device names: %s",
730 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
732 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
734 if (netdev->fd < 0) {
735 /* Device was opened with NETDEV_ETH_TYPE_NONE. */
740 ssize_t retval = read(netdev->fd, data, size);
743 } else if (errno != EINTR) {
744 if (errno != EAGAIN) {
745 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
746 strerror(errno), netdev_get_name(netdev_));
753 /* Registers with the poll loop to wake up from the next call to poll_block()
754 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
756 netdev_linux_recv_wait(struct netdev *netdev_)
758 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
759 if (netdev->fd >= 0) {
760 poll_fd_wait(netdev->fd, POLLIN);
764 /* Discards all packets waiting to be received from 'netdev'. */
766 netdev_linux_drain(struct netdev *netdev_)
768 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
769 if (netdev->fd < 0) {
771 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
773 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
774 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
778 drain_fd(netdev->fd, ifr.ifr_qlen);
781 return drain_rcvbuf(netdev->fd);
785 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
786 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
787 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
788 * the packet is too big or too small to transmit on the device.
790 * The caller retains ownership of 'buffer' in all cases.
792 * The kernel maintains a packet transmission queue, so the caller is not
793 * expected to do additional queuing of packets. */
795 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
797 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
799 /* XXX should support sending even if 'ethertype' was NETDEV_ETH_TYPE_NONE.
801 if (netdev->fd < 0) {
806 ssize_t retval = write(netdev->fd, data, size);
808 /* The Linux AF_PACKET implementation never blocks waiting for room
809 * for packets, instead returning ENOBUFS. Translate this into
810 * EAGAIN for the caller. */
811 if (errno == ENOBUFS) {
813 } else if (errno == EINTR) {
815 } else if (errno != EAGAIN) {
816 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
817 netdev_get_name(netdev_), strerror(errno));
820 } else if (retval != size) {
821 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
822 "%zu) on %s", retval, size, netdev_get_name(netdev_));
830 /* Registers with the poll loop to wake up from the next call to poll_block()
831 * when the packet transmission queue has sufficient room to transmit a packet
832 * with netdev_send().
834 * The kernel maintains a packet transmission queue, so the client is not
835 * expected to do additional queuing of packets. Thus, this function is
836 * unlikely to ever be used. It is included for completeness. */
838 netdev_linux_send_wait(struct netdev *netdev_)
840 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
841 if (netdev->fd < 0) {
843 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
844 poll_fd_wait(netdev->fd, POLLOUT);
846 /* TAP device always accepts packets.*/
847 poll_immediate_wake();
851 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
852 * otherwise a positive errno value. */
854 netdev_linux_set_etheraddr(struct netdev *netdev_,
855 const uint8_t mac[ETH_ADDR_LEN])
857 struct netdev_dev_linux *netdev_dev =
858 netdev_dev_linux_cast(netdev_get_dev(netdev_));
861 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
862 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
863 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
865 netdev_dev->cache_valid |= VALID_ETHERADDR;
866 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
874 /* Returns a pointer to 'netdev''s MAC address. The caller must not modify or
875 * free the returned buffer. */
877 netdev_linux_get_etheraddr(const struct netdev *netdev_,
878 uint8_t mac[ETH_ADDR_LEN])
880 struct netdev_dev_linux *netdev_dev =
881 netdev_dev_linux_cast(netdev_get_dev(netdev_));
882 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
883 int error = get_etheraddr(netdev_get_name(netdev_),
884 netdev_dev->etheraddr);
888 netdev_dev->cache_valid |= VALID_ETHERADDR;
890 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
894 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
895 * in bytes, not including the hardware header; thus, this is typically 1500
896 * bytes for Ethernet devices. */
898 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
900 struct netdev_dev_linux *netdev_dev =
901 netdev_dev_linux_cast(netdev_get_dev(netdev_));
902 if (!(netdev_dev->cache_valid & VALID_MTU)) {
906 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
907 SIOCGIFMTU, "SIOCGIFMTU");
911 netdev_dev->mtu = ifr.ifr_mtu;
912 netdev_dev->cache_valid |= VALID_MTU;
914 *mtup = netdev_dev->mtu;
/* Looks up the ifindex of 'netdev' with get_ifindex().  Returns the ifindex,
 * as a positive number, if successful.  On failure, returns the error from
 * get_ifindex() negated, that is, a negative errno value. */
static int
netdev_linux_get_ifindex(const struct netdev *netdev)
{
    int ifindex;
    int error = get_ifindex(netdev, &ifindex);

    if (error) {
        return -error;
    }
    return ifindex;
}
930 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
932 struct netdev_dev_linux *netdev_dev =
933 netdev_dev_linux_cast(netdev_get_dev(netdev_));
938 if (!(netdev_dev->cache_valid & VALID_CARRIER)) {
942 fn = xasprintf("/sys/class/net/%s/carrier",
943 netdev_get_name(netdev_));
944 fd = open(fn, O_RDONLY);
947 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
951 retval = read(fd, line, sizeof line);
954 if (error == EINVAL) {
955 /* This is the normal return value when we try to check carrier
956 * if the network device is not up. */
958 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
961 } else if (retval == 0) {
963 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
967 if (line[0] != '0' && line[0] != '1') {
969 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)",
973 netdev_dev->carrier = line[0] != '0';
974 netdev_dev->cache_valid |= VALID_CARRIER;
976 *carrier = netdev_dev->carrier;
987 /* Checks whether we can use RTM_GETLINK to get network device statistics.
988 * In pre-2.6.19 kernels, this was only available if wireless extensions were
991 check_for_working_netlink_stats(void)
993 /* Decide on the netdev_get_stats() implementation to use. Netlink is
994 * preferable, so if that works, we'll use it. */
995 int ifindex = do_get_ifindex("lo");
997 VLOG_WARN("failed to get ifindex for lo, "
998 "obtaining netdev stats from proc");
1001 struct netdev_stats stats;
1002 int error = get_stats_via_netlink(ifindex, &stats);
1004 VLOG_DBG("obtaining netdev stats via rtnetlink");
1007 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1008 "via proc (you are probably running a pre-2.6.19 "
1009 "kernel)", strerror(error));
1015 /* Brings the 'is_internal' and 'is_tap' members of 'netdev_dev' up-to-date. */
1017 netdev_linux_update_is_pseudo(struct netdev_dev_linux *netdev_dev)
1019 if (!(netdev_dev->cache_valid & VALID_IS_PSEUDO)) {
1020 const char *name = netdev_dev_get_name(&netdev_dev->netdev_dev);
1021 const char *type = netdev_dev_get_type(&netdev_dev->netdev_dev);
1023 netdev_dev->is_tap = !strcmp(type, "tap");
1024 netdev_dev->is_internal = false;
1025 if (!netdev_dev->is_tap) {
1026 struct ethtool_drvinfo drvinfo;
1029 memset(&drvinfo, 0, sizeof drvinfo);
1030 error = netdev_linux_do_ethtool(name,
1031 (struct ethtool_cmd *)&drvinfo,
1033 "ETHTOOL_GDRVINFO");
1035 if (!error && !strcmp(drvinfo.driver, "openvswitch")) {
1036 netdev_dev->is_internal = true;
1040 netdev_dev->cache_valid |= VALID_IS_PSEUDO;
1045 swap_uint64(uint64_t *a, uint64_t *b)
1052 /* Retrieves current device stats for 'netdev'. */
1054 netdev_linux_get_stats(const struct netdev *netdev_,
1055 struct netdev_stats *stats)
1057 struct netdev_dev_linux *netdev_dev =
1058 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1059 static int use_netlink_stats = -1;
1062 COVERAGE_INC(netdev_get_stats);
1064 if (netdev_dev->have_vport_stats ||
1065 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1067 error = netdev_vport_get_stats(netdev_, stats);
1068 netdev_dev->have_vport_stats = !error;
1069 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
1072 if (!netdev_dev->have_vport_stats) {
1073 if (use_netlink_stats < 0) {
1074 use_netlink_stats = check_for_working_netlink_stats();
1076 if (use_netlink_stats) {
1079 error = get_ifindex(netdev_, &ifindex);
1081 error = get_stats_via_netlink(ifindex, stats);
1084 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1088 /* If this port is an internal port then the transmit and receive stats
1089 * will appear to be swapped relative to the other ports since we are the
1090 * one sending the data, not a remote computer. For consistency, we swap
1091 * them back here. This does not apply if we are getting stats from the
1092 * vport layer because it always tracks stats from the perspective of the
1094 netdev_linux_update_is_pseudo(netdev_dev);
1095 if (!error && !netdev_dev->have_vport_stats &&
1096 (netdev_dev->is_internal || netdev_dev->is_tap)) {
1097 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1098 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1099 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1100 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1101 stats->rx_length_errors = 0;
1102 stats->rx_over_errors = 0;
1103 stats->rx_crc_errors = 0;
1104 stats->rx_frame_errors = 0;
1105 stats->rx_fifo_errors = 0;
1106 stats->rx_missed_errors = 0;
1107 stats->tx_aborted_errors = 0;
1108 stats->tx_carrier_errors = 0;
1109 stats->tx_fifo_errors = 0;
1110 stats->tx_heartbeat_errors = 0;
1111 stats->tx_window_errors = 0;
1117 /* Stores the features supported by 'netdev' into each of '*current',
1118 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1119 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
1120 * successful, otherwise a positive errno value. */
/* ethtool-backed feature query: issues ETHTOOL_GSET for the device and
 * translates the reply's SUPPORTED_*, ADVERTISED_*, speed/duplex and port
 * fields into OFPPF_* bits.
 *
 * NOTE(review): short lines (braces, 'int error', the error check after the
 * ioctl, the initial "*x = 0" assignments, the final return) are absent from
 * this listing.  The visible code dereferences every output pointer without
 * a null test; presumably the caller substitutes dummies for null
 * arguments -- confirm. */
1122 netdev_linux_get_features(struct netdev *netdev,
1123 uint32_t *current, uint32_t *advertised,
1124 uint32_t *supported, uint32_t *peer)
1126 struct ethtool_cmd ecmd;
/* Zero the request so that fields the driver does not fill in read as 0. */
1129 memset(&ecmd, 0, sizeof ecmd);
1130 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1131 ETHTOOL_GSET, "ETHTOOL_GSET");
1136 /* Supported features. */
1138 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1139 *supported |= OFPPF_10MB_HD;
1141 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1142 *supported |= OFPPF_10MB_FD;
1144 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1145 *supported |= OFPPF_100MB_HD;
1147 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1148 *supported |= OFPPF_100MB_FD;
1150 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1151 *supported |= OFPPF_1GB_HD;
1153 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1154 *supported |= OFPPF_1GB_FD;
1156 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1157 *supported |= OFPPF_10GB_FD;
1159 if (ecmd.supported & SUPPORTED_TP) {
1160 *supported |= OFPPF_COPPER;
1162 if (ecmd.supported & SUPPORTED_FIBRE) {
1163 *supported |= OFPPF_FIBER;
1165 if (ecmd.supported & SUPPORTED_Autoneg) {
1166 *supported |= OFPPF_AUTONEG;
1168 if (ecmd.supported & SUPPORTED_Pause) {
1169 *supported |= OFPPF_PAUSE;
1171 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1172 *supported |= OFPPF_PAUSE_ASYM;
1175 /* Advertised features. */
1177 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1178 *advertised |= OFPPF_10MB_HD;
1180 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1181 *advertised |= OFPPF_10MB_FD;
1183 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1184 *advertised |= OFPPF_100MB_HD;
1186 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1187 *advertised |= OFPPF_100MB_FD;
1189 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1190 *advertised |= OFPPF_1GB_HD;
1192 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1193 *advertised |= OFPPF_1GB_FD;
1195 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1196 *advertised |= OFPPF_10GB_FD;
1198 if (ecmd.advertising & ADVERTISED_TP) {
1199 *advertised |= OFPPF_COPPER;
1201 if (ecmd.advertising & ADVERTISED_FIBRE) {
1202 *advertised |= OFPPF_FIBER;
1204 if (ecmd.advertising & ADVERTISED_Autoneg) {
1205 *advertised |= OFPPF_AUTONEG;
1207 if (ecmd.advertising & ADVERTISED_Pause) {
1208 *advertised |= OFPPF_PAUSE;
1210 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1211 *advertised |= OFPPF_PAUSE_ASYM;
1214 /* Current settings. */
/* Speed selects the bit pair; a nonzero 'duplex' selects the full-duplex
 * variant.  10 Gb/s is reported as full duplex regardless of 'duplex'. */
1215 if (ecmd.speed == SPEED_10) {
1216 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1217 } else if (ecmd.speed == SPEED_100) {
1218 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1219 } else if (ecmd.speed == SPEED_1000) {
1220 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1221 } else if (ecmd.speed == SPEED_10000) {
1222 *current = OFPPF_10GB_FD;
1227 if (ecmd.port == PORT_TP) {
1228 *current |= OFPPF_COPPER;
1229 } else if (ecmd.port == PORT_FIBRE) {
1230 *current |= OFPPF_FIBER;
/* NOTE(review): the condition guarding the next line is not shown here;
 * presumably it tests the reply's autonegotiation field -- confirm. */
1234 *current |= OFPPF_AUTONEG;
1237 /* Peer advertisements. */
/* Link-partner abilities are not queried; '*peer' is always cleared. */
1238 *peer = 0; /* XXX */
1243 /* Set the features advertised by 'netdev' to 'advertise'. */
/* Read-modify-write of the device's ethtool settings: fetches the current
 * settings with ETHTOOL_GSET, replaces only the 'advertising' mask with the
 * ADVERTISED_* equivalents of the OFPPF_* bits in 'advertise', and writes the
 * result back with ETHTOOL_SSET.  The value returned by the visible return
 * statement is whatever the ETHTOOL_SSET call reports.
 *
 * NOTE(review): the error check that presumably follows the ETHTOOL_GSET call
 * is not visible in this listing. */
1245 netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1247 struct ethtool_cmd ecmd;
1250 memset(&ecmd, 0, sizeof ecmd);
1251 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1252 ETHTOOL_GSET, "ETHTOOL_GSET");
/* Rebuild the advertising mask from scratch; every other field keeps the
 * value just read from the driver. */
1257 ecmd.advertising = 0;
1258 if (advertise & OFPPF_10MB_HD) {
1259 ecmd.advertising |= ADVERTISED_10baseT_Half;
1261 if (advertise & OFPPF_10MB_FD) {
1262 ecmd.advertising |= ADVERTISED_10baseT_Full;
1264 if (advertise & OFPPF_100MB_HD) {
1265 ecmd.advertising |= ADVERTISED_100baseT_Half;
1267 if (advertise & OFPPF_100MB_FD) {
1268 ecmd.advertising |= ADVERTISED_100baseT_Full;
1270 if (advertise & OFPPF_1GB_HD) {
1271 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1273 if (advertise & OFPPF_1GB_FD) {
1274 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1276 if (advertise & OFPPF_10GB_FD) {
1277 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1279 if (advertise & OFPPF_COPPER) {
1280 ecmd.advertising |= ADVERTISED_TP;
1282 if (advertise & OFPPF_FIBER) {
1283 ecmd.advertising |= ADVERTISED_FIBRE;
1285 if (advertise & OFPPF_AUTONEG) {
1286 ecmd.advertising |= ADVERTISED_Autoneg;
1288 if (advertise & OFPPF_PAUSE) {
1289 ecmd.advertising |= ADVERTISED_Pause;
1291 if (advertise & OFPPF_PAUSE_ASYM) {
1292 ecmd.advertising |= ADVERTISED_Asym_Pause;
1294 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1295 ETHTOOL_SSET, "ETHTOOL_SSET");
1298 /* If 'netdev_name' is the name of a VLAN network device (e.g. one created with
1299 * vconfig(8)), sets '*vlan_vid' to the VLAN VID associated with that device
1300 * and returns 0. Otherwise returns a errno value (specifically ENOENT if
1301 * 'netdev_name' is the name of a network device that is not a VLAN device) and
1302 * sets '*vlan_vid' to -1. */
1304 netdev_linux_get_vlan_vid(const struct netdev *netdev, int *vlan_vid)
1306 const char *netdev_name = netdev_get_name(netdev);
1307 struct ds line = DS_EMPTY_INITIALIZER;
1308 FILE *stream = NULL;
1312 COVERAGE_INC(netdev_get_vlan_vid);
1313 fn = xasprintf("/proc/net/vlan/%s", netdev_name);
1314 stream = fopen(fn, "r");
1320 if (ds_get_line(&line, stream)) {
1321 if (ferror(stream)) {
1323 VLOG_ERR_RL(&rl, "error reading \"%s\": %s", fn, strerror(errno));
1326 VLOG_ERR_RL(&rl, "unexpected end of file reading \"%s\"", fn);
1331 if (!sscanf(ds_cstr(&line), "%*s VID: %d", vlan_vid)) {
1333 VLOG_ERR_RL(&rl, "parse error reading \"%s\" line 1: \"%s\"",
1334 fn, ds_cstr(&line));
/* printf-style templates for the tc(8) commands that install ingress
 * policing.
 *
 * POLICE_ADD_CMD takes the device name (%s) and creates the ingress qdisc
 * with handle ffff:.
 *
 * POLICE_CONFIG_CMD takes the device name (%s), the rate in kbit/s (%u) and
 * the burst in kbit (%u), and attaches a u32 filter that matches all IPv4
 * traffic and drops whatever exceeds the policer.  The numeric arguments are
 * unsigned 32-bit values, so they are formatted with %u; %d would print
 * rates of 2^31 kbit/s and above as negative numbers. */
#define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress"
#define POLICE_CONFIG_CMD "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 u32 match ip src 0.0.0.0/0 police rate %ukbit burst %uk mtu 65535 drop flowid :1"
1355 /* Removes ingress policing from 'netdev'. Returns 0 if successful, otherwise a
1356 * positive errno value.
1358 * This function is equivalent to running
1359 * /sbin/tc qdisc del dev %s handle ffff: ingress
1360 * but it is much, much faster. */
/* Builds an RTM_DELQDISC request for the "ingress" qdisc (handle ffff:,
 * parent TC_H_INGRESS) and sends it with tc_transact().  ENOENT and EINVAL
 * replies are not logged as failures.  The visible tail of the function
 * records "no policing" (rate 0, burst 0) in the device cache and marks
 * VALID_POLICING.
 *
 * NOTE(review): the lines between the warning and the cache update (and any
 * early returns) are not visible in this listing, so whether the cache is
 * updated after a logged failure cannot be confirmed from here. */
1363 netdev_linux_remove_policing(struct netdev *netdev)
1365 struct netdev_dev_linux *netdev_dev =
1366 netdev_dev_linux_cast(netdev_get_dev(netdev));
1367 const char *netdev_name = netdev_get_name(netdev);
1369 struct ofpbuf request;
1370 struct tcmsg *tcmsg;
1373 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
1377 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
1378 tcmsg->tcm_parent = TC_H_INGRESS;
1379 nl_msg_put_string(&request, TCA_KIND, "ingress");
1380 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
1382 error = tc_transact(&request, NULL);
/* ENOENT/EINVAL are tolerated silently; presumably they mean that there was
 * no ingress qdisc to delete -- confirm against kernel behavior. */
1383 if (error && error != ENOENT && error != EINVAL) {
1384 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1385 netdev_name, strerror(error));
1389 netdev_dev->kbits_rate = 0;
1390 netdev_dev->kbits_burst = 0;
1391 netdev_dev->cache_valid |= VALID_POLICING;
1395 /* Attempts to set input rate limiting (policing) policy. */
/* Installs ingress policing of 'kbits_rate' kbit/s with a burst of
 * 'kbits_burst' kbit on 'netdev'.
 *
 * The burst is normalized first: 0 when no rate is given, 1000 when a rate is
 * given without a burst.  If the cached settings already match, the visible
 * code treats the request as already satisfied.  Otherwise existing policing
 * is removed and the two tc(8) command templates are formatted into 'command'
 * and run, after which the new values are cached under VALID_POLICING.
 *
 * NOTE(review): both commands are run through system(), i.e. through a shell,
 * with the device name interpolated into the command line.  A device name
 * containing shell metacharacters would be interpreted by the shell; consider
 * a fork/exec with an explicit argument vector, or a Netlink request as in
 * netdev_linux_remove_policing().
 *
 * NOTE(review): the declaration of 'command', the test that presumably skips
 * installation when 'kbits_rate' is 0, and the return statements are not
 * visible in this listing. */
1397 netdev_linux_set_policing(struct netdev *netdev,
1398 uint32_t kbits_rate, uint32_t kbits_burst)
1400 struct netdev_dev_linux *netdev_dev =
1401 netdev_dev_linux_cast(netdev_get_dev(netdev));
1402 const char *netdev_name = netdev_get_name(netdev);
1405 COVERAGE_INC(netdev_set_policing);
1407 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1408 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1409 : kbits_burst); /* Stick with user-specified value. */
1411 if (netdev_dev->cache_valid & VALID_POLICING
1412 && netdev_dev->kbits_rate == kbits_rate
1413 && netdev_dev->kbits_burst == kbits_burst) {
1414 /* Assume that settings haven't changed since we last set them. */
/* Always start from a clean slate before installing new policing. */
1418 netdev_linux_remove_policing(netdev);
1420 snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
1421 if (system(command) != 0) {
1422 VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
1426 snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
1427 kbits_rate, kbits_burst);
1428 if (system(command) != 0) {
1429 VLOG_WARN_RL(&rl, "%s: problem configuring policing",
1434 netdev_dev->kbits_rate = kbits_rate;
1435 netdev_dev->kbits_burst = kbits_burst;
1436 netdev_dev->cache_valid |= VALID_POLICING;
/* Adds to 'types' the OVS name of every traffic-control implementation in
 * the NULL-terminated 'tcs' table that can be installed (has a 'tc_install'
 * callback) and has a non-empty 'ovs_name'.  'netdev' is unused.
 *
 * NOTE(review): the 'types' parameter declaration and the return statement
 * are not visible in this listing. */
1443 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1446 const struct tc_ops **opsp;
1448 for (opsp = tcs; *opsp != NULL; opsp++) {
1449 const struct tc_ops *ops = *opsp;
1450 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1451 svec_add(types, ops->ovs_name);
/* Scans the NULL-terminated 'tcs' table for the entry whose 'ovs_name' is
 * exactly 'name'.  'ovs_name' is compared without a null test, so every
 * entry is expected to have one.
 *
 * NOTE(review): the return statements are not visible in this listing;
 * presumably the matching entry is returned, or NULL if there is none. */
1457 static const struct tc_ops *
1458 tc_lookup_ovs_name(const char *name)
1460 const struct tc_ops **opsp;
1462 for (opsp = tcs; *opsp != NULL; opsp++) {
1463 const struct tc_ops *ops = *opsp;
1464 if (!strcmp(name, ops->ovs_name)) {
/* Scans the NULL-terminated 'tcs' table for the entry whose 'linux_name' is
 * exactly 'name'.  Entries with a null 'linux_name' are skipped.
 *
 * NOTE(review): the return statements are not visible in this listing;
 * presumably the matching entry is returned, or NULL if there is none. */
1471 static const struct tc_ops *
1472 tc_lookup_linux_name(const char *name)
1474 const struct tc_ops **opsp;
1476 for (opsp = tcs; *opsp != NULL; opsp++) {
1477 const struct tc_ops *ops = *opsp;
1478 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Searches the hash bucket selected by 'hash' in the queue map of 'netdev''s
 * current traffic-control state for a queue whose id is 'queue_id'.  The
 * caller supplies 'hash' so that a value it has already computed can be
 * reused.  'netdev_dev->tc' is dereferenced without a null test.
 *
 * NOTE(review): the 'hash' parameter declaration and the return statements
 * are not visible in this listing; presumably the match is returned, or NULL
 * if there is none. */
1485 static struct tc_queue *
1486 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1489 struct netdev_dev_linux *netdev_dev =
1490 netdev_dev_linux_cast(netdev_get_dev(netdev));
1491 struct tc_queue *queue;
1493 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1494 if (queue->queue_id == queue_id) {
/* Convenience wrapper around tc_find_queue__() that derives the hash from
 * 'queue_id' with hash_int(queue_id, 0) and returns that function's result. */
1501 static struct tc_queue *
1502 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1504 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Looks up the traffic-control implementation named 'type' and copies its
 * queue count into 'caps->n_queues'.  'netdev' is unused.
 *
 * NOTE(review): the 'type' parameter declaration, the handling of an unknown
 * 'type' (null 'ops') and the return statement are not visible in this
 * listing. */
1508 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1510 struct netdev_qos_capabilities *caps)
1512 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1516 caps->n_queues = ops->n_queues;
/* Refreshes the cached qdisc state with tc_query_qdisc(), stores the OVS name
 * of the active traffic-control implementation in '*typep', and, if that
 * implementation provides a 'qdisc_get' callback, lets it fill in 'details'.
 *
 * NOTE(review): the error check after tc_query_qdisc() and the value returned
 * when 'qdisc_get' is null are not visible in this listing. */
1521 netdev_linux_get_qos(const struct netdev *netdev,
1522 const char **typep, struct shash *details)
1524 struct netdev_dev_linux *netdev_dev =
1525 netdev_dev_linux_cast(netdev_get_dev(netdev));
1528 error = tc_query_qdisc(netdev);
1533 *typep = netdev_dev->tc->ops->ovs_name;
1534 return (netdev_dev->tc->ops->qdisc_get
1535 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
/* Switches 'netdev' to the QoS type named 'type', configured by 'details'.
 *
 * The type must name an implementation that has a 'tc_install' callback.  If
 * it is the implementation already active on the device, only its 'qdisc_set'
 * callback (if any) is invoked to reconfigure in place.  Otherwise the
 * existing qdisc is deleted and the new implementation is installed.
 *
 * The two assertions encode the contract with the callbacks: tc_del_qdisc()
 * must leave 'netdev_dev->tc' null on success, and 'tc_install' must set it
 * if and only if it returns 0.
 *
 * NOTE(review): the error checks and return statements are not visible in
 * this listing. */
1540 netdev_linux_set_qos(struct netdev *netdev,
1541 const char *type, const struct shash *details)
1543 struct netdev_dev_linux *netdev_dev =
1544 netdev_dev_linux_cast(netdev_get_dev(netdev));
1545 const struct tc_ops *new_ops;
1548 new_ops = tc_lookup_ovs_name(type);
1549 if (!new_ops || !new_ops->tc_install) {
1553 error = tc_query_qdisc(netdev);
1558 if (new_ops == netdev_dev->tc->ops) {
1559 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1561 /* Delete existing qdisc. */
1562 error = tc_del_qdisc(netdev);
1566 assert(netdev_dev->tc == NULL);
1568 /* Install new qdisc. */
1569 error = new_ops->tc_install(netdev, details);
1570 assert((error == 0) == (netdev_dev->tc != NULL));
/* Refreshes the cached qdisc state, looks up queue 'queue_id' with
 * tc_find_queue(), and asks the active implementation's 'class_get' callback
 * to describe it in 'details'.
 *
 * NOTE(review): the error check, the condition selecting the 'class_get'
 * branch (presumably "queue found") and the fallback value are not visible in
 * this listing. */
1577 netdev_linux_get_queue(const struct netdev *netdev,
1578 unsigned int queue_id, struct shash *details)
1580 struct netdev_dev_linux *netdev_dev =
1581 netdev_dev_linux_cast(netdev_get_dev(netdev));
1584 error = tc_query_qdisc(netdev);
1588 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1590 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
/* Refreshes the cached qdisc state and forwards the request to the active
 * implementation's 'class_set' callback.  The request is rejected, without
 * calling the callback, when 'queue_id' is not below the implementation's
 * 'n_queues' or when it has no 'class_set' callback.
 *
 * NOTE(review): the errno values returned on the rejection paths are not
 * visible in this listing. */
1596 netdev_linux_set_queue(struct netdev *netdev,
1597 unsigned int queue_id, const struct shash *details)
1599 struct netdev_dev_linux *netdev_dev =
1600 netdev_dev_linux_cast(netdev_get_dev(netdev));
1603 error = tc_query_qdisc(netdev);
1606 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1607 || !netdev_dev->tc->ops->class_set) {
1611 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
/* Refreshes the cached qdisc state and, if the active implementation has a
 * 'class_delete' callback, looks up queue 'queue_id' and passes it to that
 * callback.
 *
 * NOTE(review): the errno values for the failure paths and the condition
 * selecting the 'class_delete' branch (presumably "queue found") are not
 * visible in this listing. */
1615 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1617 struct netdev_dev_linux *netdev_dev =
1618 netdev_dev_linux_cast(netdev_get_dev(netdev));
1621 error = tc_query_qdisc(netdev);
1624 } else if (!netdev_dev->tc->ops->class_delete) {
1627 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1629 ? netdev_dev->tc->ops->class_delete(netdev, queue)
/* Refreshes the cached qdisc state and, if the active implementation has a
 * 'class_get_stats' callback, looks up queue 'queue_id' and asks that
 * callback to fill in 'stats'.
 *
 * NOTE(review): the errno values for the failure paths and the condition
 * selecting the 'class_get_stats' branch (presumably "queue found") are not
 * visible in this listing. */
1635 netdev_linux_get_queue_stats(const struct netdev *netdev,
1636 unsigned int queue_id,
1637 struct netdev_queue_stats *stats)
1639 struct netdev_dev_linux *netdev_dev =
1640 netdev_dev_linux_cast(netdev_get_dev(netdev));
1643 error = tc_query_qdisc(netdev);
1646 } else if (!netdev_dev->tc->ops->class_get_stats) {
1649 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1651 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
/* Starts a Netlink dump of the traffic classes on 'netdev': builds an
 * RTM_GETTCLASS request with 'tcm_parent' 0, hands it to nl_dump_start() on
 * 'rtnl_sock', and releases the request buffer.  Callers in this file treat a
 * false result as failure and otherwise iterate with nl_dump_next().
 *
 * NOTE(review): the check of tc_make_request()'s result and the return
 * statements are not visible in this listing. */
1657 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1659 struct ofpbuf request;
1660 struct tcmsg *tcmsg;
1662 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1666 tcmsg->tcm_parent = 0;
1667 nl_dump_start(dump, rtnl_sock, &request);
1668 ofpbuf_uninit(&request);
/* Invokes 'cb' once for each queue cached for 'netdev', passing the queue id,
 * the queue's configuration as produced by the active implementation's
 * 'class_get' callback, and 'aux'.
 *
 * A single 'details' map is reused across iterations: it is cleared before
 * each 'class_get' call and destroyed after the loop, so 'cb' must not retain
 * pointers into it.
 *
 * NOTE(review): the error handling around 'class_get' and the return
 * statements are not visible in this listing. */
1673 netdev_linux_dump_queues(const struct netdev *netdev,
1674 netdev_dump_queues_cb *cb, void *aux)
1676 struct netdev_dev_linux *netdev_dev =
1677 netdev_dev_linux_cast(netdev_get_dev(netdev));
1678 struct tc_queue *queue;
1679 struct shash details;
1683 error = tc_query_qdisc(netdev);
1686 } else if (!netdev_dev->tc->ops->class_get) {
1691 shash_init(&details);
1692 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
1693 shash_clear(&details);
1695 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
1697 (*cb)(queue->queue_id, &details, aux);
1702 shash_destroy(&details);
/* Dumps the kernel's traffic classes for 'netdev' over Netlink and passes
 * each reply, together with 'cb' and 'aux', to the active implementation's
 * 'class_dump_stats' callback.
 *
 * An error reported by nl_dump_done() takes precedence in the return value;
 * otherwise 'last_error' is returned.
 *
 * NOTE(review): the declarations of 'msg' and 'last_error', how 'last_error'
 * is updated inside the loop, and the early-return values are not visible in
 * this listing. */
1708 netdev_linux_dump_queue_stats(const struct netdev *netdev,
1709 netdev_dump_queue_stats_cb *cb, void *aux)
1711 struct netdev_dev_linux *netdev_dev =
1712 netdev_dev_linux_cast(netdev_get_dev(netdev));
1713 struct nl_dump dump;
1718 error = tc_query_qdisc(netdev);
1721 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1726 if (!start_queue_dump(netdev, &dump)) {
1729 while (nl_dump_next(&dump, &msg)) {
1730 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1736 error = nl_dump_done(&dump);
1737 return error ? error : last_error;
/* Stores the IPv4 address and netmask of 'netdev_' in '*address' and
 * '*netmask'.  Returns EADDRNOTAVAIL if the address is INADDR_ANY (i.e. none
 * is assigned), otherwise 0 on the visible return path.
 *
 * Results are cached in the device structure under VALID_IN4; the SIOCGIFADDR
 * and SIOCGIFNETMASK queries run only when the cache is not valid.
 *
 * NOTE(review): the error checks after the two queries are not visible in
 * this listing. */
1741 netdev_linux_get_in4(const struct netdev *netdev_,
1742 struct in_addr *address, struct in_addr *netmask)
1744 struct netdev_dev_linux *netdev_dev =
1745 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1747 if (!(netdev_dev->cache_valid & VALID_IN4)) {
1750 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
1751 SIOCGIFADDR, "SIOCGIFADDR");
1756 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
1757 SIOCGIFNETMASK, "SIOCGIFNETMASK");
1762 netdev_dev->cache_valid |= VALID_IN4;
1764 *address = netdev_dev->address;
1765 *netmask = netdev_dev->netmask;
1766 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
1770 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
1771 struct in_addr netmask)
1773 struct netdev_dev_linux *netdev_dev =
1774 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1777 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
1779 netdev_dev->cache_valid |= VALID_IN4;
1780 netdev_dev->address = address;
1781 netdev_dev->netmask = netmask;
1782 if (address.s_addr != INADDR_ANY) {
1783 error = do_set_addr(netdev_, SIOCSIFNETMASK,
1784 "SIOCSIFNETMASK", netmask);
/* Parses one line of /proc/net/if_inet6 (see netdev_linux_get_in6(), the
 * caller in this file): 16 two-digit hex bytes of the address are scanned
 * into 'in6->s6_addr', four hex fields are skipped, and the interface name
 * (at most 16 characters) is scanned into 'ifname'.
 *
 * NOTE(review): the scanf call line, the trailing 'ifname' argument and the
 * comparison that produces the return value are not visible in this listing;
 * presumably the function returns true only if all 17 conversions succeed. */
1791 parse_if_inet6_line(const char *line,
1792 struct in6_addr *in6, char ifname[16 + 1])
1794 uint8_t *s6 = in6->s6_addr;
/* X8 scans exactly two hex digits into a uint8_t. */
1795 #define X8 "%2"SCNx8
1797 " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
1798 "%*x %*x %*x %*x %16s\n",
1799 &s6[0], &s6[1], &s6[2], &s6[3],
1800 &s6[4], &s6[5], &s6[6], &s6[7],
1801 &s6[8], &s6[9], &s6[10], &s6[11],
1802 &s6[12], &s6[13], &s6[14], &s6[15],
1806 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
1807 * 'in6' is non-null) and returns true. Otherwise, returns false. */
/* Stores the IPv6 address of 'netdev_' in '*in6'.
 *
 * The address is cached under VALID_IN6.  On a cache miss the cached value
 * defaults to in6addr_any and /proc/net/if_inet6 is scanned line by line for
 * an entry whose interface name equals the device name.
 *
 * NOTE(review): '*in6' is written unconditionally below, so callers must pass
 * a non-null 'in6', despite the "if 'in6' is non-null" wording in the comment
 * that precedes this function.
 *
 * NOTE(review): the declarations of 'file' and 'line', the fopen() result
 * check, the loop exit after a match, fclose() and the return statement are
 * not visible in this listing. */
1809 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
1811 struct netdev_dev_linux *netdev_dev =
1812 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1813 if (!(netdev_dev->cache_valid & VALID_IN6)) {
/* Default result when no address is listed for this device. */
1817 netdev_dev->in6 = in6addr_any;
1819 file = fopen("/proc/net/if_inet6", "r");
1821 const char *name = netdev_get_name(netdev_);
1822 while (fgets(line, sizeof line, file)) {
1823 struct in6_addr in6_tmp;
1824 char ifname[16 + 1];
1825 if (parse_if_inet6_line(line, &in6_tmp, ifname)
1826 && !strcmp(name, ifname))
1828 netdev_dev->in6 = in6_tmp;
1834 netdev_dev->cache_valid |= VALID_IN6;
1836 *in6 = netdev_dev->in6;
/* Fills the generic socket address '*sa' with an AF_INET address for 'addr'.
 * A zeroed 'struct sockaddr_in' is built locally and then copied over a
 * zeroed '*sa', so any bytes of '*sa' beyond the copied structure stay zero.
 *
 * NOTE(review): one line between the 'sin_addr' assignment and the memset of
 * '*sa' is not visible in this listing. */
1841 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
1843 struct sockaddr_in sin;
1844 memset(&sin, 0, sizeof sin);
1845 sin.sin_family = AF_INET;
1846 sin.sin_addr = addr;
1849 memset(sa, 0, sizeof *sa);
1850 memcpy(sa, &sin, sizeof sin);
/* Issues the address-setting ioctl 'ioctl_nr' (named 'ioctl_name' for log
 * messages) on 'netdev' with 'addr' as the IPv4 address in the request.
 * Returns whatever netdev_linux_do_ioctl() returns.
 *
 * NOTE(review): strncpy() does not null-terminate 'ifr.ifr_name' when the
 * device name is as long as the field; confirm that names are always shorter
 * or that netdev_linux_do_ioctl() rewrites the name itself.
 *
 * NOTE(review): the declaration of 'ifr' and the final argument of the ioctl
 * call are not visible in this listing. */
1854 do_set_addr(struct netdev *netdev,
1855 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
1858 strncpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
1859 make_in4_sockaddr(&ifr.ifr_addr, addr);
1861 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
1865 /* Adds 'router' as a default IP gateway. */
/* Installs a default route (destination 0.0.0.0, genmask 0.0.0.0) through
 * gateway 'router' with the SIOCADDRT ioctl on 'af_inet_sock'.  The route is
 * flagged RTF_UP | RTF_GATEWAY.  'netdev' is unused: the request does not
 * name a device.  A failure is logged with the ioctl's errno.
 *
 * NOTE(review): the declaration of 'rt', the condition guarding the warning
 * and the return statement are not visible in this listing. */
1867 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
1869 struct in_addr any = { INADDR_ANY };
1873 memset(&rt, 0, sizeof rt);
1874 make_in4_sockaddr(&rt.rt_dst, any);
1875 make_in4_sockaddr(&rt.rt_gateway, router);
1876 make_in4_sockaddr(&rt.rt_genmask, any);
1877 rt.rt_flags = RTF_UP | RTF_GATEWAY;
1878 COVERAGE_INC(netdev_add_router);
1879 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
1881 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
/* Finds a route to 'host' by scanning /proc/net/route.  For a route that is
 * up and whose masked destination equals the masked 'host', stores the
 * gateway (or 0 when the host is directly reachable) in '*next_hop' and a
 * newly allocated copy of the interface name in '*netdev_name', which is
 * created with xstrdup(), so the caller presumably owns and frees it.
 * '*netdev_name' is set to NULL up front, so it stays NULL when nothing
 * matches.
 *
 * Each line must yield all 11 fields to be considered; lines that do not are
 * logged and, presumably, skipped.
 *
 * NOTE(review): the third parameter, several local declarations, the test
 * that chooses between "directly reachable" and "via gateway", the loop exit
 * after a match, fclose() and the return statements are not visible in this
 * listing. */
1887 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
1890 static const char fn[] = "/proc/net/route";
1895 *netdev_name = NULL;
1896 stream = fopen(fn, "r");
1897 if (stream == NULL) {
1898 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
1903 while (fgets(line, sizeof line, stream)) {
1906 uint32_t dest, gateway, mask;
1907 int refcnt, metric, mtu;
1908 unsigned int flags, use, window, irtt;
1911 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
1913 iface, &dest, &gateway, &flags, &refcnt,
1914 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
1916 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
1920 if (!(flags & RTF_UP)) {
1921 /* Skip routes that aren't up. */
1925 /* The values of 'dest', 'mask', and 'gateway' are given in
1926 * network byte order, so we don't need any endian
1927 * conversions here. */
1928 if ((dest & mask) == (host->s_addr & mask)) {
1930 /* The host is directly reachable. */
1931 next_hop->s_addr = 0;
1933 /* To reach the host, we must go through a gateway. */
1934 next_hop->s_addr = gateway;
1936 *netdev_name = xstrdup(iface);
1947 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
1948 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
1949 * returns 0. Otherwise, it returns a positive errno value; in particular,
1950 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
/* SIOCGARP-based ARP lookup.  The request is zeroed, the protocol address is
 * set to the AF_INET address 'ip', the hardware address family to
 * ARPHRD_ETHER, and the device to 'netdev''s name.  On success the first
 * ETH_ADDR_LEN bytes of the returned hardware address are copied to 'mac'.
 * Failures other than ENXIO are logged.
 *
 * NOTE(review): 'sin' is not zeroed before being copied into the request, so
 * any member not assigned explicitly is copied uninitialized; confirm that
 * the lines missing from this listing assign them, or build the address with
 * make_in4_sockaddr().
 *
 * NOTE(review): the declarations of 'r' and 'retval', the success test and
 * the return statement are not visible in this listing. */
1952 netdev_linux_arp_lookup(const struct netdev *netdev,
1953 uint32_t ip, uint8_t mac[ETH_ADDR_LEN])
1956 struct sockaddr_in sin;
1959 memset(&r, 0, sizeof r);
1960 sin.sin_family = AF_INET;
1961 sin.sin_addr.s_addr = ip;
1963 memcpy(&r.arp_pa, &sin, sizeof sin);
1964 r.arp_ha.sa_family = ARPHRD_ETHER;
1966 strncpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
1967 COVERAGE_INC(netdev_arp_lookup);
1968 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
1970 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
1971 } else if (retval != ENXIO) {
1972 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
1973 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
/* Translates a set of 'enum netdev_flags' bits into interface flags.  Only
 * NETDEV_UP and NETDEV_PROMISC are examined in the visible code.
 *
 * NOTE(review): the assignments made in each branch (presumably IFF_UP and
 * IFF_PROMISC) and the return statement are not visible in this listing. */
1979 nd_to_iff_flags(enum netdev_flags nd)
1982 if (nd & NETDEV_UP) {
1985 if (nd & NETDEV_PROMISC) {
/* Translates interface flags 'iff' into 'enum netdev_flags' bits, starting
 * from an empty set.  IFF_PROMISC maps to NETDEV_PROMISC.
 *
 * NOTE(review): the other mapping(s), presumably IFF_UP to NETDEV_UP, and the
 * return statement are not visible in this listing. */
1992 iff_to_nd_flags(int iff)
1994 enum netdev_flags nd = 0;
1998 if (iff & IFF_PROMISC) {
1999 nd |= NETDEV_PROMISC;
/* Clears the flags in 'off' and sets the flags in 'on' for 'netdev', and
 * reports the previous flags through '*old_flagsp'.
 *
 * The current interface flags are read first; 'off' is applied before 'on',
 * so a flag named in both ends up set.  set_flags() is called only when the
 * resulting value differs from the current one.
 *
 * NOTE(review): the error check after get_flags() and the return statement
 * are not visible in this listing. */
2005 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2006 enum netdev_flags on, enum netdev_flags *old_flagsp)
2008 int old_flags, new_flags;
2011 error = get_flags(netdev, &old_flags);
2013 *old_flagsp = iff_to_nd_flags(old_flags);
2014 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2015 if (new_flags != old_flags) {
2016 error = set_flags(netdev, new_flags);
2023 poll_notify(struct list *list)
2025 struct netdev_linux_notifier *notifier;
2026 LIST_FOR_EACH (notifier, node, list) {
2027 struct netdev_notifier *n = ¬ifier->notifier;
/* rtnetlink change callback registered by netdev_linux_poll_add().  Two paths
 * are visible: one looks up a notifier list in 'netdev_linux_notifiers', and
 * the other notifies every registered list via poll_notify().  'aux' is
 * unused.
 *
 * NOTE(review): the condition choosing between the two paths (presumably
 * whether 'change' is null) and the lookup key (presumably the changed
 * interface's name) are not visible in this listing. */
2033 netdev_linux_poll_cb(const struct rtnetlink_change *change,
2034 void *aux OVS_UNUSED)
2037 struct list *list = shash_find_data(&netdev_linux_notifiers,
2043 struct shash_node *node;
2044 SHASH_FOR_EACH (node, &netdev_linux_notifiers) {
2045 poll_notify(node->data);
2051 netdev_linux_poll_add(struct netdev *netdev,
2052 void (*cb)(struct netdev_notifier *), void *aux,
2053 struct netdev_notifier **notifierp)
2055 const char *netdev_name = netdev_get_name(netdev);
2056 struct netdev_linux_notifier *notifier;
2059 if (shash_is_empty(&netdev_linux_notifiers)) {
2060 int error = rtnetlink_notifier_register(&netdev_linux_poll_notifier,
2061 netdev_linux_poll_cb, NULL);
2067 list = shash_find_data(&netdev_linux_notifiers, netdev_name);
2069 list = xmalloc(sizeof *list);
2071 shash_add(&netdev_linux_notifiers, netdev_name, list);
2074 notifier = xmalloc(sizeof *notifier);
2075 netdev_notifier_init(¬ifier->notifier, netdev, cb, aux);
2076 list_push_back(list, ¬ifier->node);
2077 *notifierp = ¬ifier->notifier;
2082 netdev_linux_poll_remove(struct netdev_notifier *notifier_)
2084 struct netdev_linux_notifier *notifier =
2085 CONTAINER_OF(notifier_, struct netdev_linux_notifier, notifier);
2088 /* Remove 'notifier' from its list. */
2089 list = list_remove(¬ifier->node);
2090 if (list_is_empty(list)) {
2091 /* The list is now empty. Remove it from the hash and free it. */
2092 const char *netdev_name = netdev_get_name(notifier->notifier.netdev);
2093 shash_delete(&netdev_linux_notifiers,
2094 shash_find(&netdev_linux_notifiers, netdev_name));
2099 /* If that was the last notifier, unregister. */
2100 if (shash_is_empty(&netdev_linux_notifiers)) {
2101 rtnetlink_notifier_unregister(&netdev_linux_poll_notifier);
/* Provider table for this class of Linux network devices.  The initializer
 * is positional: each entry fills the next member of 'struct netdev_class'
 * in declaration order, so entries must not be reordered and a member with
 * no implementation must be given as NULL.
 *
 * NOTE(review): several entries (the class name and the init/run/wait, open,
 * close, recv, drain and send members, among others) are not visible in this
 * listing. */
2105 const struct netdev_class netdev_linux_class = {
2112 netdev_linux_create_system,
2113 netdev_linux_destroy,
2114 NULL, /* reconfigure */
2119 netdev_linux_enumerate,
2122 netdev_linux_recv_wait,
2126 netdev_linux_send_wait,
2128 netdev_linux_set_etheraddr,
2129 netdev_linux_get_etheraddr,
2130 netdev_linux_get_mtu,
2131 netdev_linux_get_ifindex,
2132 netdev_linux_get_carrier,
2133 netdev_linux_get_stats,
2134 netdev_vport_set_stats,
2136 netdev_linux_get_features,
2137 netdev_linux_set_advertisements,
2138 netdev_linux_get_vlan_vid,
2140 netdev_linux_set_policing,
2141 netdev_linux_get_qos_types,
2142 netdev_linux_get_qos_capabilities,
2143 netdev_linux_get_qos,
2144 netdev_linux_set_qos,
2145 netdev_linux_get_queue,
2146 netdev_linux_set_queue,
2147 netdev_linux_delete_queue,
2148 netdev_linux_get_queue_stats,
2149 netdev_linux_dump_queues,
2150 netdev_linux_dump_queue_stats,
2152 netdev_linux_get_in4,
2153 netdev_linux_set_in4,
2154 netdev_linux_get_in6,
2155 netdev_linux_add_router,
2156 netdev_linux_get_next_hop,
2157 netdev_linux_arp_lookup,
2159 netdev_linux_update_flags,
2161 netdev_linux_poll_add,
2162 netdev_linux_poll_remove,
/* Provider table for tap devices.  It shares its implementation with
 * 'netdev_linux_class' except for three visible entries: devices are created
 * with netdev_linux_create_tap(), there is no 'enumerate' implementation, and
 * there is no 'set_stats' implementation.
 *
 * The initializer is positional, so entries must stay in the declaration
 * order of 'struct netdev_class'.
 *
 * NOTE(review): several entries are not visible in this listing. */
2165 const struct netdev_class netdev_tap_class = {
2172 netdev_linux_create_tap,
2173 netdev_linux_destroy,
2174 NULL, /* reconfigure */
2179 NULL, /* enumerate */
2182 netdev_linux_recv_wait,
2186 netdev_linux_send_wait,
2188 netdev_linux_set_etheraddr,
2189 netdev_linux_get_etheraddr,
2190 netdev_linux_get_mtu,
2191 netdev_linux_get_ifindex,
2192 netdev_linux_get_carrier,
2193 netdev_linux_get_stats,
2194 NULL, /* set_stats */
2196 netdev_linux_get_features,
2197 netdev_linux_set_advertisements,
2198 netdev_linux_get_vlan_vid,
2200 netdev_linux_set_policing,
2201 netdev_linux_get_qos_types,
2202 netdev_linux_get_qos_capabilities,
2203 netdev_linux_get_qos,
2204 netdev_linux_set_qos,
2205 netdev_linux_get_queue,
2206 netdev_linux_set_queue,
2207 netdev_linux_delete_queue,
2208 netdev_linux_get_queue_stats,
2209 netdev_linux_dump_queues,
2210 netdev_linux_dump_queue_stats,
2212 netdev_linux_get_in4,
2213 netdev_linux_set_in4,
2214 netdev_linux_get_in6,
2215 netdev_linux_add_router,
2216 netdev_linux_get_next_hop,
2217 netdev_linux_arp_lookup,
2219 netdev_linux_update_flags,
2221 netdev_linux_poll_add,
2222 netdev_linux_poll_remove,
2225 /* HTB traffic control class. */
/* Upper bound on the HTB class minor numbers that are treated as queues:
 * htb_parse_tcmsg__() accepts minors 1..HTB_N_QUEUES under major 1 and maps
 * minor N to queue id N - 1.  The class 1:fffe used for the device-wide
 * settings lies above this bound. */
2227 #define HTB_N_QUEUES 0xf000
/* NOTE(review): the 'struct' header lines are not visible in this listing.
 * The first 'max_rate' below presumably belongs to 'struct htb' (the
 * per-device state) and the remaining members to 'struct htb_class' (one
 * queue) -- confirm. */
2231 unsigned int max_rate; /* In bytes/s. */
/* Embedded generic queue; htb_class_cast__() recovers the containing
 * 'struct htb_class' from a pointer to this member. */
2235 struct tc_queue tc_queue;
2236 unsigned int min_rate; /* In bytes/s. */
2237 unsigned int max_rate; /* In bytes/s. */
2238 unsigned int burst; /* In bytes. */
2239 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the 'struct htb' that contains 'netdev''s current traffic-control
 * state, recovered with CONTAINER_OF from 'netdev_dev->tc'.  This is only
 * meaningful while the active state was created by htb_install__(); the
 * function does not check that. */
2243 htb_get__(const struct netdev *netdev)
2245 struct netdev_dev_linux *netdev_dev =
2246 netdev_dev_linux_cast(netdev_get_dev(netdev));
2247 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
/* Allocates a 'struct htb', initializes its embedded 'tc' with the HTB ops
 * table, records 'max_rate', and makes it the device's current
 * traffic-control state.  This only sets up userspace bookkeeping; no
 * Netlink request is sent from the visible code.
 *
 * NOTE(review): 'max_rate' arrives as uint64_t; if the destination field is
 * the 'unsigned int max_rate' declared earlier in this file, larger values
 * are truncated -- confirm.
 *
 * NOTE(review): the declaration of 'htb' and the return statement are not
 * visible in this listing (htb_tc_load() uses the result as a
 * 'struct htb *'). */
2251 htb_install__(struct netdev *netdev, uint64_t max_rate)
2253 struct netdev_dev_linux *netdev_dev =
2254 netdev_dev_linux_cast(netdev_get_dev(netdev));
2257 htb = xmalloc(sizeof *htb);
2258 tc_init(&htb->tc, &tc_ops_htb);
2259 htb->max_rate = max_rate;
2261 netdev_dev->tc = &htb->tc;
2266 /* Create an HTB qdisc.
2268 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
/* Replaces whatever qdisc 'netdev' has with a root HTB qdisc, handle 1:.
 *
 * Any existing qdisc is deleted first (the result of that call is ignored).
 * The RTM_NEWQDISC request uses NLM_F_EXCL | NLM_F_CREATE, carries kind
 * "htb", and nests a zeroed 'struct tc_htb_glob' with 'rate2quantum' 10 under
 * TCA_OPTIONS as TCA_HTB_INIT.  Returns the result of tc_transact().
 *
 * NOTE(review): the declaration of 'opt_offset', the check of
 * tc_make_request()'s result, and two lines after the 'rate2quantum'
 * assignment (presumably further 'opt' members) are not visible in this
 * listing. */
2270 htb_setup_qdisc__(struct netdev *netdev)
2273 struct tc_htb_glob opt;
2274 struct ofpbuf request;
2275 struct tcmsg *tcmsg;
2277 tc_del_qdisc(netdev);
2279 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2280 NLM_F_EXCL | NLM_F_CREATE, &request);
2284 tcmsg->tcm_handle = tc_make_handle(1, 0);
2285 tcmsg->tcm_parent = TC_H_ROOT;
2287 nl_msg_put_string(&request, TCA_KIND, "htb");
2289 memset(&opt, 0, sizeof opt);
2290 opt.rate2quantum = 10;
2294 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2295 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2296 nl_msg_end_nested(&request, opt_offset);
2298 return tc_transact(&request, NULL);
2301 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2302 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
/* Creates or replaces the HTB class 'handle' under 'parent' on 'netdev' with
 * the rates, burst and priority in 'class'.
 *
 * The device MTU feeds the rate-table and buffer calculations.  The same
 * 'class->burst' is used for both the rate buffer and the ceil buffer.  The
 * RTM_NEWTCLASS request carries the parameters as TCA_HTB_PARMS followed by
 * the rate and ceil tables.  A failed transaction is logged with the full
 * set of parameters.
 *
 * NOTE(review): the result of netdev_get_mtu() is not checked in the visible
 * code; confirm that 'mtu' is always assigned.
 *
 * NOTE(review): the declarations of 'opt_offset', 'error' and 'mtu', the
 * check of tc_make_request()'s result, the condition guarding the warning and
 * the return statement are not visible in this listing. */
2304 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2305 unsigned int parent, struct htb_class *class)
2308 struct tc_htb_opt opt;
2309 struct ofpbuf request;
2310 struct tcmsg *tcmsg;
2314 netdev_get_mtu(netdev, &mtu);
2316 memset(&opt, 0, sizeof opt);
2317 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2318 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2319 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2320 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2321 opt.prio = class->priority;
2323 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2327 tcmsg->tcm_handle = handle;
2328 tcmsg->tcm_parent = parent;
2330 nl_msg_put_string(&request, TCA_KIND, "htb");
2331 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2332 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2333 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2334 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2335 nl_msg_end_nested(&request, opt_offset);
2337 error = tc_transact(&request, NULL);
2339 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2340 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2341 netdev_get_name(netdev),
2342 tc_get_major(handle), tc_get_minor(handle),
2343 tc_get_major(parent), tc_get_minor(parent),
2344 class->min_rate, class->max_rate,
2345 class->burst, class->priority, strerror(error));
2350 /* Parses Netlink attributes in 'nl_options' for HTB parameters and stores
2351 * them into '*class'. The stored values correspond to the linux-htb queue
2352 * details specified in the vswitch database documentation. */
/* Extracts the HTB class parameters from the nested attributes 'nl_options'
 * into '*class'.
 *
 * The policy requires a TCA_HTB_PARMS attribute at least as large as
 * 'struct tc_htb_opt'.  From it, the rate and ceil become 'min_rate' and
 * 'max_rate', the buffer (in ticks) is converted to a burst in bytes at the
 * class rate, and the priority is copied.
 *
 * NOTE(review): the return statements (after the parse-failure warning and
 * at the end) are not visible in this listing. */
2355 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2357 static const struct nl_policy tca_htb_policy[] = {
2358 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2359 .min_len = sizeof(struct tc_htb_opt) },
2362 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2363 const struct tc_htb_opt *htb;
2365 if (!nl_parse_nested(nl_options, tca_htb_policy,
2366 attrs, ARRAY_SIZE(tca_htb_policy))) {
2367 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2371 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2372 class->min_rate = htb->rate.rate;
2373 class->max_rate = htb->ceil.rate;
2374 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2375 class->priority = htb->prio;
/* Parses the traffic-class message 'tcmsg'.  'queue_id' and 'options' are
 * optional outputs (each is tested for null before use); 'stats' is passed
 * straight through to tc_parse_class().
 *
 * When 'queue_id' is requested, the class handle must have major 1 and a
 * minor in 1..HTB_N_QUEUES; the queue id is the minor minus 1.  When
 * 'options' is requested, the class parameters are parsed with
 * htb_parse_tca_options__().
 *
 * NOTE(review): the handling of a handle outside that range (presumably an
 * error) and the return statement are not visible in this listing. */
2380 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2381 struct htb_class *options,
2382 struct netdev_queue_stats *stats)
2384 struct nlattr *nl_options;
2385 unsigned int handle;
2388 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2389 if (!error && queue_id) {
2390 unsigned int major = tc_get_major(handle);
2391 unsigned int minor = tc_get_minor(handle);
2392 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2393 *queue_id = minor - 1;
2398 if (!error && options) {
2399 error = htb_parse_tca_options__(nl_options, options);
2405 htb_parse_qdisc_details__(struct netdev *netdev,
2406 const struct shash *details, struct htb_class *hc)
2408 const char *max_rate_s;
2410 max_rate_s = shash_find_data(details, "max-rate");
2411 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2412 if (!hc->max_rate) {
2415 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2416 hc->max_rate = netdev_features_to_bps(current) / 8;
2418 hc->min_rate = hc->max_rate;
/* Fills '*hc' from the per-queue 'details' map, clamping each value against
 * the device-wide limit held in the 'struct htb' for 'netdev'.
 *
 * Rates and burst are given in bits and stored in bytes (divided by 8):
 *   - "min-rate": raised to at least 1500 bytes/s, then capped at the
 *     device's maximum rate;
 *   - "max-rate": raised to at least the minimum rate, then capped at the
 *     device's maximum rate;
 *   - "burst": 0 if absent, then raised to at least MTU + 64 bytes;
 *   - "priority": 0 if absent.
 *
 * The strings are converted with strtoull()/strtoul() without validation, so
 * text that is not a number is read as 0.
 *
 * NOTE(review): the declaration of 'mtu', the null test on 'min_rate_s' with
 * its error return, the default used when "max-rate" is absent, and the final
 * return are not visible in this listing.  The opening of the block comment
 * about 'burst' is also missing. */
2424 htb_parse_class_details__(struct netdev *netdev,
2425 const struct shash *details, struct htb_class *hc)
2427 const struct htb *htb = htb_get__(netdev);
2428 const char *min_rate_s = shash_find_data(details, "min-rate");
2429 const char *max_rate_s = shash_find_data(details, "max-rate");
2430 const char *burst_s = shash_find_data(details, "burst");
2431 const char *priority_s = shash_find_data(details, "priority");
2434 /* min-rate. Don't allow a min-rate below 1500 bytes/s. */
2436 /* min-rate is required. */
2439 hc->min_rate = strtoull(min_rate_s, NULL, 10) / 8;
2440 hc->min_rate = MAX(hc->min_rate, 1500);
2441 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2444 hc->max_rate = (max_rate_s
2445 ? strtoull(max_rate_s, NULL, 10) / 8
2447 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2448 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2452 * According to hints in the documentation that I've read, it is important
2453 * that 'burst' be at least as big as the largest frame that might be
2454 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2455 * but having it a bit too small is a problem. Since netdev_get_mtu()
2456 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2457 * the MTU. We actually add 64, instead of 14, as a guard against
2458 * additional headers get tacked on somewhere that we're not aware of. */
2459 netdev_get_mtu(netdev, &mtu);
2460 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2461 hc->burst = MAX(hc->burst, mtu + 64);
2464 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for class 'handle' under 'parent' on 'netdev' and parses
 * the reply into 'options' and 'stats' via htb_parse_tcmsg__() (no queue id
 * is requested).  The reply buffer is deleted after parsing.
 *
 * NOTE(review): the condition guarding the parse (presumably "query
 * succeeded") and the return statement are not visible in this listing. */
2470 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2471 unsigned int parent, struct htb_class *options,
2472 struct netdev_queue_stats *stats)
2474 struct ofpbuf *reply;
2477 error = tc_query_class(netdev, handle, parent, &reply);
2479 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2480 ofpbuf_delete(reply);
/* Installs HTB on 'netdev': creates the root qdisc, creates the device-wide
 * class 1:fffe under 1:0 with the settings derived from 'details', and then
 * records the HTB state in userspace with the resulting maximum rate.
 *
 * NOTE(review): the conditions between the steps (presumably each step runs
 * only if the previous one succeeded) and the return statement are not
 * visible in this listing. */
2486 htb_tc_install(struct netdev *netdev, const struct shash *details)
2490 error = htb_setup_qdisc__(netdev);
2492 struct htb_class hc;
2494 htb_parse_qdisc_details__(netdev, details, &hc);
2495 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2496 tc_make_handle(1, 0), &hc);
2498 htb_install__(netdev, hc.max_rate);
/* Returns the 'struct htb_class' that embeds 'queue' as its 'tc_queue'
 * member.  The const qualifier of 'queue' is dropped in the result. */
2504 static struct htb_class *
2505 htb_class_cast__(const struct tc_queue *queue)
2507 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Records the settings in 'hc' for queue 'queue_id' in the userspace queue
 * map of 'netdev''s HTB state.  An existing entry is updated in place;
 * otherwise a new 'struct htb_class' is allocated and inserted.  Only the
 * four tunables are copied from 'hc'.  No Netlink request is sent from here.
 *
 * NOTE(review): the "if (queue) ... else" lines that choose between the two
 * paths are not visible in this listing. */
2511 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2512 const struct htb_class *hc)
2514 struct htb *htb = htb_get__(netdev);
/* Computed once and shared by the lookup and the insertion. */
2515 size_t hash = hash_int(queue_id, 0);
2516 struct tc_queue *queue;
2517 struct htb_class *hcp;
2519 queue = tc_find_queue__(netdev, queue_id, hash);
2521 hcp = htb_class_cast__(queue);
2523 hcp = xmalloc(sizeof *hcp);
2524 queue = &hcp->tc_queue;
2525 queue->queue_id = queue_id;
2526 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2529 hcp->min_rate = hc->min_rate;
2530 hcp->max_rate = hc->max_rate;
2531 hcp->burst = hc->burst;
2532 hcp->priority = hc->priority;
/* Rebuilds the userspace HTB state for 'netdev' from the kernel's existing
 * configuration.  'nlmsg' is unused.
 *
 * The device-wide class 1:fffe is queried for the maximum rate, the HTB state
 * is installed with it, and then every class returned by the class dump that
 * parses as a valid queue is recorded with htb_update_queue__().  Classes
 * that fail to parse are skipped silently.
 *
 * NOTE(review): the result of htb_query_class__() is not checked in the
 * visible code; confirm that 'hc.max_rate' is initialized on a line missing
 * from this listing.  The declarations of 'msg' and 'htb' and the return
 * statements are also not visible. */
2536 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2539 struct nl_dump dump;
2540 struct htb_class hc;
2543 /* Get qdisc options. */
2545 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2546 htb = htb_install__(netdev, hc.max_rate);
2549 if (!start_queue_dump(netdev, &dump)) {
2552 while (nl_dump_next(&dump, &msg)) {
2553 unsigned int queue_id;
2555 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2556 htb_update_queue__(netdev, queue_id, &hc);
2559 nl_dump_done(&dump);
/* Tears down the struct htb that embeds 'tc'.  The visible code removes every
 * htb_class from the queue map, using the _SAFE iterator because entries are
 * removed while iterating. */
2565 htb_tc_destroy(struct tc *tc)
2567 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2568 struct htb_class *hc, *next;
2570 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2571 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Adds a "max-rate" entry to 'details' holding 8 times the max_rate stored in
 * 'netdev''s HTB state, formatted as a decimal string (presumably a
 * bytes-to-bits conversion). */
2579 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2581 const struct htb *htb = htb_get__(netdev);
2582 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
/* Reconfigures the HTB qdisc on 'netdev' from 'details': re-parses the
 * settings, re-issues the setup of class 1:fffe (parent 1:0), and stores the
 * new max_rate in the cached HTB state. */
2587 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2589 struct htb_class hc;
2592 htb_parse_qdisc_details__(netdev, details, &hc);
2593 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2594 tc_make_handle(1, 0), &hc);
2596 htb_get__(netdev)->max_rate = hc.max_rate;
/* Reports the configuration of 'queue' into 'details' as decimal strings:
 * "min-rate", "max-rate", and "burst" are the stored values multiplied by 8,
 * and "priority" is reported as stored.  "max-rate" appears under a test that
 * it differs from min_rate.  'netdev' is unused. */
2602 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2603 const struct tc_queue *queue, struct shash *details)
2605 const struct htb_class *hc = htb_class_cast__(queue);
2607 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2608 if (hc->min_rate != hc->max_rate) {
2609 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2611 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2613 shash_add(details, "priority", xasprintf("%u", hc->priority));
/* Configures queue 'queue_id' on 'netdev' from 'details': parses the
 * settings, sets up kernel class 1:(queue_id + 1) under parent 1:fffe, and
 * mirrors the result into the cached queue with htb_update_queue__(). */
2619 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2620 const struct shash *details)
2622 struct htb_class hc;
2625 error = htb_parse_class_details__(netdev, details, &hc);
/* Class minors are offset by one from queue IDs. */
2630 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2631 tc_make_handle(1, 0xfffe), &hc);
2636 htb_update_queue__(netdev, queue_id, &hc);
/* Deletes 'queue' from 'netdev': asks the kernel to delete class
 * 1:(queue_id + 1) and removes the queue from the cached HTB queue map. */
2641 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2643 struct htb_class *hc = htb_class_cast__(queue);
2644 struct htb *htb = htb_get__(netdev);
2647 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2649 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches statistics for 'queue' into 'stats' by querying kernel class
 * 1:(queue_id + 1) under parent 1:fffe.  Class options are not requested
 * (NULL is passed for them).  Returns htb_query_class__()'s result. */
2656 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2657 struct netdev_queue_stats *stats)
2659 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2660 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one class message 'nlmsg' from a stats dump: parses the handle and
 * statistics with tc_parse_class() and, when the handle is 1:M with
 * 1 <= M <= HTB_N_QUEUES, invokes 'cb' with queue ID M - 1, the statistics,
 * and 'aux'.  'netdev' is unused. */
2664 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2665 const struct ofpbuf *nlmsg,
2666 netdev_dump_queue_stats_cb *cb, void *aux)
2668 struct netdev_queue_stats stats;
2669 unsigned int handle, major, minor;
2672 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2677 major = tc_get_major(handle);
2678 minor = tc_get_minor(handle);
2679 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2680 (*cb)(minor - 1, &stats, aux);
/* Operations table for the HTB qdisc: Linux name "htb", OVS name "linux-htb",
 * HTB_N_QUEUES queues.  Members are positional; only some of them are visible
 * in this listing. */
2685 static const struct tc_ops tc_ops_htb = {
2686 "htb", /* linux_name */
2687 "linux-htb", /* ovs_name */
2688 HTB_N_QUEUES, /* n_queues */
2697 htb_class_get_stats,
2698 htb_class_dump_stats
2701 /* "linux-hfsc" traffic control class. */
/* Upper bound on the HFSC class minor that maps to a queue: minors
 * 1...HFSC_N_QUEUES correspond to queue IDs 0...HFSC_N_QUEUES - 1. */
2703 #define HFSC_N_QUEUES 0xf000
2711 struct tc_queue tc_queue;
/* Returns the struct hfsc that embeds 'netdev''s device-level 'tc' pointer as
 * its 'tc' member.  No check is made here that the installed tc is actually
 * an HFSC one. */
2716 static struct hfsc *
2717 hfsc_get__(const struct netdev *netdev)
2719 struct netdev_dev_linux *netdev_dev;
2720 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2721 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
/* Returns the struct hfsc_class that embeds 'queue' as its 'tc_queue'
 * member. */
2724 static struct hfsc_class *
2725 hfsc_class_cast__(const struct tc_queue *queue)
2727 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates a new struct hfsc, initializes its embedded tc with tc_ops_hfsc,
 * records 'max_rate' in it, and attaches it to 'netdev''s device as the
 * current tc.  Any previous value of the device's tc pointer is simply
 * overwritten by the visible code. */
2730 static struct hfsc *
2731 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2733 struct netdev_dev_linux * netdev_dev;
2736 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2737 hfsc = xmalloc(sizeof *hfsc);
2738 tc_init(&hfsc->tc, &tc_ops_hfsc);
2739 hfsc->max_rate = max_rate;
2740 netdev_dev->tc = &hfsc->tc;
/* Stores the rates in 'hc' into the hfsc_class kept for 'queue_id' on
 * 'netdev'.  The queue is looked up with tc_find_queue__(); the listing also
 * shows a path that allocates a fresh hfsc_class and inserts it into the HFSC
 * queue map under the same hash. */
2746 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2747 const struct hfsc_class *hc)
2751 struct hfsc_class *hcp;
2752 struct tc_queue *queue;
2754 hfsc = hfsc_get__(netdev);
2755 hash = hash_int(queue_id, 0);
2757 queue = tc_find_queue__(netdev, queue_id, hash);
2759 hcp = hfsc_class_cast__(queue);
2761 hcp = xmalloc(sizeof *hcp);
2762 queue = &hcp->tc_queue;
2763 queue->queue_id = queue_id;
2764 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
/* Only the two rates are tracked for an HFSC class. */
2767 hcp->min_rate = hc->min_rate;
2768 hcp->max_rate = hc->max_rate;
/* Parses the nested HFSC class options in 'nl_options' into 'class'.
 *
 * Three service curves are read (TCA_HFSC_RSC, TCA_HFSC_FSC, TCA_HFSC_USC),
 * each required to be at least sizeof(struct tc_service_curve) bytes.  Only a
 * restricted shape is accepted; each rejection logs a rate-limited warning:
 *
 *   - every curve must be linear (m1 == 0 and d == 0);
 *   - RSC and FSC must have the same m2;
 *   - RSC's m2 must not exceed USC's m2.
 *
 * On acceptance, 'class->min_rate' is FSC's m2 and 'class->max_rate' is USC's
 * m2. */
2772 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2774 const struct tc_service_curve *rsc, *fsc, *usc;
2775 static const struct nl_policy tca_hfsc_policy[] = {
2777 .type = NL_A_UNSPEC,
2779 .min_len = sizeof(struct tc_service_curve),
2782 .type = NL_A_UNSPEC,
2784 .min_len = sizeof(struct tc_service_curve),
2787 .type = NL_A_UNSPEC,
2789 .min_len = sizeof(struct tc_service_curve),
2792 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
2794 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
2795 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
2796 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
2800 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
2801 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
2802 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
/* Any nonzero m1 or d means a two-segment (non-linear) curve. */
2804 if (rsc->m1 != 0 || rsc->d != 0 ||
2805 fsc->m1 != 0 || fsc->d != 0 ||
2806 usc->m1 != 0 || usc->d != 0) {
2807 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2808 "Non-linear service curves are not supported.");
/* A real-time curve that differs from the link-share curve is rejected. */
2812 if (rsc->m2 != fsc->m2) {
2813 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2814 "Real-time service curves are not supported ");
2818 if (rsc->m2 > usc->m2) {
2819 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2820 "Min-rate service curve is greater than "
2821 "the max-rate service curve.");
2825 class->min_rate = fsc->m2;
2826 class->max_rate = usc->m2;
/* Parses the HFSC class message 'tcmsg'.  tc_parse_class() supplies the
 * handle, the nested options, and 'stats'.  In the visible code a handle of
 * 1:M with 1 <= M <= HFSC_N_QUEUES yields '*queue_id' = M - 1, and the nested
 * options are decoded into 'options' with hfsc_parse_tca_options__(). */
2831 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2832 struct hfsc_class *options,
2833 struct netdev_queue_stats *stats)
2836 unsigned int handle;
2837 struct nlattr *nl_options;
2839 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2845 unsigned int major, minor;
2847 major = tc_get_major(handle);
2848 minor = tc_get_minor(handle);
2849 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
2850 *queue_id = minor - 1;
2857 error = hfsc_parse_tca_options__(nl_options, options);
/* Queries the kernel for the class identified by 'handle' and 'parent' on
 * 'netdev' using tc_query_class() and hands the reply to
 * hfsc_parse_tcmsg__() to fill in 'options' and 'stats'.  No queue ID is
 * requested from the parser (NULL is passed in its place). */
2864 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
2865 unsigned int parent, struct hfsc_class *options,
2866 struct netdev_queue_stats *stats)
2869 struct ofpbuf *reply;
2871 error = tc_query_class(netdev, handle, parent, &reply);
2876 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
/* The reply buffer is released once it has been parsed. */
2877 ofpbuf_delete(reply);
2882 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
2883 struct hfsc_class *class)
2886 const char *max_rate_s;
2888 max_rate_s = shash_find_data(details, "max-rate");
2889 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2894 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2895 max_rate = netdev_features_to_bps(current) / 8;
2898 class->min_rate = max_rate;
2899 class->max_rate = max_rate;
/* Derives per-class HFSC rates for 'netdev' from 'details' and stores them in
 * 'class'.
 *
 * "min-rate" and "max-rate" are decimal strings that are divided by 8 before
 * use.  min_rate is raised to at least 1500 and capped at the qdisc's
 * max_rate.  max_rate is kept at or above min_rate and capped at the qdisc's
 * max_rate. */
2903 hfsc_parse_class_details__(struct netdev *netdev,
2904 const struct shash *details,
2905 struct hfsc_class * class)
2907 const struct hfsc *hfsc;
2908 uint32_t min_rate, max_rate;
2909 const char *min_rate_s, *max_rate_s;
2911 hfsc = hfsc_get__(netdev);
2912 min_rate_s = shash_find_data(details, "min-rate");
2913 max_rate_s = shash_find_data(details, "max-rate");
/* NOTE(review): the 64-bit quotient is stored in a uint32_t, so very large
 * configured rates would be truncated -- confirm whether that matters. */
2919 min_rate = strtoull(min_rate_s, NULL, 10) / 8;
2920 min_rate = MAX(min_rate, 1500);
2921 min_rate = MIN(min_rate, hfsc->max_rate);
2923 max_rate = (max_rate_s
2924 ? strtoull(max_rate_s, NULL, 10) / 8
2926 max_rate = MAX(max_rate, min_rate);
2927 max_rate = MIN(max_rate, hfsc->max_rate);
2929 class->min_rate = min_rate;
2930 class->max_rate = max_rate;
2935 /* Create an HFSC qdisc.
2937 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
/* Any existing root qdisc is first removed with tc_del_qdisc().  The request
 * is an RTM_NEWQDISC with NLM_F_EXCL | NLM_F_CREATE, handle 1:0, parent
 * TC_H_ROOT, kind "hfsc", and a struct tc_hfsc_qopt that is zeroed before
 * use.  Returns tc_transact()'s result. */
2939 hfsc_setup_qdisc__(struct netdev * netdev)
2941 struct tcmsg *tcmsg;
2942 struct ofpbuf request;
2943 struct tc_hfsc_qopt opt;
2945 tc_del_qdisc(netdev);
2947 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2948 NLM_F_EXCL | NLM_F_CREATE, &request);
2954 tcmsg->tcm_handle = tc_make_handle(1, 0);
2955 tcmsg->tcm_parent = TC_H_ROOT;
2957 memset(&opt, 0, sizeof opt);
2960 nl_msg_put_string(&request, TCA_KIND, "hfsc");
2961 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
2963 return tc_transact(&request, NULL);
2966 /* Create an HFSC class.
2968 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
2969 * sc rate <min_rate> ul rate <max_rate>" */
/* The request is an RTM_NEWTCLASS with NLM_F_CREATE.  'class->min_rate' is
 * sent as the m2 of both the RSC and FSC curves and 'class->max_rate' as the
 * m2 of the USC curve.  A failed transaction is reported with a rate-limited
 * warning that names the device, handles, and rates. */
2971 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
2972 unsigned int parent, struct hfsc_class *class)
2976 struct tcmsg *tcmsg;
2977 struct ofpbuf request;
2978 struct tc_service_curve min, max;
2980 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2986 tcmsg->tcm_handle = handle;
2987 tcmsg->tcm_parent = parent;
2991 min.m2 = class->min_rate;
2995 max.m2 = class->max_rate;
2997 nl_msg_put_string(&request, TCA_KIND, "hfsc");
2998 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
/* The same 'min' curve serves as both real-time and link-share curve. */
2999 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3000 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3001 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3002 nl_msg_end_nested(&request, opt_offset);
3004 error = tc_transact(&request, NULL);
3006 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3007 "min-rate %ubps, max-rate %ubps (%s)",
3008 netdev_get_name(netdev),
3009 tc_get_major(handle), tc_get_minor(handle),
3010 tc_get_major(parent), tc_get_minor(parent),
3011 class->min_rate, class->max_rate, strerror(error));
/* Installs HFSC on 'netdev': creates the qdisc with hfsc_setup_qdisc__(),
 * sets up class 1:fffe (parent 1:0) from the settings that
 * hfsc_parse_qdisc_details__() derives from 'details', and records the parsed
 * max_rate through hfsc_install__(). */
3018 hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3021 struct hfsc_class class;
3023 error = hfsc_setup_qdisc__(netdev);
3029 hfsc_parse_qdisc_details__(netdev, details, &class);
3030 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3031 tc_make_handle(1, 0), &class);
3037 hfsc_install__(netdev, class.max_rate);
/* Rebuilds the in-memory HFSC state for 'netdev' from the kernel: queries
 * class 1:fffe to obtain the max_rate passed to hfsc_install__(), then walks
 * a queue dump and records, via hfsc_update_queue__(), every class for which
 * hfsc_parse_tcmsg__() returns 0.  'nlmsg' is unused. */
3042 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3046 struct nl_dump dump;
3047 struct hfsc_class hc;
3050 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3051 hfsc = hfsc_install__(netdev, hc.max_rate);
3053 if (!start_queue_dump(netdev, &dump)) {
3057 while (nl_dump_next(&dump, &msg)) {
3058 unsigned int queue_id;
/* 'hc' is reused as scratch space for each dumped class. */
3060 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3061 hfsc_update_queue__(netdev, queue_id, &hc);
3065 nl_dump_done(&dump);
/* Tears down the struct hfsc that embeds 'tc'.  The visible code removes
 * every hfsc_class from the queue map, using the _SAFE iterator because
 * entries are removed while iterating. */
3070 hfsc_tc_destroy(struct tc *tc)
3073 struct hfsc_class *hc, *next;
3075 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3077 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3078 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Adds a "max-rate" entry to 'details' holding 8 times the max_rate stored in
 * 'netdev''s HFSC state, formatted as a decimal string. */
3087 hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3089 const struct hfsc *hfsc;
3090 hfsc = hfsc_get__(netdev);
3091 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
/* Reconfigures the HFSC qdisc on 'netdev' from 'details': re-parses the
 * settings, re-issues the setup of class 1:fffe (parent 1:0), and stores the
 * new max_rate in the cached HFSC state. */
3096 hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3099 struct hfsc_class class;
3101 hfsc_parse_qdisc_details__(netdev, details, &class);
3102 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3103 tc_make_handle(1, 0), &class);
3106 hfsc_get__(netdev)->max_rate = class.max_rate;
/* Reports the configuration of 'queue' into 'details' as decimal strings:
 * "min-rate" is the stored min_rate times 8, and "max-rate" (stored max_rate
 * times 8) is added only when it differs from min_rate.  'netdev' is
 * unused. */
3113 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3114 const struct tc_queue *queue, struct shash *details)
3116 const struct hfsc_class *hc;
3118 hc = hfsc_class_cast__(queue);
3119 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3120 if (hc->min_rate != hc->max_rate) {
3121 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
/* Configures queue 'queue_id' on 'netdev' from 'details': parses the
 * settings, sets up kernel class 1:(queue_id + 1) under parent 1:fffe, and
 * mirrors the result into the cached queue with hfsc_update_queue__(). */
3127 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3128 const struct shash *details)
3131 struct hfsc_class class;
3133 error = hfsc_parse_class_details__(netdev, details, &class);
/* Class minors are offset by one from queue IDs. */
3138 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3139 tc_make_handle(1, 0xfffe), &class);
3144 hfsc_update_queue__(netdev, queue_id, &class);
/* Deletes 'queue' from 'netdev': asks the kernel to delete class
 * 1:(queue_id + 1) and removes the queue from the cached HFSC queue map. */
3149 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3153 struct hfsc_class *hc;
3155 hc = hfsc_class_cast__(queue);
3156 hfsc = hfsc_get__(netdev);
3158 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3160 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches statistics for 'queue' into 'stats' by querying kernel class
 * 1:(queue_id + 1) under parent 1:fffe.  Class options are not requested
 * (NULL is passed for them).  Returns hfsc_query_class__()'s result. */
3167 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3168 struct netdev_queue_stats *stats)
3170 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3171 tc_make_handle(1, 0xfffe), NULL, stats);
/* Handles one class message 'nlmsg' from a stats dump: parses the handle and
 * statistics with tc_parse_class() and, when the handle is 1:M with
 * 1 <= M <= HFSC_N_QUEUES, invokes 'cb' with queue ID M - 1, the statistics,
 * and 'aux'.  'netdev' is unused. */
3175 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3176 const struct ofpbuf *nlmsg,
3177 netdev_dump_queue_stats_cb *cb, void *aux)
3179 struct netdev_queue_stats stats;
3180 unsigned int handle, major, minor;
3183 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3188 major = tc_get_major(handle);
3189 minor = tc_get_minor(handle);
3190 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3191 (*cb)(minor - 1, &stats, aux);
/* Operations table for the HFSC qdisc: Linux name "hfsc", OVS name
 * "linux-hfsc", HFSC_N_QUEUES queues.  Members are positional; the trailing
 * comment on each line names the tc_ops member it fills. */
3196 static const struct tc_ops tc_ops_hfsc = {
3197 "hfsc", /* linux_name */
3198 "linux-hfsc", /* ovs_name */
3199 HFSC_N_QUEUES, /* n_queues */
3200 hfsc_tc_install, /* tc_install */
3201 hfsc_tc_load, /* tc_load */
3202 hfsc_tc_destroy, /* tc_destroy */
3203 hfsc_qdisc_get, /* qdisc_get */
3204 hfsc_qdisc_set, /* qdisc_set */
3205 hfsc_class_get, /* class_get */
3206 hfsc_class_set, /* class_set */
3207 hfsc_class_delete, /* class_delete */
3208 hfsc_class_get_stats, /* class_get_stats */
3209 hfsc_class_dump_stats /* class_dump_stats */
3212 /* "linux-default" traffic control class.
3214 * This class represents the default, unnamed Linux qdisc. It corresponds to
3215 * the "" (empty string) QoS type in the OVS database. */
/* Attaches the "default" tc object to 'netdev''s device.  The object lives in
 * a function-level static pointer, so one allocation (initialized with
 * tc_ops_default) is shared by every device this is called for. */
3218 default_install__(struct netdev *netdev)
3220 struct netdev_dev_linux *netdev_dev =
3221 netdev_dev_linux_cast(netdev_get_dev(netdev));
3222 static struct tc *tc;
3225 tc = xmalloc(sizeof *tc);
3226 tc_init(tc, &tc_ops_default);
3228 netdev_dev->tc = tc;
/* tc_install hook for the default qdisc: ignores 'details' and simply
 * attaches the shared default tc object to 'netdev'. */
3232 default_tc_install(struct netdev *netdev,
3233 const struct shash *details OVS_UNUSED)
3235 default_install__(netdev);
/* tc_load hook for the default qdisc: ignores 'nlmsg' and simply attaches the
 * shared default tc object to 'netdev'. */
3240 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3242 default_install__(netdev);
/* Operations table for the default qdisc.  It has no Linux qdisc name, and
 * the destroy, qdisc and per-class hooks shown here are all NULL. */
3246 static const struct tc_ops tc_ops_default = {
3247 NULL, /* linux_name */
3252 NULL, /* tc_destroy */
3253 NULL, /* qdisc_get */
3254 NULL, /* qdisc_set */
3255 NULL, /* class_get */
3256 NULL, /* class_set */
3257 NULL, /* class_delete */
3258 NULL, /* class_get_stats */
3259 NULL /* class_dump_stats */
3262 /* "linux-other" traffic control class. */
/* tc_load hook for qdiscs this file does not otherwise model: ignores 'nlmsg'
 * and attaches a tc object initialized with tc_ops_other to 'netdev''s
 * device.  The object lives in a function-level static pointer, so one
 * allocation is shared by every device this is called for. */
3267 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3269 struct netdev_dev_linux *netdev_dev =
3270 netdev_dev_linux_cast(netdev_get_dev(netdev));
3271 static struct tc *tc;
3274 tc = xmalloc(sizeof *tc);
3275 tc_init(tc, &tc_ops_other);
3277 netdev_dev->tc = tc;
/* Operations table for "linux-other".  It has no Linux qdisc name and cannot
 * be installed (tc_install is NULL); the destroy, qdisc and per-class hooks
 * are NULL as well. */
3281 static const struct tc_ops tc_ops_other = {
3282 NULL, /* linux_name */
3283 "linux-other", /* ovs_name */
3285 NULL, /* tc_install */
3287 NULL, /* tc_destroy */
3288 NULL, /* qdisc_get */
3289 NULL, /* qdisc_set */
3290 NULL, /* class_get */
3291 NULL, /* class_set */
3292 NULL, /* class_delete */
3293 NULL, /* class_get_stats */
3294 NULL /* class_dump_stats */
3297 /* Traffic control. */
3299 /* Number of kernel "tc" ticks per second. */
3300 static double ticks_per_s;
3302 /* Number of kernel "jiffies" per second. This is used for the purpose of
3303 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3304 * one jiffy's worth of data.
3306 * There are two possibilities here:
3308 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3309 * approximate range of 100 to 1024. That means that we really need to
3310 * make sure that the qdisc can buffer that much data.
3312 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3313 * has finely granular timers and there's no need to fudge additional room
3314 * for buffers. (There's no extra effort needed to implement that: the
3315 * large 'buffer_hz' is used as a divisor, so practically any number will
3316 * come out as 0 in the division. Small integer results in the case of
3317 * really high dividends won't have any real effect anyhow.) */
3319 static unsigned int buffer_hz;
3321 /* Returns tc handle 'major':'minor'. */
/* 'major' is shifted into the upper 16 bits before being combined with
 * 'minor' by TC_H_MAKE. */
3323 tc_make_handle(unsigned int major, unsigned int minor)
3325 return TC_H_MAKE(major << 16, minor);
3328 /* Returns the major number from 'handle'. */
/* TC_H_MAJ leaves the major in the upper 16 bits, hence the shift down. */
3330 tc_get_major(unsigned int handle)
3332 return TC_H_MAJ(handle) >> 16;
3335 /* Returns the minor number from 'handle'. */
/* TC_H_MIN's result is returned as is, with no shift. */
3337 tc_get_minor(unsigned int handle)
3339 return TC_H_MIN(handle);
/* Starts a traffic-control Netlink request for 'netdev' in 'request'.
 *
 * 'request' is initialized here with a 512-byte initial size.  A Netlink
 * header of the given 'type' is appended with NLM_F_REQUEST OR'ed into
 * 'flags', followed by a zero-filled struct tcmsg whose tcm_family is
 * AF_UNSPEC and whose tcm_ifindex comes from get_ifindex().  The handle and
 * parent fields are left for the caller, as the trailing comments say.
 *
 * The return type is a struct tcmsg pointer; presumably it points at the
 * tcmsg just appended -- the return statements are not visible in this
 * listing. */
3342 static struct tcmsg *
3343 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3344 struct ofpbuf *request)
3346 struct tcmsg *tcmsg;
3350 error = get_ifindex(netdev, &ifindex);
3355 ofpbuf_init(request, 512);
3356 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3357 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3358 tcmsg->tcm_family = AF_UNSPEC;
3359 tcmsg->tcm_ifindex = ifindex;
3360 /* Caller should fill in tcmsg->tcm_handle. */
3361 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on rtnl_sock with nl_sock_transact(), passing 'replyp'
 * through for the reply, and then uninitializes 'request'.  Callers in this
 * file therefore never uninitialize a request they hand to this function. */
3367 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3369 int error = nl_sock_transact(rtnl_sock, request, replyp);
3370 ofpbuf_uninit(request);
3377 /* The values in psched are not individually very meaningful, but they are
3378 * important. The tables below show some values seen in the wild.
3382 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3383 * (Before that, there are hints that it was 1000000000.)
3385 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3389 * -----------------------------------
3390 * [1] 000c8000 000f4240 000f4240 00000064
3391 * [2] 000003e8 00000400 000f4240 3b9aca00
3392 * [3] 000003e8 00000400 000f4240 3b9aca00
3393 * [4] 000003e8 00000400 000f4240 00000064
3394 * [5] 000003e8 00000040 000f4240 3b9aca00
3395 * [6] 000003e8 00000040 000f4240 000000f9
3397 * a b c d ticks_per_s buffer_hz
3398 * ------- --------- ---------- ------------- ----------- -------------
3399 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3400 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3401 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3402 * [4] 1,000 1,024 1,000,000 100 976,562 100
3403 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3404 * [6] 1,000 64 1,000,000 249 15,625,000 249
3406 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3407 * [2] 2.6.26-1-686-bigmem from Debian lenny
3408 * [3] 2.6.26-2-sparc64 from Debian lenny
3409 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3410 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3411 * [6] 2.6.34 from kernel.org on KVM */
/* NOTE(review): this is the body of the /proc/net/psched reader; its
 * signature is not visible in this listing.  It parses four hexadecimal
 * fields a, b, c, d, and computes ticks_per_s as a * c / b in double
 * precision.  Each failure (open, short read, invalid or unexpected
 * parameters) is reported with a warning. */
3413 static const char fn[] = "/proc/net/psched";
3414 unsigned int a, b, c, d;
3420 stream = fopen(fn, "r");
3422 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
/* All four fields must convert for the file to be usable. */
3426 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3427 VLOG_WARN("%s: read failed", fn);
3431 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3435 VLOG_WARN("%s: invalid scheduler parameters", fn);
/* The cast keeps the product from being computed in unsigned int. */
3439 ticks_per_s = (double) a * c / b;
3443 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3446 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3449 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3450 * rate of 'rate' bytes per second. */
3452 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3457 return (rate * ticks) / ticks_per_s;
3460 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3461 * rate of 'rate' bytes per second. */
/* A 'rate' of 0 yields 0 rather than dividing by zero.  'ticks_per_s' is
 * truncated to an integer before the multiplication, which is carried out in
 * unsigned long long. */
3463 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3468 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3471 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3472 * a transmission rate of 'rate' bytes per second. */
/* This is 'rate' divided by 'buffer_hz' using integer division, so a very
 * large 'buffer_hz' gives 0. */
3474 tc_buffer_per_jiffy(unsigned int rate)
3479 return rate / buffer_hz;
3482 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3483 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3484 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3485 * stores NULL into it if it is absent.
3487 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
 * 'msg'.
 *
 * TCA_KIND is required by the parse policy and TCA_OPTIONS is optional.  A
 * message that fails the policy is reported with a rate-limited warning.
 *
3490 * Returns 0 if successful, otherwise a positive errno value. */
3492 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3493 struct nlattr **options)
3495 static const struct nl_policy tca_policy[] = {
3496 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3497 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3499 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
/* Attributes start after the Netlink header and the struct tcmsg. */
3501 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3502 tca_policy, ta, ARRAY_SIZE(ta))) {
3503 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3508 *kind = nl_attr_get_string(ta[TCA_KIND]);
3512 *options = ta[TCA_OPTIONS];
3527 /* Given Netlink 'msg' that describes a class, extracts the class handle (whose
3528 * minor number callers map to a queue ID) into '*handlep', its TCA_OPTIONS attribute
3529 * into '*options', and its queue statistics into '*stats'. Any of the output
3530 * arguments may be null.
 *
 * Both TCA_OPTIONS and TCA_STATS2 are required by the parse policy, as are
 * TCA_STATS_BASIC and TCA_STATS_QUEUE inside TCA_STATS2.  From the statistics,
 * tx_bytes and tx_packets come from the basic stats and tx_errors from the
 * queue stats' drop counter.
 *
3532 * Returns 0 if successful, otherwise a positive errno value. */
3534 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3535 struct nlattr **options, struct netdev_queue_stats *stats)
3537 static const struct nl_policy tca_policy[] = {
3538 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3539 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3541 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3543 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3544 tca_policy, ta, ARRAY_SIZE(ta))) {
3545 VLOG_WARN_RL(&rl, "failed to parse class message");
/* The struct tcmsg sits directly after the Netlink header. */
3550 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3551 *handlep = tc->tcm_handle;
3555 *options = ta[TCA_OPTIONS];
3559 const struct gnet_stats_queue *gsq;
3560 struct gnet_stats_basic gsb;
3562 static const struct nl_policy stats_policy[] = {
3563 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3564 .min_len = sizeof gsb },
3565 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3566 .min_len = sizeof *gsq },
3568 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3570 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3571 sa, ARRAY_SIZE(sa))) {
3572 VLOG_WARN_RL(&rl, "failed to parse class stats");
3576 /* Alignment issues screw up the length of struct gnet_stats_basic on
3577 * some arch/bitsize combinations. Newer versions of Linux have a
3578 * struct gnet_stats_basic_packed, but we can't depend on that. The
3579 * easiest thing to do is just to make a copy. */
3580 memset(&gsb, 0, sizeof gsb);
3581 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3582 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3583 stats->tx_bytes = gsb.bytes;
3584 stats->tx_packets = gsb.packets;
3586 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3587 stats->tx_errors = gsq->drops;
/* 'stats' is zeroed here; presumably this is the failure path, but the
 * enclosing control flow is not visible in this listing. */
3597 memset(stats, 0, sizeof *stats);
3602 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
 * on 'netdev'. */
3605 tc_query_class(const struct netdev *netdev,
3606 unsigned int handle, unsigned int parent,
3607 struct ofpbuf **replyp)
3609 struct ofpbuf request;
3610 struct tcmsg *tcmsg;
/* An RTM_GETTCLASS request with NLM_F_ECHO; the reply goes to '*replyp'. */
3613 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3617 tcmsg->tcm_handle = handle;
3618 tcmsg->tcm_parent = parent;
3620 error = tc_transact(&request, replyp);
/* Failures are logged with the device name and both handles. */
3622 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3623 netdev_get_name(netdev),
3624 tc_get_major(handle), tc_get_minor(handle),
3625 tc_get_major(parent), tc_get_minor(parent),
3631 /* Equivalent to "tc class del dev <name> handle <handle>". */
/* Sends an RTM_DELTCLASS request for 'handle' with a parent of 0 and no
 * extra flags; no reply is requested.  A failed transaction is reported with
 * a rate-limited warning. */
3633 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3635 struct ofpbuf request;
3636 struct tcmsg *tcmsg;
3639 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3643 tcmsg->tcm_handle = handle;
3644 tcmsg->tcm_parent = 0;
3646 error = tc_transact(&request, NULL);
3648 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3649 netdev_get_name(netdev),
3650 tc_get_major(handle), tc_get_minor(handle),
3656 /* Equivalent to "tc qdisc del dev <name> root". */
/* Sends an RTM_DELQDISC request for handle 1:0 with parent TC_H_ROOT.  When
 * the deletion succeeds and the device has a cached tc object, that object's
 * tc_destroy hook is called if it has one, and the device's tc pointer is
 * reset to NULL. */
3658 tc_del_qdisc(struct netdev *netdev)
3660 struct netdev_dev_linux *netdev_dev =
3661 netdev_dev_linux_cast(netdev_get_dev(netdev));
3662 struct ofpbuf request;
3663 struct tcmsg *tcmsg;
3666 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
3670 tcmsg->tcm_handle = tc_make_handle(1, 0);
3671 tcmsg->tcm_parent = TC_H_ROOT;
3673 error = tc_transact(&request, NULL);
3674 if (error == EINVAL) {
3675 /* EINVAL probably means that the default qdisc was in use, in which
3676 * case we've accomplished our purpose. */
3679 if (!error && netdev_dev->tc) {
/* Not every tc_ops table provides a destroy hook. */
3680 if (netdev_dev->tc->ops->tc_destroy) {
3681 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3683 netdev_dev->tc = NULL;
3688 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3689 * kernel to determine what they are. Returns 0 if successful, otherwise a
3690 * positive errno value.
 *
 * The tc_ops table is chosen from the query's outcome: the table found by
 * tc_lookup_linux_name() for the reported kind; tc_ops_other when the kind
 * is unknown, cannot be parsed, or the query fails for a reason other than
 * ENOENT; and tc_ops_default when the query fails with ENOENT.  The chosen
 * table's tc_load hook then instantiates the tc object.  The result is the
 * query error if there was one, otherwise the load error. */
3692 tc_query_qdisc(const struct netdev *netdev)
3694 struct netdev_dev_linux *netdev_dev =
3695 netdev_dev_linux_cast(netdev_get_dev(netdev));
3696 struct ofpbuf request, *qdisc;
3697 const struct tc_ops *ops;
3698 struct tcmsg *tcmsg;
/* A cached tc object means there is nothing to query. */
3702 if (netdev_dev->tc) {
/* NOTE(review): the block comment that follows has no visible terminator in
 * this listing; its final line appears to be missing. */
3706 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3707 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3708 * 2.6.35 without that fix backported to it.
3710 * To avoid the OOPS, we must not make a request that would attempt to dump
3711 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3712 * few others. There are a few ways that I can see to do this, but most of
3713 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3714 * technique chosen here is to assume that any non-default qdisc that we
3715 * create will have a class with handle 1:0. The built-in qdiscs only have
3716 * a class with handle 0:0.
3718 * We could check for Linux 2.6.35+ and use a more straightforward method
3720 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
3724 tcmsg->tcm_handle = tc_make_handle(1, 0);
3725 tcmsg->tcm_parent = 0;
3727 /* Figure out what tc class to instantiate. */
3728 error = tc_transact(&request, &qdisc);
3732 error = tc_parse_qdisc(qdisc, &kind, NULL);
3734 ops = &tc_ops_other;
3736 ops = tc_lookup_linux_name(kind);
3738 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3739 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3741 ops = &tc_ops_other;
3744 } else if (error == ENOENT) {
3745 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3746 * other entity that doesn't have a handle 1:0. We will assume
3747 * that it's the system default qdisc. */
3748 ops = &tc_ops_default;
3751 /* Who knows? Maybe the device got deleted. */
3752 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3753 netdev_get_name(netdev), strerror(error));
3754 ops = &tc_ops_other;
3757 /* Instantiate it. */
/* The cast drops const because the tc_load hooks take a non-const netdev. */
3758 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
/* A load succeeds exactly when it leaves a tc object attached. */
3759 assert((load_error == 0) == (netdev_dev->tc != NULL));
3760 ofpbuf_delete(qdisc);
3762 return error ? error : load_error;
3765 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3766 approximate the time to transmit packets of various lengths. For an MTU of
3767 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3768 represents two possible packet lengths; for an MTU of 513 through 1024, four
3769 possible lengths; and so on.
3771 Returns, for the specified 'mtu', the number of bits that packet lengths
3772 need to be shifted right to fit within such a 256-entry table. */
3774 tc_calc_cell_log(unsigned int mtu)
/* ETH_PAYLOAD_MAX is a substitute value for 'mtu' -- presumably used when
 * the caller passes 0; the guarding condition is not visible here. */
3779 mtu = ETH_PAYLOAD_MAX;
/* Count the Ethernet and VLAN headers too, so that full frames fit. */
3781 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
3783 for (cell_log = 0; mtu >= 256; cell_log++) {
3790 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
 * of 'mtu'. */
3793 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
/* Zeroing the whole struct also clears the fields that the commented-out
 * assignments below would otherwise have to set. */
3795 memset(rate, 0, sizeof *rate);
3796 rate->cell_log = tc_calc_cell_log(mtu);
3797 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3798 /* rate->cell_align = 0; */ /* distro headers. */
/* The minimum packet unit is set to ETH_TOTAL_MIN. */
3799 rate->mpu = ETH_TOTAL_MIN;
3803 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
3804 * attribute of the specified "type".
3806 * See tc_calc_cell_log() above for a description of "rtab"s.
 *
 * The attribute payload is TC_RTAB_SIZE bytes.  Entry i holds the number of
 * ticks needed to send a packet of (i + 1) << cell_log bytes at 'rate->rate',
 * with sizes below 'rate->mpu' rounded up to 'rate->mpu'. */
3808 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
/* The payload is reserved uninitialized; the loop below fills every entry. */
3813 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
3814 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
3815 unsigned packet_size = (i + 1) << rate->cell_log;
3816 if (packet_size < rate->mpu) {
3817 packet_size = rate->mpu;
3819 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
3823 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
3824 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
3825 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
 * 0 may be passed.) */
3828 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
/* The burst is never smaller than one jiffy's worth of data plus 'mtu'. */
3830 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
/* The larger of the two sizes is converted to ticks at rate 'Bps'. */
3831 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
3835 /* Utility functions. */
/* Fills 'stats' for the device with index 'ifindex' by sending an
 * RTM_GETLINK request on rtnl_sock and copying each counter from the reply's
 * IFLA_STATS attribute (a struct rtnl_link_stats).  A reply without
 * IFLA_STATS is reported with a rate-limited warning.  The reply buffer is
 * deleted on every path shown here. */
3838 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
3840 /* Policy for RTNLGRP_LINK messages.
3842 * There are *many* more fields in these messages, but currently we only
3843 * care about these fields. */
3844 static const struct nl_policy rtnlgrp_link_policy[] = {
3845 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
3846 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
3847 .min_len = sizeof(struct rtnl_link_stats) },
3850 struct ofpbuf request;
3851 struct ofpbuf *reply;
3852 struct ifinfomsg *ifi;
3853 const struct rtnl_link_stats *rtnl_stats;
3854 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
3857 ofpbuf_init(&request, 0);
3858 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
3859 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
3860 ifi->ifi_family = PF_UNSPEC;
3861 ifi->ifi_index = ifindex;
3862 error = nl_sock_transact(rtnl_sock, &request, &reply);
/* The request is no longer needed once the transaction has returned. */
3863 ofpbuf_uninit(&request);
3868 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
3869 rtnlgrp_link_policy,
3870 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
3871 ofpbuf_delete(reply);
/* IFLA_STATS is optional in the policy, so its presence is checked here. */
3875 if (!attrs[IFLA_STATS]) {
3876 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
3877 ofpbuf_delete(reply);
3881 rtnl_stats = nl_attr_get(attrs[IFLA_STATS]);
3882 stats->rx_packets = rtnl_stats->rx_packets;
3883 stats->tx_packets = rtnl_stats->tx_packets;
3884 stats->rx_bytes = rtnl_stats->rx_bytes;
3885 stats->tx_bytes = rtnl_stats->tx_bytes;
3886 stats->rx_errors = rtnl_stats->rx_errors;
3887 stats->tx_errors = rtnl_stats->tx_errors;
3888 stats->rx_dropped = rtnl_stats->rx_dropped;
3889 stats->tx_dropped = rtnl_stats->tx_dropped;
3890 stats->multicast = rtnl_stats->multicast;
3891 stats->collisions = rtnl_stats->collisions;
3892 stats->rx_length_errors = rtnl_stats->rx_length_errors;
3893 stats->rx_over_errors = rtnl_stats->rx_over_errors;
3894 stats->rx_crc_errors = rtnl_stats->rx_crc_errors;
3895 stats->rx_frame_errors = rtnl_stats->rx_frame_errors;
3896 stats->rx_fifo_errors = rtnl_stats->rx_fifo_errors;
3897 stats->rx_missed_errors = rtnl_stats->rx_missed_errors;
3898 stats->tx_aborted_errors = rtnl_stats->tx_aborted_errors;
3899 stats->tx_carrier_errors = rtnl_stats->tx_carrier_errors;
3900 stats->tx_fifo_errors = rtnl_stats->tx_fifo_errors;
3901 stats->tx_heartbeat_errors = rtnl_stats->tx_heartbeat_errors;
3902 stats->tx_window_errors = rtnl_stats->tx_window_errors;
3904 ofpbuf_delete(reply);
/* Fills in '*stats' for the device named 'netdev_name' by scanning the text
 * file /proc/net/dev.
 *
 * NOTE(review): this excerpt omits several short lines of the function (its
 * return type, local declarations, some of the scan arguments and the return
 * statements), so the exact return values are not documented here. */
3910 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
3912 static const char fn[] = "/proc/net/dev";
3917 stream = fopen(fn, "r");
/* Warning emitted for an fopen() failure; it includes the errno text. */
3919 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
/* Examine the file one line at a time. */
3924 while (fgets(line, sizeof line, stream)) {
/* scanf conversion specifier for a single uint64_t counter. */
3927 #define X64 "%"SCNu64
/* Each of the two format rows below reads seven 64-bit counters and then
 * skips one further unsigned column ("%*u" assigns nothing). */
3930 X64 X64 X64 X64 X64 X64 X64 "%*u"
3931 X64 X64 X64 X64 X64 X64 X64 "%*u",
3937 &stats->rx_fifo_errors,
3938 &stats->rx_frame_errors,
3944 &stats->tx_fifo_errors,
/* The scan must yield exactly 15 conversions; any other count is reported
 * as a parse error for line 'ln' of the file. */
3946 &stats->tx_carrier_errors) != 15) {
3947 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
3948 } else if (!strcmp(devname, netdev_name)) {
/* This line describes the requested device.  None of the seven counters
 * below appears among the scan targets visible above; each is set to
 * UINT64_MAX (presumably meaning "not available" -- TODO confirm against
 * the netdev_stats documentation). */
3949 stats->rx_length_errors = UINT64_MAX;
3950 stats->rx_over_errors = UINT64_MAX;
3951 stats->rx_crc_errors = UINT64_MAX;
3952 stats->rx_missed_errors = UINT64_MAX;
3953 stats->tx_aborted_errors = UINT64_MAX;
3954 stats->tx_heartbeat_errors = UINT64_MAX;
3955 stats->tx_window_errors = UINT64_MAX;
/* Warning that /proc/net/dev yielded no statistics for 'netdev_name'. */
3961 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Queries the interface flags of 'netdev' with the SIOCGIFFLAGS ioctl, issued
 * through netdev_linux_do_ioctl() under the device's name, and stores the
 * resulting ifr.ifr_flags value in '*flags'.
 *
 * NOTE(review): the remaining ioctl arguments, any error check and the return
 * statement fall on lines not visible in this excerpt. */
3967 get_flags(const struct netdev *netdev, int *flags)
3972 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
3974 *flags = ifr.ifr_flags;
/* Sets the interface flags of 'netdev' to 'flags': the value is placed in
 * ifr.ifr_flags and applied with the SIOCSIFFLAGS ioctl through
 * netdev_linux_do_ioctl(), whose result is returned to the caller. */
3979 set_flags(struct netdev *netdev, int flags)
3983 ifr.ifr_flags = flags;
3984 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
/* Looks up the interface index of the device named 'netdev_name' with the
 * SIOCGIFINDEX ioctl on 'af_inet_sock' and returns ifr.ifr_ifindex.
 *
 * NOTE(review): the value returned when the ioctl fails is on a line not
 * visible in this excerpt. */
3989 do_get_ifindex(const char *netdev_name)
/* strncpy() leaves ifr_name without a terminating NUL when 'netdev_name' is
 * at least sizeof ifr.ifr_name bytes long.  NOTE(review): no memset() of
 * 'ifr' is visible here, unlike in get_etheraddr() and set_etheraddr(). */
3993 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
3994 COVERAGE_INC(netdev_get_ifindex);
3995 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
/* ioctl failed: log a warning carrying the device name and errno text. */
3996 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
3997 netdev_name, strerror(errno));
4000 return ifr.ifr_ifindex;
/* Stores the interface index of 'netdev_' in '*ifindexp', using the value
 * cached in the underlying netdev_dev_linux when it is marked valid.
 *
 * NOTE(review): the handling of a failed lookup (between the do_get_ifindex()
 * call and the cache update) and the return statement are on lines not
 * visible in this excerpt. */
4004 get_ifindex(const struct netdev *netdev_, int *ifindexp)
4006 struct netdev_dev_linux *netdev_dev =
4007 netdev_dev_linux_cast(netdev_get_dev(netdev_));
/* Cache miss: VALID_IFINDEX is not set, so query the index by name. */
4009 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4010 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
/* Record the index and mark the cached copy as valid. */
4014 netdev_dev->cache_valid |= VALID_IFINDEX;
4015 netdev_dev->ifindex = ifindex;
4017 *ifindexp = netdev_dev->ifindex;
/* Reads the hardware address of the device named 'netdev_name' with the
 * SIOCGIFHWADDR ioctl on 'af_inet_sock' and copies ETH_ADDR_LEN bytes of it
 * into 'ea'.
 *
 * NOTE(review): the return statements are on lines not visible in this
 * excerpt. */
4022 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
/* Start from an all-zero request, then fill in the device name. */
4027 memset(&ifr, 0, sizeof ifr);
4028 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4029 COVERAGE_INC(netdev_get_hwaddr);
4030 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
/* ioctl failed: log an error carrying the device name and errno text. */
4031 VLOG_ERR("ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4032 netdev_name, strerror(errno));
4035 hwaddr_family = ifr.ifr_hwaddr.sa_family;
/* Only AF_UNSPEC and ARPHRD_ETHER pass without comment; any other reported
 * family produces a warning. */
4036 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4037 VLOG_WARN("%s device has unknown hardware address family %d",
4038 netdev_name, hwaddr_family);
4040 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware address of the device named 'netdev_name' to the
 * ETH_ADDR_LEN bytes in 'mac', tagged with address family 'hwaddr_family',
 * using the SIOCSIFHWADDR ioctl on 'af_inet_sock'.
 *
 * NOTE(review): the return statements are on lines not visible in this
 * excerpt. */
4045 set_etheraddr(const char *netdev_name, int hwaddr_family,
4046 const uint8_t mac[ETH_ADDR_LEN])
/* Build the request: zeroed ifreq, device name, address family, address. */
4050 memset(&ifr, 0, sizeof ifr);
4051 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4052 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4053 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4054 COVERAGE_INC(netdev_set_hwaddr);
4055 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
/* ioctl failed: log an error carrying the device name and errno text. */
4056 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4057 netdev_name, strerror(errno));
/* Issues a SIOCETHTOOL ioctl on 'af_inet_sock' for the device named 'name',
 * passing 'ecmd' as the request payload.  'cmd_name' is used only in the
 * failure log message.
 *
 * NOTE(review): the line that uses 'cmd' (presumably storing it into 'ecmd')
 * and the return statements are not visible in this excerpt. */
4064 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4065 int cmd, const char *cmd_name)
/* Build the request: zeroed ifreq, device name, pointer to the ethtool
 * command structure. */
4069 memset(&ifr, 0, sizeof ifr);
4070 strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4071 ifr.ifr_data = (caddr_t) ecmd;
4074 COVERAGE_INC(netdev_ethtool);
4075 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
/* Failures other than EOPNOTSUPP are logged as warnings. */
4078 if (errno != EOPNOTSUPP) {
4079 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4080 "failed: %s", cmd_name, name, strerror(errno));
4082 /* The device doesn't support this operation. That's pretty
4083 * common, so there's no point in logging anything. */
/* Copies 'name' into 'ifr->ifr_name' and issues ioctl 'cmd' on 'af_inet_sock'
 * with 'ifr' as its argument.  The other fields of '*ifr' are passed through
 * as the caller set them.  'cmd_name' is used only in the debug message
 * logged when the ioctl returns -1.
 *
 * NOTE(review): the return statements are on lines not visible in this
 * excerpt. */
4090 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4091 const char *cmd_name)
4093 strncpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4094 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4095 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4103 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4104 int cmd, const char *cmd_name)
4109 ifr.ifr_addr.sa_family = AF_INET;
4110 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4112 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4113 *ip = sin->sin_addr;