2 * Copyright (c) 2009, 2010 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
21 #include <arpa/inet.h>
23 #include <linux/gen_stats.h>
24 #include <linux/if_tun.h>
26 #include <linux/types.h>
27 #include <linux/ethtool.h>
28 #include <linux/pkt_sched.h>
29 #include <linux/rtnetlink.h>
30 #include <linux/sockios.h>
31 #include <linux/version.h>
32 #include <sys/types.h>
33 #include <sys/ioctl.h>
34 #include <sys/socket.h>
35 #include <netpacket/packet.h>
36 #include <net/ethernet.h>
38 #include <linux/if_tunnel.h>
39 #include <net/if_arp.h>
40 #include <net/if_packet.h>
41 #include <net/route.h>
42 #include <netinet/in.h>
49 #include "dynamic-string.h"
50 #include "fatal-signal.h"
53 #include "netdev-provider.h"
54 #include "netdev-vport.h"
57 #include "openflow/openflow.h"
59 #include "poll-loop.h"
60 #include "rtnetlink.h"
61 #include "socket-util.h"
66 VLOG_DEFINE_THIS_MODULE(netdev_linux);
68 COVERAGE_DEFINE(netdev_get_vlan_vid);
69 COVERAGE_DEFINE(netdev_set_policing);
70 COVERAGE_DEFINE(netdev_arp_lookup);
71 COVERAGE_DEFINE(netdev_get_ifindex);
72 COVERAGE_DEFINE(netdev_get_hwaddr);
73 COVERAGE_DEFINE(netdev_set_hwaddr);
74 COVERAGE_DEFINE(netdev_ethtool);
76 /* These were introduced in Linux 2.6.14, so they might be missing if we have
78 #ifndef ADVERTISED_Pause
79 #define ADVERTISED_Pause (1 << 13)
81 #ifndef ADVERTISED_Asym_Pause
82 #define ADVERTISED_Asym_Pause (1 << 14)
85 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
88 #define TC_RTAB_SIZE 1024
91 static struct rtnetlink_notifier netdev_linux_cache_notifier;
92 static int cache_notifier_refcount;
95 VALID_IFINDEX = 1 << 0,
96 VALID_ETHERADDR = 1 << 1,
100 VALID_CARRIER = 1 << 5,
101 VALID_IS_PSEUDO = 1 << 6, /* Represents is_internal and is_tap. */
102 VALID_POLICING = 1 << 7,
103 VALID_HAVE_VPORT_STATS = 1 << 8
111 /* Traffic control. */
113 /* An instance of a traffic control class. Always associated with a particular
116 * Each TC implementation subclasses this with whatever additional data it
119 const struct tc_ops *ops;
120 struct hmap queues; /* Contains "struct tc_queue"s.
121 * Read by generic TC layer.
122 * Written only by TC implementation. */
125 /* One traffic control queue.
127 * Each TC implementation subclasses this with whatever additional data it
130 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
131 unsigned int queue_id; /* OpenFlow queue ID. */
134 /* A particular kind of traffic control. Each implementation generally maps to
135 * one particular Linux qdisc class.
137 * The functions below return 0 if successful or a positive errno value on
138 * failure, except where otherwise noted. All of them must be provided, except
139 * where otherwise noted. */
141 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
142 * This is null for tc_ops_default and tc_ops_other, for which there are no
143 * appropriate values. */
144 const char *linux_name;
146 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
147 const char *ovs_name;
149 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
150 * queues. The queues are numbered 0 through n_queues - 1. */
151 unsigned int n_queues;
153 /* Called to install this TC class on 'netdev'. The implementation should
154 * make the Netlink calls required to set up 'netdev' with the right qdisc
155 * and configure it according to 'details'. The implementation may assume
156 * that the current qdisc is the default; that is, there is no need for it
157 * to delete the current qdisc before installing itself.
159 * The contents of 'details' should be documented as valid for 'ovs_name'
160 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
161 * (which is built as ovs-vswitchd.conf.db(8)).
163 * This function must return 0 if and only if it sets 'netdev->tc' to an
164 * initialized 'struct tc'.
166 * (This function is null for tc_ops_other, which cannot be installed. For
167 * other TC classes it should always be nonnull.) */
168 int (*tc_install)(struct netdev *netdev, const struct shash *details);
170 /* Called when the netdev code determines (through a Netlink query) that
171 * this TC class's qdisc is installed on 'netdev', but we didn't install
172 * it ourselves and so don't know any of the details.
174 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
175 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
176 * implementation should parse the other attributes of 'nlmsg' as
177 * necessary to determine its configuration. If necessary it should also
178 * use Netlink queries to determine the configuration of queues on
181 * This function must return 0 if and only if it sets 'netdev->tc' to an
182 * initialized 'struct tc'. */
183 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
185 /* Destroys the data structures allocated by the implementation as part of
186 * 'tc'. (This includes destroying 'tc->queues' by calling
189 * The implementation should not need to perform any Netlink calls. If
190 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
191 * (But it may not be desirable.)
193 * This function may be null if 'tc' is trivial. */
194 void (*tc_destroy)(struct tc *tc);
196 /* Retrieves details of 'netdev->tc' configuration into 'details'.
198 * The implementation should not need to perform any Netlink calls, because
199 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
200 * cached the configuration.
202 * The contents of 'details' should be documented as valid for 'ovs_name'
203 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
204 * (which is built as ovs-vswitchd.conf.db(8)).
206 * This function may be null if 'tc' is not configurable.
208 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
210 /* Reconfigures 'netdev->tc' according to 'details', performing any
211 * required Netlink calls to complete the reconfiguration.
213 * The contents of 'details' should be documented as valid for 'ovs_name'
214 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
215 * (which is built as ovs-vswitchd.conf.db(8)).
217 * This function may be null if 'tc' is not configurable.
219 int (*qdisc_set)(struct netdev *, const struct shash *details);
221 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
222 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
224 * The contents of 'details' should be documented as valid for 'ovs_name'
225 * in the "other_config" column in the "Queue" table in
226 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
228 * The implementation should not need to perform any Netlink calls, because
229 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
230 * cached the queue configuration.
232 * This function may be null if 'tc' does not have queues ('n_queues' is
234 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
235 struct shash *details);
237 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
238 * 'details', performing any required Netlink calls to complete the
239 * reconfiguration. The caller ensures that 'queue_id' is less than
242 * The contents of 'details' should be documented as valid for 'ovs_name'
243 * in the "other_config" column in the "Queue" table in
244 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
246 * This function may be null if 'tc' does not have queues or its queues are
247 * not configurable. */
248 int (*class_set)(struct netdev *, unsigned int queue_id,
249 const struct shash *details);
251 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
252 * tc_queue's within 'netdev->tc->queues'.
254 * This function may be null if 'tc' does not have queues or its queues
255 * cannot be deleted. */
256 int (*class_delete)(struct netdev *, struct tc_queue *queue);
258 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
259 * 'struct tc_queue's within 'netdev->tc->queues'.
261 * On success, initializes '*stats'.
263 * This function may be null if 'tc' does not have queues or if it cannot
264 * report queue statistics. */
265 int (*class_get_stats)(const struct netdev *netdev,
266 const struct tc_queue *queue,
267 struct netdev_queue_stats *stats);
269 /* Extracts queue stats from 'nlmsg', which is a response to a
270 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
272 * This function may be null if 'tc' does not have queues or if it cannot
273 * report queue statistics. */
274 int (*class_dump_stats)(const struct netdev *netdev,
275 const struct ofpbuf *nlmsg,
276 netdev_dump_queue_stats_cb *cb, void *aux);
280 tc_init(struct tc *tc, const struct tc_ops *ops)
283 hmap_init(&tc->queues);
287 tc_destroy(struct tc *tc)
289 hmap_destroy(&tc->queues);
292 static const struct tc_ops tc_ops_htb;
293 static const struct tc_ops tc_ops_hfsc;
294 static const struct tc_ops tc_ops_default;
295 static const struct tc_ops tc_ops_other;
297 static const struct tc_ops *tcs[] = {
298 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
299 &tc_ops_hfsc, /* Hierarchical fair service curve. */
300 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
301 &tc_ops_other, /* Some other qdisc. */
305 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
306 static unsigned int tc_get_major(unsigned int handle);
307 static unsigned int tc_get_minor(unsigned int handle);
309 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
310 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
311 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
313 static struct tcmsg *tc_make_request(const struct netdev *, int type,
314 unsigned int flags, struct ofpbuf *);
315 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
317 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
318 struct nlattr **options);
319 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
320 struct nlattr **options,
321 struct netdev_queue_stats *);
322 static int tc_query_class(const struct netdev *,
323 unsigned int handle, unsigned int parent,
324 struct ofpbuf **replyp);
325 static int tc_delete_class(const struct netdev *, unsigned int handle);
327 static int tc_del_qdisc(struct netdev *netdev);
328 static int tc_query_qdisc(const struct netdev *netdev);
330 static int tc_calc_cell_log(unsigned int mtu);
331 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
332 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
333 const struct tc_ratespec *rate);
334 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
336 struct netdev_dev_linux {
337 struct netdev_dev netdev_dev;
339 struct shash_node *shash_node;
340 unsigned int cache_valid;
342 /* The following are figured out "on demand" only. They are only valid
343 * when the corresponding VALID_* bit in 'cache_valid' is set. */
345 uint8_t etheraddr[ETH_ADDR_LEN];
346 struct in_addr address, netmask;
350 bool is_internal; /* Is this an openvswitch internal device? */
351 bool is_tap; /* Is this a tuntap device? */
352 uint32_t kbits_rate; /* Policing data. */
353 uint32_t kbits_burst;
354 bool have_vport_stats;
358 struct tap_state tap;
362 struct netdev_linux {
363 struct netdev netdev;
367 /* An AF_INET socket (used for ioctl operations). */
368 static int af_inet_sock = -1;
370 /* A Netlink routing socket that is not subscribed to any multicast groups. */
371 static struct nl_sock *rtnl_sock;
373 struct netdev_linux_notifier {
374 struct netdev_notifier notifier;
378 static struct shash netdev_linux_notifiers =
379 SHASH_INITIALIZER(&netdev_linux_notifiers);
380 static struct rtnetlink_notifier netdev_linux_poll_notifier;
382 /* This is set pretty low because we probably won't learn anything from the
383 * additional log messages. */
384 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
386 static int netdev_linux_init(void);
388 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
389 int cmd, const char *cmd_name);
390 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
391 const char *cmd_name);
392 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
393 int cmd, const char *cmd_name);
394 static int get_flags(const struct netdev *, int *flagsp);
395 static int set_flags(struct netdev *, int flags);
396 static int do_get_ifindex(const char *netdev_name);
397 static int get_ifindex(const struct netdev *, int *ifindexp);
398 static int do_set_addr(struct netdev *netdev,
399 int ioctl_nr, const char *ioctl_name,
400 struct in_addr addr);
401 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
402 static int set_etheraddr(const char *netdev_name, int hwaddr_family,
403 const uint8_t[ETH_ADDR_LEN]);
404 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
405 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
408 is_netdev_linux_class(const struct netdev_class *netdev_class)
410 return netdev_class->init == netdev_linux_init;
/* Converts 'netdev_dev' to the 'struct netdev_dev_linux' that embeds it as its
 * 'netdev_dev' member.  Asserts that the device's class passes
 * is_netdev_linux_class() before performing the conversion. */
413 static struct netdev_dev_linux *
414 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
416 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
417 assert(is_netdev_linux_class(netdev_class));
419 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
/* Converts 'netdev' to the 'struct netdev_linux' that embeds it as its
 * 'netdev' member.  Asserts that the class of the underlying netdev_dev passes
 * is_netdev_linux_class() before performing the conversion. */
422 static struct netdev_linux *
423 netdev_linux_cast(const struct netdev *netdev)
425 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
426 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
427 assert(is_netdev_linux_class(netdev_class));
429 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
433 netdev_linux_init(void)
435 static int status = -1;
437 /* Create AF_INET socket. */
438 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
439 status = af_inet_sock >= 0 ? 0 : errno;
441 VLOG_ERR("failed to create inet socket: %s", strerror(status));
444 /* Create rtnetlink socket. */
446 status = nl_sock_create(NETLINK_ROUTE, 0, 0, 0, &rtnl_sock);
448 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
457 netdev_linux_run(void)
459 rtnetlink_notifier_run();
463 netdev_linux_wait(void)
465 rtnetlink_notifier_wait();
469 netdev_linux_cache_cb(const struct rtnetlink_change *change,
470 void *aux OVS_UNUSED)
472 struct netdev_dev_linux *dev;
474 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
476 const struct netdev_class *netdev_class =
477 netdev_dev_get_class(base_dev);
479 if (is_netdev_linux_class(netdev_class)) {
480 dev = netdev_dev_linux_cast(base_dev);
481 dev->cache_valid = 0;
485 struct shash device_shash;
486 struct shash_node *node;
488 shash_init(&device_shash);
489 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
490 SHASH_FOR_EACH (node, &device_shash) {
492 dev->cache_valid = 0;
494 shash_destroy(&device_shash);
498 /* Creates system and internal devices. */
500 netdev_linux_create(const struct netdev_class *class,
501 const char *name, const struct shash *args,
502 struct netdev_dev **netdev_devp)
504 struct netdev_dev_linux *netdev_dev;
507 if (!shash_is_empty(args)) {
508 VLOG_WARN("%s: arguments for %s devices should be empty",
512 if (!cache_notifier_refcount) {
513 error = rtnetlink_notifier_register(&netdev_linux_cache_notifier,
514 netdev_linux_cache_cb, NULL);
519 cache_notifier_refcount++;
521 netdev_dev = xzalloc(sizeof *netdev_dev);
522 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
524 *netdev_devp = &netdev_dev->netdev_dev;
528 /* For most types of netdevs we open the device for each call of
529 * netdev_open(). However, this is not the case with tap devices,
530 * since it is only possible to open the device once. In this
531 * situation we share a single file descriptor, and consequently
532 * buffers, across all readers. Therefore once data is read it will
533 * be unavailable to other reads for tap devices. */
535 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
536 const char *name, const struct shash *args,
537 struct netdev_dev **netdev_devp)
539 struct netdev_dev_linux *netdev_dev;
540 struct tap_state *state;
541 static const char tap_dev[] = "/dev/net/tun";
545 if (!shash_is_empty(args)) {
546 VLOG_WARN("%s: arguments for TAP devices should be empty", name);
549 netdev_dev = xzalloc(sizeof *netdev_dev);
550 state = &netdev_dev->state.tap;
552 /* Open tap device. */
553 state->fd = open(tap_dev, O_RDWR);
556 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
560 /* Create tap device. */
561 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
562 strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
563 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
564 VLOG_WARN("%s: creating tap device failed: %s", name,
570 /* Make non-blocking. */
571 error = set_nonblocking(state->fd);
576 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
577 *netdev_devp = &netdev_dev->netdev_dev;
586 destroy_tap(struct netdev_dev_linux *netdev_dev)
588 struct tap_state *state = &netdev_dev->state.tap;
590 if (state->fd >= 0) {
595 /* Destroys the netdev device 'netdev_dev_'. */
597 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
599 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
600 const char *type = netdev_dev_get_type(netdev_dev_);
602 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
603 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
606 if (!strcmp(type, "system")) {
607 cache_notifier_refcount--;
609 if (!cache_notifier_refcount) {
610 rtnetlink_notifier_unregister(&netdev_linux_cache_notifier);
612 } else if (!strcmp(type, "tap")) {
613 destroy_tap(netdev_dev);
620 netdev_linux_open(struct netdev_dev *netdev_dev_, int ethertype,
621 struct netdev **netdevp)
623 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
624 struct netdev_linux *netdev;
625 enum netdev_flags flags;
628 /* Allocate network device. */
629 netdev = xzalloc(sizeof *netdev);
631 netdev_init(&netdev->netdev, netdev_dev_);
633 /* Verify that the device really exists, by attempting to read its flags.
634 * (The flags might be cached, in which case this won't actually do an
637 * Don't do this for "internal" netdevs, though, because those have to be
638 * created as netdev objects before they exist in the kernel, because
639 * creating them in the kernel happens by passing a netdev object to
640 * dpif_port_add(). */
641 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
642 error = netdev_get_flags(&netdev->netdev, &flags);
643 if (error == ENODEV) {
648 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
649 !netdev_dev->state.tap.opened) {
651 /* We assume that the first user of the tap device is the primary user
652 * and give them the tap FD. Subsequent users probably just expect
653 * this to be a system device so open it normally to avoid send/receive
654 * directions appearing to be reversed. */
655 netdev->fd = netdev_dev->state.tap.fd;
656 netdev_dev->state.tap.opened = true;
657 } else if (ethertype != NETDEV_ETH_TYPE_NONE) {
658 struct sockaddr_ll sll;
662 /* Create file descriptor. */
663 protocol = (ethertype == NETDEV_ETH_TYPE_ANY ? ETH_P_ALL
664 : ethertype == NETDEV_ETH_TYPE_802_2 ? ETH_P_802_2
666 netdev->fd = socket(PF_PACKET, SOCK_RAW, htons(protocol));
667 if (netdev->fd < 0) {
672 /* Set non-blocking mode. */
673 error = set_nonblocking(netdev->fd);
678 /* Get ethernet device index. */
679 error = get_ifindex(&netdev->netdev, &ifindex);
684 /* Bind to specific ethernet device. */
685 memset(&sll, 0, sizeof sll);
686 sll.sll_family = AF_PACKET;
687 sll.sll_ifindex = ifindex;
689 (struct sockaddr *) &sll, sizeof sll) < 0) {
691 VLOG_ERR("bind to %s failed: %s", netdev_dev_get_name(netdev_dev_),
696 /* Between the socket() and bind() calls above, the socket receives all
697 * packets of the requested type on all system interfaces. We do not
698 * want to receive that data, but there is no way to avoid it. So we
699 * must now drain out the receive queue. */
700 error = drain_rcvbuf(netdev->fd);
706 *netdevp = &netdev->netdev;
710 netdev_uninit(&netdev->netdev, true);
714 /* Closes and destroys 'netdev'. */
716 netdev_linux_close(struct netdev *netdev_)
718 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
/* Any non-negative value, including 0, is a valid file descriptor; the rest
 * of this file uses "fd < 0" to mean "no descriptor", so test ">= 0" here as
 * well.  "tap" devices are excluded because netdev_linux_open() may hand out
 * the descriptor stored in the device's shared tap state, which is dealt with
 * by destroy_tap() instead. */
720 if (netdev->fd >= 0 && strcmp(netdev_get_type(netdev_), "tap")) {
726 /* Initializes 'svec' with a list of the names of all known network devices. */
728 netdev_linux_enumerate(struct svec *svec)
730 struct if_nameindex *names;
732 names = if_nameindex();
736 for (i = 0; names[i].if_name != NULL; i++) {
737 svec_add(svec, names[i].if_name);
739 if_freenameindex(names);
742 VLOG_WARN("could not obtain list of network device names: %s",
/* Receives a packet from 'netdev_' by read()ing up to 'size' bytes from its
 * file descriptor into 'data'.  Failures other than EINTR and EAGAIN are
 * logged through the rate-limited logger. */
749 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
751 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
753 if (netdev->fd < 0) {
754 /* Device was opened with NETDEV_ETH_TYPE_NONE. */
759 ssize_t retval = read(netdev->fd, data, size);
762 } else if (errno != EINTR) {
763 if (errno != EAGAIN) {
/* Argument order must follow the format string: device name first, then
 * the error description. */
764 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
765 netdev_get_name(netdev_), strerror(errno));
772 /* Registers with the poll loop to wake up from the next call to poll_block()
773 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
775 netdev_linux_recv_wait(struct netdev *netdev_)
777 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
778 if (netdev->fd >= 0) {
779 poll_fd_wait(netdev->fd, POLLIN);
783 /* Discards all packets waiting to be received from 'netdev'. */
785 netdev_linux_drain(struct netdev *netdev_)
787 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
788 if (netdev->fd < 0) {
790 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
792 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
793 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
797 drain_fd(netdev->fd, ifr.ifr_qlen);
800 return drain_rcvbuf(netdev->fd);
804 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
805 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
806 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
807 * the packet is too big or too small to transmit on the device.
809 * The caller retains ownership of 'buffer' in all cases.
811 * The kernel maintains a packet transmission queue, so the caller is not
812 * expected to do additional queuing of packets. */
814 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
816 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
818 /* XXX should support sending even if 'ethertype' was NETDEV_ETH_TYPE_NONE.
820 if (netdev->fd < 0) {
825 ssize_t retval = write(netdev->fd, data, size);
827 /* The Linux AF_PACKET implementation never blocks waiting for room
828 * for packets, instead returning ENOBUFS. Translate this into
829 * EAGAIN for the caller. */
830 if (errno == ENOBUFS) {
832 } else if (errno == EINTR) {
834 } else if (errno != EAGAIN) {
835 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
836 netdev_get_name(netdev_), strerror(errno));
839 } else if (retval != size) {
840 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
841 "%zu) on %s", retval, size, netdev_get_name(netdev_));
849 /* Registers with the poll loop to wake up from the next call to poll_block()
850 * when the packet transmission queue has sufficient room to transmit a packet
851 * with netdev_send().
853 * The kernel maintains a packet transmission queue, so the client is not
854 * expected to do additional queuing of packets. Thus, this function is
855 * unlikely to ever be used. It is included for completeness. */
857 netdev_linux_send_wait(struct netdev *netdev_)
859 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
860 if (netdev->fd < 0) {
862 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
863 poll_fd_wait(netdev->fd, POLLOUT);
865 /* TAP device always accepts packets.*/
866 poll_immediate_wake();
870 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
871 * otherwise a positive errno value. */
873 netdev_linux_set_etheraddr(struct netdev *netdev_,
874 const uint8_t mac[ETH_ADDR_LEN])
876 struct netdev_dev_linux *netdev_dev =
877 netdev_dev_linux_cast(netdev_get_dev(netdev_));
880 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
881 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
882 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
884 netdev_dev->cache_valid |= VALID_ETHERADDR;
885 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
893 /* Copies 'netdev''s MAC address into the caller-provided buffer 'mac',
894 * looking it up with get_etheraddr() first if it is not already cached. */
896 netdev_linux_get_etheraddr(const struct netdev *netdev_,
897 uint8_t mac[ETH_ADDR_LEN])
899 struct netdev_dev_linux *netdev_dev =
900 netdev_dev_linux_cast(netdev_get_dev(netdev_));
901 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
902 int error = get_etheraddr(netdev_get_name(netdev_),
903 netdev_dev->etheraddr);
907 netdev_dev->cache_valid |= VALID_ETHERADDR;
909 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
913 /* Stores in '*mtup' the maximum size of transmitted (and received) packets
914 * on 'netdev', in bytes, not including the hardware header; thus, this is
915 * typically 1500 bytes for Ethernet devices. */
917 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
919 struct netdev_dev_linux *netdev_dev =
920 netdev_dev_linux_cast(netdev_get_dev(netdev_));
921 if (!(netdev_dev->cache_valid & VALID_MTU)) {
925 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
926 SIOCGIFMTU, "SIOCGIFMTU");
930 netdev_dev->mtu = ifr.ifr_mtu;
931 netdev_dev->cache_valid |= VALID_MTU;
933 *mtup = netdev_dev->mtu;
937 /* Returns the ifindex of 'netdev', if successful, as a positive number.
938 * On failure, returns a negative errno value. */
940 netdev_linux_get_ifindex(const struct netdev *netdev)
944 error = get_ifindex(netdev, &ifindex);
945 return error ? -error : ifindex;
949 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
951 struct netdev_dev_linux *netdev_dev =
952 netdev_dev_linux_cast(netdev_get_dev(netdev_));
957 if (!(netdev_dev->cache_valid & VALID_CARRIER)) {
961 fn = xasprintf("/sys/class/net/%s/carrier",
962 netdev_get_name(netdev_));
963 fd = open(fn, O_RDONLY);
966 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
970 retval = read(fd, line, sizeof line);
973 if (error == EINVAL) {
974 /* This is the normal return value when we try to check carrier
975 * if the network device is not up. */
977 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
980 } else if (retval == 0) {
982 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
986 if (line[0] != '0' && line[0] != '1') {
988 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)",
992 netdev_dev->carrier = line[0] != '0';
993 netdev_dev->cache_valid |= VALID_CARRIER;
995 *carrier = netdev_dev->carrier;
1006 /* Check whether we can use RTM_GETLINK to get network device statistics.
1007 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1010 check_for_working_netlink_stats(void)
1012 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1013 * preferable, so if that works, we'll use it. */
1014 int ifindex = do_get_ifindex("lo");
1016 VLOG_WARN("failed to get ifindex for lo, "
1017 "obtaining netdev stats from proc");
1020 struct netdev_stats stats;
1021 int error = get_stats_via_netlink(ifindex, &stats);
1023 VLOG_DBG("obtaining netdev stats via rtnetlink");
1026 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1027 "via proc (you are probably running a pre-2.6.19 "
1028 "kernel)", strerror(error));
1034 /* Brings the 'is_internal' and 'is_tap' members of 'netdev_dev' up-to-date. */
1036 netdev_linux_update_is_pseudo(struct netdev_dev_linux *netdev_dev)
1038 if (!(netdev_dev->cache_valid & VALID_IS_PSEUDO)) {
1039 const char *name = netdev_dev_get_name(&netdev_dev->netdev_dev);
1040 const char *type = netdev_dev_get_type(&netdev_dev->netdev_dev);
1042 netdev_dev->is_tap = !strcmp(type, "tap");
1043 netdev_dev->is_internal = false;
1044 if (!netdev_dev->is_tap) {
1045 struct ethtool_drvinfo drvinfo;
1048 memset(&drvinfo, 0, sizeof drvinfo);
1049 error = netdev_linux_do_ethtool(name,
1050 (struct ethtool_cmd *)&drvinfo,
1052 "ETHTOOL_GDRVINFO");
1054 if (!error && !strcmp(drvinfo.driver, "openvswitch")) {
1055 netdev_dev->is_internal = true;
1059 netdev_dev->cache_valid |= VALID_IS_PSEUDO;
1064 swap_uint64(uint64_t *a, uint64_t *b)
1071 /* Retrieves current device stats for 'netdev'. */
1073 netdev_linux_get_stats(const struct netdev *netdev_,
1074 struct netdev_stats *stats)
1076 struct netdev_dev_linux *netdev_dev =
1077 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1078 static int use_netlink_stats = -1;
1081 if (netdev_dev->have_vport_stats ||
1082 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1084 error = netdev_vport_get_stats(netdev_, stats);
1085 netdev_dev->have_vport_stats = !error;
1086 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
1089 if (!netdev_dev->have_vport_stats) {
1090 if (use_netlink_stats < 0) {
1091 use_netlink_stats = check_for_working_netlink_stats();
1093 if (use_netlink_stats) {
1096 error = get_ifindex(netdev_, &ifindex);
1098 error = get_stats_via_netlink(ifindex, stats);
1101 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1105 /* If this port is an internal port then the transmit and receive stats
1106 * will appear to be swapped relative to the other ports since we are the
1107 * one sending the data, not a remote computer. For consistency, we swap
1108 * them back here. This does not apply if we are getting stats from the
1109 * vport layer because it always tracks stats from the perspective of the
1111 netdev_linux_update_is_pseudo(netdev_dev);
1112 if (!error && !netdev_dev->have_vport_stats &&
1113 (netdev_dev->is_internal || netdev_dev->is_tap)) {
1114 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1115 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1116 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1117 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1118 stats->rx_length_errors = 0;
1119 stats->rx_over_errors = 0;
1120 stats->rx_crc_errors = 0;
1121 stats->rx_frame_errors = 0;
1122 stats->rx_fifo_errors = 0;
1123 stats->rx_missed_errors = 0;
1124 stats->tx_aborted_errors = 0;
1125 stats->tx_carrier_errors = 0;
1126 stats->tx_fifo_errors = 0;
1127 stats->tx_heartbeat_errors = 0;
1128 stats->tx_window_errors = 0;
1134 /* Stores the features supported by 'netdev' into each of '*current',
1135 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1136 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
1137 * successful, otherwise a positive errno value. */
1139 netdev_linux_get_features(struct netdev *netdev,
1140 uint32_t *current, uint32_t *advertised,
1141 uint32_t *supported, uint32_t *peer)
1143 struct ethtool_cmd ecmd;
1146 memset(&ecmd, 0, sizeof ecmd);
1147 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1148 ETHTOOL_GSET, "ETHTOOL_GSET");
1153 /* Supported features. */
1155 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1156 *supported |= OFPPF_10MB_HD;
1158 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1159 *supported |= OFPPF_10MB_FD;
1161 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1162 *supported |= OFPPF_100MB_HD;
1164 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1165 *supported |= OFPPF_100MB_FD;
1167 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1168 *supported |= OFPPF_1GB_HD;
1170 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1171 *supported |= OFPPF_1GB_FD;
1173 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1174 *supported |= OFPPF_10GB_FD;
1176 if (ecmd.supported & SUPPORTED_TP) {
1177 *supported |= OFPPF_COPPER;
1179 if (ecmd.supported & SUPPORTED_FIBRE) {
1180 *supported |= OFPPF_FIBER;
1182 if (ecmd.supported & SUPPORTED_Autoneg) {
1183 *supported |= OFPPF_AUTONEG;
1185 if (ecmd.supported & SUPPORTED_Pause) {
1186 *supported |= OFPPF_PAUSE;
1188 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1189 *supported |= OFPPF_PAUSE_ASYM;
1192 /* Advertised features. */
1194 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1195 *advertised |= OFPPF_10MB_HD;
1197 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1198 *advertised |= OFPPF_10MB_FD;
1200 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1201 *advertised |= OFPPF_100MB_HD;
1203 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1204 *advertised |= OFPPF_100MB_FD;
1206 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1207 *advertised |= OFPPF_1GB_HD;
1209 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1210 *advertised |= OFPPF_1GB_FD;
1212 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1213 *advertised |= OFPPF_10GB_FD;
1215 if (ecmd.advertising & ADVERTISED_TP) {
1216 *advertised |= OFPPF_COPPER;
1218 if (ecmd.advertising & ADVERTISED_FIBRE) {
1219 *advertised |= OFPPF_FIBER;
1221 if (ecmd.advertising & ADVERTISED_Autoneg) {
1222 *advertised |= OFPPF_AUTONEG;
1224 if (ecmd.advertising & ADVERTISED_Pause) {
1225 *advertised |= OFPPF_PAUSE;
1227 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1228 *advertised |= OFPPF_PAUSE_ASYM;
1231 /* Current settings. */
1232 if (ecmd.speed == SPEED_10) {
1233 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1234 } else if (ecmd.speed == SPEED_100) {
1235 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1236 } else if (ecmd.speed == SPEED_1000) {
1237 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1238 } else if (ecmd.speed == SPEED_10000) {
1239 *current = OFPPF_10GB_FD;
1244 if (ecmd.port == PORT_TP) {
1245 *current |= OFPPF_COPPER;
1246 } else if (ecmd.port == PORT_FIBRE) {
1247 *current |= OFPPF_FIBER;
1251 *current |= OFPPF_AUTONEG;
1254 /* Peer advertisements. */
1255 *peer = 0; /* XXX */
1260 /* Set the features advertised by 'netdev' to 'advertise'. */
1262 netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1264 struct ethtool_cmd ecmd;
1267 memset(&ecmd, 0, sizeof ecmd);
1268 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1269 ETHTOOL_GSET, "ETHTOOL_GSET");
1274 ecmd.advertising = 0;
1275 if (advertise & OFPPF_10MB_HD) {
1276 ecmd.advertising |= ADVERTISED_10baseT_Half;
1278 if (advertise & OFPPF_10MB_FD) {
1279 ecmd.advertising |= ADVERTISED_10baseT_Full;
1281 if (advertise & OFPPF_100MB_HD) {
1282 ecmd.advertising |= ADVERTISED_100baseT_Half;
1284 if (advertise & OFPPF_100MB_FD) {
1285 ecmd.advertising |= ADVERTISED_100baseT_Full;
1287 if (advertise & OFPPF_1GB_HD) {
1288 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1290 if (advertise & OFPPF_1GB_FD) {
1291 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1293 if (advertise & OFPPF_10GB_FD) {
1294 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1296 if (advertise & OFPPF_COPPER) {
1297 ecmd.advertising |= ADVERTISED_TP;
1299 if (advertise & OFPPF_FIBER) {
1300 ecmd.advertising |= ADVERTISED_FIBRE;
1302 if (advertise & OFPPF_AUTONEG) {
1303 ecmd.advertising |= ADVERTISED_Autoneg;
1305 if (advertise & OFPPF_PAUSE) {
1306 ecmd.advertising |= ADVERTISED_Pause;
1308 if (advertise & OFPPF_PAUSE_ASYM) {
1309 ecmd.advertising |= ADVERTISED_Asym_Pause;
1311 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1312 ETHTOOL_SSET, "ETHTOOL_SSET");
1315 /* If 'netdev_name' is the name of a VLAN network device (e.g. one created with
1316 * vconfig(8)), sets '*vlan_vid' to the VLAN VID associated with that device
1317 * and returns 0. Otherwise returns a errno value (specifically ENOENT if
1318 * 'netdev_name' is the name of a network device that is not a VLAN device) and
1319 * sets '*vlan_vid' to -1. */
1321 netdev_linux_get_vlan_vid(const struct netdev *netdev, int *vlan_vid)
1323 const char *netdev_name = netdev_get_name(netdev);
1324 struct ds line = DS_EMPTY_INITIALIZER;
1325 FILE *stream = NULL;
1329 COVERAGE_INC(netdev_get_vlan_vid);
1330 fn = xasprintf("/proc/net/vlan/%s", netdev_name);
1331 stream = fopen(fn, "r");
1337 if (ds_get_line(&line, stream)) {
1338 if (ferror(stream)) {
1340 VLOG_ERR_RL(&rl, "error reading \"%s\": %s", fn, strerror(errno));
1343 VLOG_ERR_RL(&rl, "unexpected end of file reading \"%s\"", fn);
1348 if (!sscanf(ds_cstr(&line), "%*s VID: %d", vlan_vid)) {
1350 VLOG_ERR_RL(&rl, "parse error reading \"%s\" line 1: \"%s\"",
1351 fn, ds_cstr(&line));
/* Shell command templates used to install ingress policing.
 *
 * POLICE_ADD_CMD takes the device name.  POLICE_CONFIG_CMD takes the device
 * name, the rate in kbit/s, and the burst size; both numbers are passed as
 * unsigned 32-bit values, hence "%u". */
#define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress"
#define POLICE_CONFIG_CMD "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 u32 match ip src 0.0.0.0/0 police rate %ukbit burst %uk mtu 65535 drop flowid :1"
1372 /* Remove ingress policing from 'netdev'. Returns 0 if successful, otherwise a
1373 * positive errno value.
1375 * This function is equivalent to running
1376 * /sbin/tc qdisc del dev %s handle ffff: ingress
1377 * but it is much, much faster.
1380 netdev_linux_remove_policing(struct netdev *netdev)
1382 struct netdev_dev_linux *netdev_dev =
1383 netdev_dev_linux_cast(netdev_get_dev(netdev));
1384 const char *netdev_name = netdev_get_name(netdev);
1386 struct ofpbuf request;
1387 struct tcmsg *tcmsg;
1390 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
1394 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
1395 tcmsg->tcm_parent = TC_H_INGRESS;
1396 nl_msg_put_string(&request, TCA_KIND, "ingress");
1397 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
1399 error = tc_transact(&request, NULL);
1400 if (error && error != ENOENT && error != EINVAL) {
1401 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1402 netdev_name, strerror(error));
1406 netdev_dev->kbits_rate = 0;
1407 netdev_dev->kbits_burst = 0;
1408 netdev_dev->cache_valid |= VALID_POLICING;
1412 /* Attempts to set input rate limiting (policing) policy. */
1414 netdev_linux_set_policing(struct netdev *netdev,
1415 uint32_t kbits_rate, uint32_t kbits_burst)
1417 struct netdev_dev_linux *netdev_dev =
1418 netdev_dev_linux_cast(netdev_get_dev(netdev));
1419 const char *netdev_name = netdev_get_name(netdev);
1422 COVERAGE_INC(netdev_set_policing);
1424 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1425 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1426 : kbits_burst); /* Stick with user-specified value. */
1428 if (netdev_dev->cache_valid & VALID_POLICING
1429 && netdev_dev->kbits_rate == kbits_rate
1430 && netdev_dev->kbits_burst == kbits_burst) {
1431 /* Assume that settings haven't changed since we last set them. */
1435 netdev_linux_remove_policing(netdev);
1437 snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
1438 if (system(command) != 0) {
1439 VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
1443 snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
1444 kbits_rate, kbits_burst);
1445 if (system(command) != 0) {
1446 VLOG_WARN_RL(&rl, "%s: problem configuring policing",
1451 netdev_dev->kbits_rate = kbits_rate;
1452 netdev_dev->kbits_burst = kbits_burst;
1453 netdev_dev->cache_valid |= VALID_POLICING;
1460 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1463 const struct tc_ops **opsp;
1465 for (opsp = tcs; *opsp != NULL; opsp++) {
1466 const struct tc_ops *ops = *opsp;
1467 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1468 svec_add(types, ops->ovs_name);
1474 static const struct tc_ops *
1475 tc_lookup_ovs_name(const char *name)
1477 const struct tc_ops **opsp;
1479 for (opsp = tcs; *opsp != NULL; opsp++) {
1480 const struct tc_ops *ops = *opsp;
1481 if (!strcmp(name, ops->ovs_name)) {
1488 static const struct tc_ops *
1489 tc_lookup_linux_name(const char *name)
1491 const struct tc_ops **opsp;
1493 for (opsp = tcs; *opsp != NULL; opsp++) {
1494 const struct tc_ops *ops = *opsp;
1495 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1502 static struct tc_queue *
1503 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1506 struct netdev_dev_linux *netdev_dev =
1507 netdev_dev_linux_cast(netdev_get_dev(netdev));
1508 struct tc_queue *queue;
1510 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1511 if (queue->queue_id == queue_id) {
/* Returns the queue numbered 'queue_id' on 'netdev', or NULL if there is none.
 * Convenience wrapper that computes the hash for tc_find_queue__(). */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    size_t hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, hash);
}
1525 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1527 struct netdev_qos_capabilities *caps)
1529 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1533 caps->n_queues = ops->n_queues;
1538 netdev_linux_get_qos(const struct netdev *netdev,
1539 const char **typep, struct shash *details)
1541 struct netdev_dev_linux *netdev_dev =
1542 netdev_dev_linux_cast(netdev_get_dev(netdev));
1545 error = tc_query_qdisc(netdev);
1550 *typep = netdev_dev->tc->ops->ovs_name;
1551 return (netdev_dev->tc->ops->qdisc_get
1552 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
1557 netdev_linux_set_qos(struct netdev *netdev,
1558 const char *type, const struct shash *details)
1560 struct netdev_dev_linux *netdev_dev =
1561 netdev_dev_linux_cast(netdev_get_dev(netdev));
1562 const struct tc_ops *new_ops;
1565 new_ops = tc_lookup_ovs_name(type);
1566 if (!new_ops || !new_ops->tc_install) {
1570 error = tc_query_qdisc(netdev);
1575 if (new_ops == netdev_dev->tc->ops) {
1576 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1578 /* Delete existing qdisc. */
1579 error = tc_del_qdisc(netdev);
1583 assert(netdev_dev->tc == NULL);
1585 /* Install new qdisc. */
1586 error = new_ops->tc_install(netdev, details);
1587 assert((error == 0) == (netdev_dev->tc != NULL));
1594 netdev_linux_get_queue(const struct netdev *netdev,
1595 unsigned int queue_id, struct shash *details)
1597 struct netdev_dev_linux *netdev_dev =
1598 netdev_dev_linux_cast(netdev_get_dev(netdev));
1601 error = tc_query_qdisc(netdev);
1605 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1607 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
1613 netdev_linux_set_queue(struct netdev *netdev,
1614 unsigned int queue_id, const struct shash *details)
1616 struct netdev_dev_linux *netdev_dev =
1617 netdev_dev_linux_cast(netdev_get_dev(netdev));
1620 error = tc_query_qdisc(netdev);
1623 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1624 || !netdev_dev->tc->ops->class_set) {
1628 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
1632 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1634 struct netdev_dev_linux *netdev_dev =
1635 netdev_dev_linux_cast(netdev_get_dev(netdev));
1638 error = tc_query_qdisc(netdev);
1641 } else if (!netdev_dev->tc->ops->class_delete) {
1644 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1646 ? netdev_dev->tc->ops->class_delete(netdev, queue)
1652 netdev_linux_get_queue_stats(const struct netdev *netdev,
1653 unsigned int queue_id,
1654 struct netdev_queue_stats *stats)
1656 struct netdev_dev_linux *netdev_dev =
1657 netdev_dev_linux_cast(netdev_get_dev(netdev));
1660 error = tc_query_qdisc(netdev);
1663 } else if (!netdev_dev->tc->ops->class_get_stats) {
1666 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1668 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
1674 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1676 struct ofpbuf request;
1677 struct tcmsg *tcmsg;
1679 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1683 tcmsg->tcm_parent = 0;
1684 nl_dump_start(dump, rtnl_sock, &request);
1685 ofpbuf_uninit(&request);
1690 netdev_linux_dump_queues(const struct netdev *netdev,
1691 netdev_dump_queues_cb *cb, void *aux)
1693 struct netdev_dev_linux *netdev_dev =
1694 netdev_dev_linux_cast(netdev_get_dev(netdev));
1695 struct tc_queue *queue;
1696 struct shash details;
1700 error = tc_query_qdisc(netdev);
1703 } else if (!netdev_dev->tc->ops->class_get) {
1708 shash_init(&details);
1709 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
1710 shash_clear(&details);
1712 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
1714 (*cb)(queue->queue_id, &details, aux);
1719 shash_destroy(&details);
1725 netdev_linux_dump_queue_stats(const struct netdev *netdev,
1726 netdev_dump_queue_stats_cb *cb, void *aux)
1728 struct netdev_dev_linux *netdev_dev =
1729 netdev_dev_linux_cast(netdev_get_dev(netdev));
1730 struct nl_dump dump;
1735 error = tc_query_qdisc(netdev);
1738 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1743 if (!start_queue_dump(netdev, &dump)) {
1746 while (nl_dump_next(&dump, &msg)) {
1747 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1753 error = nl_dump_done(&dump);
1754 return error ? error : last_error;
1758 netdev_linux_get_in4(const struct netdev *netdev_,
1759 struct in_addr *address, struct in_addr *netmask)
1761 struct netdev_dev_linux *netdev_dev =
1762 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1764 if (!(netdev_dev->cache_valid & VALID_IN4)) {
1767 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
1768 SIOCGIFADDR, "SIOCGIFADDR");
1773 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
1774 SIOCGIFNETMASK, "SIOCGIFNETMASK");
1779 netdev_dev->cache_valid |= VALID_IN4;
1781 *address = netdev_dev->address;
1782 *netmask = netdev_dev->netmask;
1783 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
1787 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
1788 struct in_addr netmask)
1790 struct netdev_dev_linux *netdev_dev =
1791 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1794 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
1796 netdev_dev->cache_valid |= VALID_IN4;
1797 netdev_dev->address = address;
1798 netdev_dev->netmask = netmask;
1799 if (address.s_addr != INADDR_ANY) {
1800 error = do_set_addr(netdev_, SIOCSIFNETMASK,
1801 "SIOCSIFNETMASK", netmask);
/* Parses 'line', one line in the format of /proc/net/if_inet6: 32 hex digits
 * of IPv6 address, four hex fields that are ignored, and an interface name.
 * Stores the address into '*in6' and the name (at most 16 characters plus a
 * null terminator) into 'ifname'.
 *
 * Returns true only if all 16 address bytes and the name were parsed. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *s6 = in6->s6_addr;
    int n_fields;

    /* X8 matches one address byte (two hex digits).  It is only meaningful
     * for this format string, so it is undefined again right after use. */
#define X8 "%2"SCNx8
    n_fields = sscanf(line,
                      " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                      "%*x %*x %*x %*x %16s\n",
                      &s6[0], &s6[1], &s6[2], &s6[3],
                      &s6[4], &s6[5], &s6[6], &s6[7],
                      &s6[8], &s6[9], &s6[10], &s6[11],
                      &s6[12], &s6[13], &s6[14], &s6[15],
                      ifname);
#undef X8

    /* 16 address bytes + 1 interface name. */
    return n_fields == 17;
}
1823 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
1824 * 'in6' is non-null) and returns true. Otherwise, returns false. */
1826 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
1828 struct netdev_dev_linux *netdev_dev =
1829 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1830 if (!(netdev_dev->cache_valid & VALID_IN6)) {
1834 netdev_dev->in6 = in6addr_any;
1836 file = fopen("/proc/net/if_inet6", "r");
1838 const char *name = netdev_get_name(netdev_);
1839 while (fgets(line, sizeof line, file)) {
1840 struct in6_addr in6_tmp;
1841 char ifname[16 + 1];
1842 if (parse_if_inet6_line(line, &in6_tmp, ifname)
1843 && !strcmp(name, ifname))
1845 netdev_dev->in6 = in6_tmp;
1851 netdev_dev->cache_valid |= VALID_IN6;
1853 *in6 = netdev_dev->in6;
/* Fills '*sa' with an AF_INET socket address for 'addr' with port 0.  All
 * bytes of '*sa' not covered by the address are zeroed. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    /* Build the address in a properly typed temporary first... */
    memset(&in4, 0, sizeof in4);
    in4.sin_port = 0;
    in4.sin_addr = addr;
    in4.sin_family = AF_INET;

    /* ...then copy it over the zeroed generic sockaddr. */
    memset(sa, 0, sizeof *sa);
    memcpy(sa, &in4, sizeof in4);
}
1871 do_set_addr(struct netdev *netdev,
1872 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
1875 strncpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
1876 make_in4_sockaddr(&ifr.ifr_addr, addr);
1878 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
1882 /* Adds 'router' as a default IP gateway. */
1884 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
1886 struct in_addr any = { INADDR_ANY };
1890 memset(&rt, 0, sizeof rt);
1891 make_in4_sockaddr(&rt.rt_dst, any);
1892 make_in4_sockaddr(&rt.rt_gateway, router);
1893 make_in4_sockaddr(&rt.rt_genmask, any);
1894 rt.rt_flags = RTF_UP | RTF_GATEWAY;
1895 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
1897 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
1903 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
1906 static const char fn[] = "/proc/net/route";
1911 *netdev_name = NULL;
1912 stream = fopen(fn, "r");
1913 if (stream == NULL) {
1914 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
1919 while (fgets(line, sizeof line, stream)) {
1922 uint32_t dest, gateway, mask;
1923 int refcnt, metric, mtu;
1924 unsigned int flags, use, window, irtt;
1927 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
1929 iface, &dest, &gateway, &flags, &refcnt,
1930 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
1932 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
1936 if (!(flags & RTF_UP)) {
1937 /* Skip routes that aren't up. */
1941 /* The output of 'dest', 'mask', and 'gateway' were given in
1942 * network byte order, so we don't need need any endian
1943 * conversions here. */
1944 if ((dest & mask) == (host->s_addr & mask)) {
1946 /* The host is directly reachable. */
1947 next_hop->s_addr = 0;
1949 /* To reach the host, we must go through a gateway. */
1950 next_hop->s_addr = gateway;
1952 *netdev_name = xstrdup(iface);
1963 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
1964 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
1965 * returns 0. Otherwise, it returns a positive errno value; in particular,
1966 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
1968 netdev_linux_arp_lookup(const struct netdev *netdev,
1969 uint32_t ip, uint8_t mac[ETH_ADDR_LEN])
1972 struct sockaddr_in sin;
1975 memset(&r, 0, sizeof r);
1976 sin.sin_family = AF_INET;
1977 sin.sin_addr.s_addr = ip;
1979 memcpy(&r.arp_pa, &sin, sizeof sin);
1980 r.arp_ha.sa_family = ARPHRD_ETHER;
1982 strncpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
1983 COVERAGE_INC(netdev_arp_lookup);
1984 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
1986 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
1987 } else if (retval != ENXIO) {
1988 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
1989 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
1995 nd_to_iff_flags(enum netdev_flags nd)
1998 if (nd & NETDEV_UP) {
2001 if (nd & NETDEV_PROMISC) {
2008 iff_to_nd_flags(int iff)
2010 enum netdev_flags nd = 0;
2014 if (iff & IFF_PROMISC) {
2015 nd |= NETDEV_PROMISC;
2021 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2022 enum netdev_flags on, enum netdev_flags *old_flagsp)
2024 int old_flags, new_flags;
2027 error = get_flags(netdev, &old_flags);
2029 *old_flagsp = iff_to_nd_flags(old_flags);
2030 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2031 if (new_flags != old_flags) {
2032 error = set_flags(netdev, new_flags);
2039 poll_notify(struct list *list)
2041 struct netdev_linux_notifier *notifier;
2042 LIST_FOR_EACH (notifier, node, list) {
2043 struct netdev_notifier *n = ¬ifier->notifier;
2049 netdev_linux_poll_cb(const struct rtnetlink_change *change,
2050 void *aux OVS_UNUSED)
2053 struct list *list = shash_find_data(&netdev_linux_notifiers,
2059 struct shash_node *node;
2060 SHASH_FOR_EACH (node, &netdev_linux_notifiers) {
2061 poll_notify(node->data);
2067 netdev_linux_poll_add(struct netdev *netdev,
2068 void (*cb)(struct netdev_notifier *), void *aux,
2069 struct netdev_notifier **notifierp)
2071 const char *netdev_name = netdev_get_name(netdev);
2072 struct netdev_linux_notifier *notifier;
2075 if (shash_is_empty(&netdev_linux_notifiers)) {
2076 int error = rtnetlink_notifier_register(&netdev_linux_poll_notifier,
2077 netdev_linux_poll_cb, NULL);
2083 list = shash_find_data(&netdev_linux_notifiers, netdev_name);
2085 list = xmalloc(sizeof *list);
2087 shash_add(&netdev_linux_notifiers, netdev_name, list);
2090 notifier = xmalloc(sizeof *notifier);
2091 netdev_notifier_init(¬ifier->notifier, netdev, cb, aux);
2092 list_push_back(list, ¬ifier->node);
2093 *notifierp = ¬ifier->notifier;
2098 netdev_linux_poll_remove(struct netdev_notifier *notifier_)
2100 struct netdev_linux_notifier *notifier =
2101 CONTAINER_OF(notifier_, struct netdev_linux_notifier, notifier);
2104 /* Remove 'notifier' from its list. */
2105 list = list_remove(¬ifier->node);
2106 if (list_is_empty(list)) {
2107 /* The list is now empty. Remove it from the hash and free it. */
2108 const char *netdev_name = netdev_get_name(notifier->notifier.netdev);
2109 shash_delete(&netdev_linux_notifiers,
2110 shash_find(&netdev_linux_notifiers, netdev_name));
2115 /* If that was the last notifier, unregister. */
2116 if (shash_is_empty(&netdev_linux_notifiers)) {
2117 rtnetlink_notifier_unregister(&netdev_linux_poll_notifier);
/* Expands to a positional initializer for a "struct netdev_class" that shares
 * the Linux implementation of everything except the four slots that differ
 * between device types: NAME (the type name), CREATE, ENUMERATE, and
 * SET_STATS.  The entries must stay in the member order of struct
 * netdev_class. */
#define NETDEV_LINUX_CLASS(NAME, CREATE, ENUMERATE, SET_STATS)  \
{                                                               \
    NAME,                                                       \
                                                                \
    netdev_linux_init,                                          \
    netdev_linux_run,                                           \
    netdev_linux_wait,                                          \
                                                                \
    CREATE,                                                     \
    netdev_linux_destroy,                                       \
    NULL,                       /* reconfigure */               \
                                                                \
    netdev_linux_open,                                          \
    netdev_linux_close,                                         \
                                                                \
    ENUMERATE,                                                  \
                                                                \
    netdev_linux_recv,                                          \
    netdev_linux_recv_wait,                                     \
    netdev_linux_drain,                                         \
                                                                \
    netdev_linux_send,                                          \
    netdev_linux_send_wait,                                     \
                                                                \
    netdev_linux_set_etheraddr,                                 \
    netdev_linux_get_etheraddr,                                 \
    netdev_linux_get_mtu,                                       \
    netdev_linux_get_ifindex,                                   \
    netdev_linux_get_carrier,                                   \
    netdev_linux_get_stats,                                     \
    SET_STATS,                                                  \
                                                                \
    netdev_linux_get_features,                                  \
    netdev_linux_set_advertisements,                            \
    netdev_linux_get_vlan_vid,                                  \
                                                                \
    netdev_linux_set_policing,                                  \
    netdev_linux_get_qos_types,                                 \
    netdev_linux_get_qos_capabilities,                          \
    netdev_linux_get_qos,                                       \
    netdev_linux_set_qos,                                       \
    netdev_linux_get_queue,                                     \
    netdev_linux_set_queue,                                     \
    netdev_linux_delete_queue,                                  \
    netdev_linux_get_queue_stats,                               \
    netdev_linux_dump_queues,                                   \
    netdev_linux_dump_queue_stats,                              \
                                                                \
    netdev_linux_get_in4,                                       \
    netdev_linux_set_in4,                                       \
    netdev_linux_get_in6,                                       \
    netdev_linux_add_router,                                    \
    netdev_linux_get_next_hop,                                  \
    netdev_linux_arp_lookup,                                    \
                                                                \
    netdev_linux_update_flags,                                  \
                                                                \
    netdev_linux_poll_add,                                      \
    netdev_linux_poll_remove                                    \
}
2182 const struct netdev_class netdev_linux_class =
2185 netdev_linux_create,
2186 netdev_linux_enumerate,
2187 NULL); /* set_stats */
2189 const struct netdev_class netdev_tap_class =
2192 netdev_linux_create_tap,
2193 NULL, /* enumerate */
2194 NULL); /* set_stats */
2196 const struct netdev_class netdev_internal_class =
2199 netdev_linux_create,
2200 NULL, /* enumerate */
2201 netdev_vport_set_stats);
2203 /* HTB traffic control class. */
2205 #define HTB_N_QUEUES 0xf000
2209 unsigned int max_rate; /* In bytes/s. */
2213 struct tc_queue tc_queue;
2214 unsigned int min_rate; /* In bytes/s. */
2215 unsigned int max_rate; /* In bytes/s. */
2216 unsigned int burst; /* In bytes. */
2217 unsigned int priority; /* Lower values are higher priorities. */
2221 htb_get__(const struct netdev *netdev)
2223 struct netdev_dev_linux *netdev_dev =
2224 netdev_dev_linux_cast(netdev_get_dev(netdev));
2225 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
2229 htb_install__(struct netdev *netdev, uint64_t max_rate)
2231 struct netdev_dev_linux *netdev_dev =
2232 netdev_dev_linux_cast(netdev_get_dev(netdev));
2235 htb = xmalloc(sizeof *htb);
2236 tc_init(&htb->tc, &tc_ops_htb);
2237 htb->max_rate = max_rate;
2239 netdev_dev->tc = &htb->tc;
2244 /* Create an HTB qdisc.
2246 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2248 htb_setup_qdisc__(struct netdev *netdev)
2251 struct tc_htb_glob opt;
2252 struct ofpbuf request;
2253 struct tcmsg *tcmsg;
2255 tc_del_qdisc(netdev);
2257 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2258 NLM_F_EXCL | NLM_F_CREATE, &request);
2262 tcmsg->tcm_handle = tc_make_handle(1, 0);
2263 tcmsg->tcm_parent = TC_H_ROOT;
2265 nl_msg_put_string(&request, TCA_KIND, "htb");
2267 memset(&opt, 0, sizeof opt);
2268 opt.rate2quantum = 10;
2272 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2273 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2274 nl_msg_end_nested(&request, opt_offset);
2276 return tc_transact(&request, NULL);
2279 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2280 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2282 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2283 unsigned int parent, struct htb_class *class)
2286 struct tc_htb_opt opt;
2287 struct ofpbuf request;
2288 struct tcmsg *tcmsg;
2292 netdev_get_mtu(netdev, &mtu);
2294 memset(&opt, 0, sizeof opt);
2295 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2296 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2297 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2298 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2299 opt.prio = class->priority;
2301 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2305 tcmsg->tcm_handle = handle;
2306 tcmsg->tcm_parent = parent;
2308 nl_msg_put_string(&request, TCA_KIND, "htb");
2309 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2310 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2311 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2312 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2313 nl_msg_end_nested(&request, opt_offset);
2315 error = tc_transact(&request, NULL);
2317 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2318 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2319 netdev_get_name(netdev),
2320 tc_get_major(handle), tc_get_minor(handle),
2321 tc_get_major(parent), tc_get_minor(parent),
2322 class->min_rate, class->max_rate,
2323 class->burst, class->priority, strerror(error));
2328 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2329 * description of them into 'details'. The description complies with the
2330 * specification given in the vswitch database documentation for linux-htb
2333 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2335 static const struct nl_policy tca_htb_policy[] = {
2336 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2337 .min_len = sizeof(struct tc_htb_opt) },
2340 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2341 const struct tc_htb_opt *htb;
2343 if (!nl_parse_nested(nl_options, tca_htb_policy,
2344 attrs, ARRAY_SIZE(tca_htb_policy))) {
2345 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2349 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2350 class->min_rate = htb->rate.rate;
2351 class->max_rate = htb->ceil.rate;
2352 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2353 class->priority = htb->prio;
2358 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2359 struct htb_class *options,
2360 struct netdev_queue_stats *stats)
2362 struct nlattr *nl_options;
2363 unsigned int handle;
2366 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2367 if (!error && queue_id) {
2368 unsigned int major = tc_get_major(handle);
2369 unsigned int minor = tc_get_minor(handle);
2370 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2371 *queue_id = minor - 1;
2376 if (!error && options) {
2377 error = htb_parse_tca_options__(nl_options, options);
2383 htb_parse_qdisc_details__(struct netdev *netdev,
2384 const struct shash *details, struct htb_class *hc)
2386 const char *max_rate_s;
2388 max_rate_s = shash_find_data(details, "max-rate");
2389 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2390 if (!hc->max_rate) {
2393 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2394 hc->max_rate = netdev_features_to_bps(current) / 8;
2396 hc->min_rate = hc->max_rate;
2402 htb_parse_class_details__(struct netdev *netdev,
2403 const struct shash *details, struct htb_class *hc)
2405 const struct htb *htb = htb_get__(netdev);
2406 const char *min_rate_s = shash_find_data(details, "min-rate");
2407 const char *max_rate_s = shash_find_data(details, "max-rate");
2408 const char *burst_s = shash_find_data(details, "burst");
2409 const char *priority_s = shash_find_data(details, "priority");
2412 /* min-rate. Don't allow a min-rate below 1500 bytes/s. */
2414 /* min-rate is required. */
2417 hc->min_rate = strtoull(min_rate_s, NULL, 10) / 8;
2418 hc->min_rate = MAX(hc->min_rate, 1500);
2419 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2422 hc->max_rate = (max_rate_s
2423 ? strtoull(max_rate_s, NULL, 10) / 8
2425 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2426 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2430 * According to hints in the documentation that I've read, it is important
2431 * that 'burst' be at least as big as the largest frame that might be
2432 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2433 * but having it a bit too small is a problem. Since netdev_get_mtu()
2434 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2435 * the MTU. We actually add 64, instead of 14, as a guard against
2436 * additional headers get tacked on somewhere that we're not aware of. */
2437 netdev_get_mtu(netdev, &mtu);
2438 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2439 hc->burst = MAX(hc->burst, mtu + 64);
2442 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for class 'handle' under 'parent' on 'netdev' and parses
 * the reply into 'options' and 'stats' (each passed straight through to
 * htb_parse_tcmsg__()).  Returns 0 on success, otherwise the error from the
 * query or the parser. */
static int
htb_query_class__(const struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error = tc_query_class(netdev, handle, parent, &reply);

    if (error) {
        return error;
    }

    error = htb_parse_tcmsg__(reply, NULL, options, stats);
    ofpbuf_delete(reply);
    return error;
}
2464 htb_tc_install(struct netdev *netdev, const struct shash *details)
2468 error = htb_setup_qdisc__(netdev);
2470 struct htb_class hc;
2472 htb_parse_qdisc_details__(netdev, details, &hc);
2473 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2474 tc_make_handle(1, 0), &hc);
2476 htb_install__(netdev, hc.max_rate);
2482 static struct htb_class *
2483 htb_class_cast__(const struct tc_queue *queue)
2485 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2489 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2490 const struct htb_class *hc)
2492 struct htb *htb = htb_get__(netdev);
2493 size_t hash = hash_int(queue_id, 0);
2494 struct tc_queue *queue;
2495 struct htb_class *hcp;
2497 queue = tc_find_queue__(netdev, queue_id, hash);
2499 hcp = htb_class_cast__(queue);
2501 hcp = xmalloc(sizeof *hcp);
2502 queue = &hcp->tc_queue;
2503 queue->queue_id = queue_id;
2504 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2507 hcp->min_rate = hc->min_rate;
2508 hcp->max_rate = hc->max_rate;
2509 hcp->burst = hc->burst;
2510 hcp->priority = hc->priority;
/* Builds the local HTB state for 'netdev' from what the kernel reports:
 * class 1:fffe is queried for the qdisc-level settings, htb_install__()
 * records its max rate, and then every class in the queue dump that parses
 * successfully is cached with htb_update_queue__().  'nlmsg' is unused. */
2514 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2517 struct nl_dump dump;
2518 struct htb_class hc;
2521 /* Get qdisc options. */
/* NOTE(review): the result of htb_query_class__() is not checked on this
 * line; confirm that 'hc.max_rate' is initialized when the query fails. */
2523 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2524 htb = htb_install__(netdev, hc.max_rate);
/* Walk the class dump; 'hc' is reused as scratch space for each class. */
2527 if (!start_queue_dump(netdev, &dump)) {
2530 while (nl_dump_next(&dump, &msg)) {
2531 unsigned int queue_id;
2533 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2534 htb_update_queue__(netdev, queue_id, &hc);
2537 nl_dump_done(&dump);
/* Tears down the HTB state embedded around 'tc': each cached class is removed
 * from 'htb->tc.queues'.  The _SAFE iterator is used because entries are
 * removed while iterating. */
2543 htb_tc_destroy(struct tc *tc)
2545 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2546 struct htb_class *hc, *next;
2548 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2549 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Adds the qdisc-level "max-rate" for 'netdev' to 'details', formatted as a
 * decimal string equal to 8 times the cached 'htb->max_rate' (presumably a
 * bytes-to-bits conversion). */
2557 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2559 const struct htb *htb = htb_get__(netdev);
2560 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
/* Reconfigures the HTB qdisc-level settings of 'netdev' from 'details': the
 * details are parsed into 'hc', class 1:fffe under 1:0 is set up with them,
 * and the cached 'max_rate' is updated from 'hc'. */
2565 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2567 struct htb_class hc;
2570 htb_parse_qdisc_details__(netdev, details, &hc);
2571 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2572 tc_make_handle(1, 0), &hc);
2574 htb_get__(netdev)->max_rate = hc.max_rate;
/* Reports the cached settings of HTB 'queue' in 'details' as decimal strings.
 * Rates and burst are multiplied by 8 before formatting.  "max-rate" is only
 * added when it differs from "min-rate".  'netdev' is unused. */
2580 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2581 const struct tc_queue *queue, struct shash *details)
2583 const struct htb_class *hc = htb_class_cast__(queue);
2585 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2586 if (hc->min_rate != hc->max_rate) {
2587 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2589 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2591 shash_add(details, "priority", xasprintf("%u", hc->priority));
/* Configures HTB queue 'queue_id' on 'netdev' from 'details'.  The details
 * are parsed into 'hc', the kernel class 1:('queue_id' + 1) is set up under
 * parent 1:fffe, and the local cache is updated with htb_update_queue__(). */
2597 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2598 const struct shash *details)
2600 struct htb_class hc;
2603 error = htb_parse_class_details__(netdev, details, &hc);
2608 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2609 tc_make_handle(1, 0xfffe), &hc);
2614 htb_update_queue__(netdev, queue_id, &hc);
/* Deletes HTB 'queue' from 'netdev': asks the kernel to delete class
 * 1:('queue_id' + 1) and removes the cached entry from 'htb->tc.queues'. */
2619 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2621 struct htb_class *hc = htb_class_cast__(queue);
2622 struct htb *htb = htb_get__(netdev);
2625 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2627 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches statistics for HTB 'queue' on 'netdev' into 'stats' by querying
 * class 1:('queue_id' + 1) under parent 1:fffe.  Class options are not
 * requested (NULL). */
2634 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2635 struct netdev_queue_stats *stats)
2637 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2638 tc_make_handle(1, 0xfffe), NULL, stats);
/* Parses the class described by 'nlmsg' and, if its handle is 1:N with
 * 1 <= N <= HTB_N_QUEUES, invokes 'cb' with queue ID N - 1, the parsed
 * statistics, and 'aux'.  'netdev' is unused. */
2642 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2643 const struct ofpbuf *nlmsg,
2644 netdev_dump_queue_stats_cb *cb, void *aux)
2646 struct netdev_queue_stats stats;
2647 unsigned int handle, major, minor;
2650 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2655 major = tc_get_major(handle);
2656 minor = tc_get_minor(handle);
/* Minor numbers are queue IDs offset by one; other handles are ignored. */
2657 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2658 (*cb)(minor - 1, &stats, aux);
/* Operations table for the HTB implementation: Linux qdisc kind "htb",
 * exposed under the OVS name "linux-htb", with HTB_N_QUEUES queues. */
2663 static const struct tc_ops tc_ops_htb = {
2664 "htb", /* linux_name */
2665 "linux-htb", /* ovs_name */
2666 HTB_N_QUEUES, /* n_queues */
2675 htb_class_get_stats, /* class_get_stats */
2676 htb_class_dump_stats /* class_dump_stats */
2679 /* "linux-hfsc" traffic control class. */
/* Largest class minor number that the HFSC code treats as a queue (see the
 * 'minor <= HFSC_N_QUEUES' checks); also the 'n_queues' value in
 * tc_ops_hfsc. */
2681 #define HFSC_N_QUEUES 0xf000
/* Embedded generic queue; hfsc_class_cast__() recovers the enclosing
 * struct hfsc_class from a pointer to this member. */
2689 struct tc_queue tc_queue;
/* Returns the struct hfsc that contains the 'tc' currently attached to
 * 'netdev''s netdev_dev_linux.
 *
 * NOTE(review): presumably only valid while the device's 'tc' was installed
 * by hfsc_install__() -- confirm at call sites. */
2694 static struct hfsc *
2695 hfsc_get__(const struct netdev *netdev)
2697 struct netdev_dev_linux *netdev_dev;
2698 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2699 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
/* Returns the struct hfsc_class that contains 'queue' as its 'tc_queue'
 * member. */
2702 static struct hfsc_class *
2703 hfsc_class_cast__(const struct tc_queue *queue)
2705 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates a new struct hfsc for 'netdev', initializes its embedded 'tc'
 * with tc_ops_hfsc, records 'max_rate' in it, and attaches it to the device
 * as 'netdev_dev->tc'. */
2708 static struct hfsc *
2709 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2711 struct netdev_dev_linux * netdev_dev;
2714 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2715 hfsc = xmalloc(sizeof *hfsc);
2716 tc_init(&hfsc->tc, &tc_ops_hfsc);
2717 hfsc->max_rate = max_rate;
2718 netdev_dev->tc = &hfsc->tc;
/* Stores the rates in 'hc' into the hfsc_class cached for 'queue_id' on
 * 'netdev'.
 *
 * The queue is looked up by ID and hash.  The code below also allocates a new
 * hfsc_class and inserts it into 'hfsc->tc.queues'; presumably that happens
 * only when the lookup finds nothing (the branch structure is not visible
 * here -- TODO confirm). */
2724 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2725 const struct hfsc_class *hc)
2729 struct hfsc_class *hcp;
2730 struct tc_queue *queue;
2732 hfsc = hfsc_get__(netdev);
2733 hash = hash_int(queue_id, 0);
2735 queue = tc_find_queue__(netdev, queue_id, hash);
2737 hcp = hfsc_class_cast__(queue);
2739 hcp = xmalloc(sizeof *hcp);
2740 queue = &hcp->tc_queue;
2741 queue->queue_id = queue_id;
2742 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
/* Only the two rates are cached for an HFSC class. */
2745 hcp->min_rate = hc->min_rate;
2746 hcp->max_rate = hc->max_rate;
/* Parses the nested HFSC class options in 'nl_options' into 'class'.
 *
 * The RSC, FSC and USC service curves are read from the attributes.  Only a
 * restricted form is accepted: every curve must have m1 == 0 and d == 0, the
 * RSC and FSC curves must have the same m2, and the RSC m2 must not exceed
 * the USC m2.  Each rejected case logs a rate-limited warning.  On success
 * 'class->min_rate' is the FSC m2 and 'class->max_rate' is the USC m2. */
2750 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2752 const struct tc_service_curve *rsc, *fsc, *usc;
/* Each policy entry requires at least a full struct tc_service_curve. */
2753 static const struct nl_policy tca_hfsc_policy[] = {
2755 .type = NL_A_UNSPEC,
2757 .min_len = sizeof(struct tc_service_curve),
2760 .type = NL_A_UNSPEC,
2762 .min_len = sizeof(struct tc_service_curve),
2765 .type = NL_A_UNSPEC,
2767 .min_len = sizeof(struct tc_service_curve),
2770 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
2772 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
2773 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
2774 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
2778 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
2779 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
2780 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
/* Reject any curve with a nonzero m1 or d. */
2782 if (rsc->m1 != 0 || rsc->d != 0 ||
2783 fsc->m1 != 0 || fsc->d != 0 ||
2784 usc->m1 != 0 || usc->d != 0) {
2785 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2786 "Non-linear service curves are not supported.");
/* Reject a real-time curve that differs from the link-share curve. */
2790 if (rsc->m2 != fsc->m2) {
2791 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2792 "Real-time service curves are not supported ");
/* Reject a minimum rate above the upper limit. */
2796 if (rsc->m2 > usc->m2) {
2797 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2798 "Min-rate service curve is greater than "
2799 "the max-rate service curve.");
2803 class->min_rate = fsc->m2;
2804 class->max_rate = usc->m2;
/* Parses the class message 'tcmsg' with tc_parse_class(), which yields the
 * class handle, the nested options, and 'stats'.  A handle 1:N with
 * 1 <= N <= HFSC_N_QUEUES is translated to queue ID N - 1 in '*queue_id';
 * the nested options are parsed into 'options' with
 * hfsc_parse_tca_options__().
 *
 * NOTE(review): callers pass NULL for 'queue_id' and for 'options'; the
 * guards for those cases are presumably on lines not visible here. */
2809 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2810 struct hfsc_class *options,
2811 struct netdev_queue_stats *stats)
2814 unsigned int handle;
2815 struct nlattr *nl_options;
2817 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2823 unsigned int major, minor;
2825 major = tc_get_major(handle);
2826 minor = tc_get_minor(handle);
2827 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
2828 *queue_id = minor - 1;
2835 error = hfsc_parse_tca_options__(nl_options, options);
/* Queries the kernel for class 'handle' with parent 'parent' on 'netdev'
 * using tc_query_class(), then parses the reply with hfsc_parse_tcmsg__()
 * into 'options' and 'stats'.  The reply buffer is deleted after parsing. */
2842 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
2843 unsigned int parent, struct hfsc_class *options,
2844 struct netdev_queue_stats *stats)
2847 struct ofpbuf *reply;
2849 error = tc_query_class(netdev, handle, parent, &reply);
/* The queue ID output is not requested here (NULL). */
2854 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
2855 ofpbuf_delete(reply);
2860 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
2861 struct hfsc_class *class)
2864 const char *max_rate_s;
2866 max_rate_s = shash_find_data(details, "max-rate");
2867 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2872 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2873 max_rate = netdev_features_to_bps(current) / 8;
2876 class->min_rate = max_rate;
2877 class->max_rate = max_rate;
/* Derives the parameters for one HFSC queue on 'netdev' from 'details' and
 * stores them in 'class'.
 *
 * "min-rate" and "max-rate" are read as decimal numbers and divided by 8.
 * The minimum rate is raised to at least 1500 and capped at the qdisc's
 * 'hfsc->max_rate'; the maximum rate is raised to at least the minimum rate
 * and capped at 'hfsc->max_rate'.
 *
 * NOTE(review): the strtoull() results are stored in uint32_t variables, so
 * very large inputs are silently truncated before clamping. */
2881 hfsc_parse_class_details__(struct netdev *netdev,
2882 const struct shash *details,
2883 struct hfsc_class * class)
2885 const struct hfsc *hfsc;
2886 uint32_t min_rate, max_rate;
2887 const char *min_rate_s, *max_rate_s;
2889 hfsc = hfsc_get__(netdev);
2890 min_rate_s = shash_find_data(details, "min-rate");
2891 max_rate_s = shash_find_data(details, "max-rate");
2897 min_rate = strtoull(min_rate_s, NULL, 10) / 8;
2898 min_rate = MAX(min_rate, 1500);
2899 min_rate = MIN(min_rate, hfsc->max_rate);
2901 max_rate = (max_rate_s
2902 ? strtoull(max_rate_s, NULL, 10) / 8
2904 max_rate = MAX(max_rate, min_rate);
2905 max_rate = MIN(max_rate, hfsc->max_rate);
2907 class->min_rate = min_rate;
2908 class->max_rate = max_rate;
2913 /* Create an HFSC qdisc.
2915 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
2917 hfsc_setup_qdisc__(struct netdev * netdev)
2919 struct tcmsg *tcmsg;
2920 struct ofpbuf request;
2921 struct tc_hfsc_qopt opt;
/* Delete the current root qdisc before creating the new one. */
2923 tc_del_qdisc(netdev);
/* NLM_F_EXCL | NLM_F_CREATE: create the qdisc, failing if one exists. */
2925 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2926 NLM_F_EXCL | NLM_F_CREATE, &request);
/* The new qdisc gets handle 1:0 and is attached at the root. */
2932 tcmsg->tcm_handle = tc_make_handle(1, 0);
2933 tcmsg->tcm_parent = TC_H_ROOT;
2935 memset(&opt, 0, sizeof opt);
2938 nl_msg_put_string(&request, TCA_KIND, "hfsc");
2939 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
/* No reply is requested; only the status is returned. */
2941 return tc_transact(&request, NULL);
2944 /* Create an HFSC class.
2946 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
2947 * sc rate <min_rate> ul rate <max_rate>" */
2949 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
2950 unsigned int parent, struct hfsc_class *class)
2954 struct tcmsg *tcmsg;
2955 struct ofpbuf request;
2956 struct tc_service_curve min, max;
2958 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2964 tcmsg->tcm_handle = handle;
2965 tcmsg->tcm_parent = parent;
/* 'min' carries the minimum rate and 'max' the maximum rate as m2. */
2969 min.m2 = class->min_rate;
2973 max.m2 = class->max_rate;
2975 nl_msg_put_string(&request, TCA_KIND, "hfsc");
/* The 'min' curve is sent as both RSC and FSC; 'max' is sent as USC. */
2976 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2977 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
2978 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
2979 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
2980 nl_msg_end_nested(&request, opt_offset);
2982 error = tc_transact(&request, NULL);
/* Rate-limited warning that names the device, handles and rates. */
2984 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2985 "min-rate %ubps, max-rate %ubps (%s)",
2986 netdev_get_name(netdev),
2987 tc_get_major(handle), tc_get_minor(handle),
2988 tc_get_major(parent), tc_get_minor(parent),
2989 class->min_rate, class->max_rate, strerror(error));
/* Installs an HFSC qdisc on 'netdev'.  After hfsc_setup_qdisc__(), the
 * qdisc-level 'details' are parsed into 'class', which is used to set up
 * class 1:fffe under parent 1:0.  hfsc_install__() then records
 * 'class.max_rate' for the device. */
2996 hfsc_tc_install(struct netdev *netdev, const struct shash *details)
2999 struct hfsc_class class;
3001 error = hfsc_setup_qdisc__(netdev);
3007 hfsc_parse_qdisc_details__(netdev, details, &class);
3008 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3009 tc_make_handle(1, 0), &class);
3015 hfsc_install__(netdev, class.max_rate);
/* Builds the local HFSC state for 'netdev' from what the kernel reports:
 * class 1:fffe is queried for the qdisc-level settings, hfsc_install__()
 * records its max rate, and then every class in the queue dump that parses
 * successfully is cached with hfsc_update_queue__().  'nlmsg' is unused. */
3020 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3024 struct nl_dump dump;
3025 struct hfsc_class hc;
/* NOTE(review): the result of hfsc_query_class__() is not checked on this
 * line; confirm that 'hc.max_rate' is initialized when the query fails. */
3028 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3029 hfsc = hfsc_install__(netdev, hc.max_rate);
/* Walk the class dump; 'hc' is reused as scratch space for each class. */
3031 if (!start_queue_dump(netdev, &dump)) {
3035 while (nl_dump_next(&dump, &msg)) {
3036 unsigned int queue_id;
3038 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3039 hfsc_update_queue__(netdev, queue_id, &hc);
3043 nl_dump_done(&dump);
/* Tears down the HFSC state embedded around 'tc': each cached class is
 * removed from 'hfsc->tc.queues'.  The _SAFE iterator is used because
 * entries are removed while iterating. */
3048 hfsc_tc_destroy(struct tc *tc)
3051 struct hfsc_class *hc, *next;
3053 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3055 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3056 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Adds the qdisc-level "max-rate" for 'netdev' to 'details', formatted as a
 * decimal string equal to 8 times the cached 'hfsc->max_rate' (presumably a
 * bytes-to-bits conversion). */
3065 hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3067 const struct hfsc *hfsc;
3068 hfsc = hfsc_get__(netdev);
3069 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
/* Reconfigures the HFSC qdisc-level settings of 'netdev' from 'details': the
 * details are parsed into 'class', class 1:fffe under 1:0 is set up with
 * them, and the cached 'max_rate' is updated from 'class'. */
3074 hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3077 struct hfsc_class class;
3079 hfsc_parse_qdisc_details__(netdev, details, &class);
3080 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3081 tc_make_handle(1, 0), &class);
3084 hfsc_get__(netdev)->max_rate = class.max_rate;
/* Reports the cached rates of HFSC 'queue' in 'details' as decimal strings,
 * each multiplied by 8 before formatting.  "max-rate" is only added when it
 * differs from "min-rate".  'netdev' is unused. */
3091 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3092 const struct tc_queue *queue, struct shash *details)
3094 const struct hfsc_class *hc;
3096 hc = hfsc_class_cast__(queue);
3097 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3098 if (hc->min_rate != hc->max_rate) {
3099 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
/* Configures HFSC queue 'queue_id' on 'netdev' from 'details'.  The details
 * are parsed into 'class', the kernel class 1:('queue_id' + 1) is set up
 * under parent 1:fffe, and the local cache is updated with
 * hfsc_update_queue__(). */
3105 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3106 const struct shash *details)
3109 struct hfsc_class class;
3111 error = hfsc_parse_class_details__(netdev, details, &class);
3116 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3117 tc_make_handle(1, 0xfffe), &class);
3122 hfsc_update_queue__(netdev, queue_id, &class);
/* Deletes HFSC 'queue' from 'netdev': asks the kernel to delete class
 * 1:('queue_id' + 1) and removes the cached entry from 'hfsc->tc.queues'. */
3127 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3131 struct hfsc_class *hc;
3133 hc = hfsc_class_cast__(queue);
3134 hfsc = hfsc_get__(netdev);
3136 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3138 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches statistics for HFSC 'queue' on 'netdev' into 'stats' by querying
 * class 1:('queue_id' + 1) under parent 1:fffe.  Class options are not
 * requested (NULL). */
3145 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3146 struct netdev_queue_stats *stats)
3148 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3149 tc_make_handle(1, 0xfffe), NULL, stats);
/* Parses the class described by 'nlmsg' and, if its handle is 1:N with
 * 1 <= N <= HFSC_N_QUEUES, invokes 'cb' with queue ID N - 1, the parsed
 * statistics, and 'aux'.  'netdev' is unused. */
3153 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3154 const struct ofpbuf *nlmsg,
3155 netdev_dump_queue_stats_cb *cb, void *aux)
3157 struct netdev_queue_stats stats;
3158 unsigned int handle, major, minor;
3161 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3166 major = tc_get_major(handle);
3167 minor = tc_get_minor(handle);
/* Minor numbers are queue IDs offset by one; other handles are ignored. */
3168 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3169 (*cb)(minor - 1, &stats, aux);
/* Operations table for the HFSC implementation: Linux qdisc kind "hfsc",
 * exposed under the OVS name "linux-hfsc", with HFSC_N_QUEUES queues.  Every
 * operation slot is filled with the matching hfsc_*() function. */
3174 static const struct tc_ops tc_ops_hfsc = {
3175 "hfsc", /* linux_name */
3176 "linux-hfsc", /* ovs_name */
3177 HFSC_N_QUEUES, /* n_queues */
3178 hfsc_tc_install, /* tc_install */
3179 hfsc_tc_load, /* tc_load */
3180 hfsc_tc_destroy, /* tc_destroy */
3181 hfsc_qdisc_get, /* qdisc_get */
3182 hfsc_qdisc_set, /* qdisc_set */
3183 hfsc_class_get, /* class_get */
3184 hfsc_class_set, /* class_set */
3185 hfsc_class_delete, /* class_delete */
3186 hfsc_class_get_stats, /* class_get_stats */
3187 hfsc_class_dump_stats /* class_dump_stats */
3190 /* "linux-default" traffic control class.
3192 * This class represents the default, unnamed Linux qdisc. It corresponds to
3193 * the "" (empty string) QoS type in the OVS database. */
/* Attaches the "default" traffic-control object to 'netdev'.
 *
 * 'tc' is a function-level static, so every device that goes through this
 * function points at the same struct tc, initialized with tc_ops_default.
 * NOTE(review): the allocation is presumably guarded so that it happens only
 * once; the guard is not visible here. */
3196 default_install__(struct netdev *netdev)
3198 struct netdev_dev_linux *netdev_dev =
3199 netdev_dev_linux_cast(netdev_get_dev(netdev));
3200 static struct tc *tc;
3203 tc = xmalloc(sizeof *tc);
3204 tc_init(tc, &tc_ops_default);
3206 netdev_dev->tc = tc;
/* 'tc_install' operation for the default qdisc: attaches the shared default
 * tc object to 'netdev' via default_install__().  'details' is unused. */
3210 default_tc_install(struct netdev *netdev,
3211 const struct shash *details OVS_UNUSED)
3213 default_install__(netdev);
/* 'tc_load' operation for the default qdisc: attaches the shared default tc
 * object to 'netdev' via default_install__().  'nlmsg' is unused. */
3218 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3220 default_install__(netdev);
/* Operations table for the default qdisc.  It has no Linux qdisc name, and
 * all of the operations listed below are left NULL. */
3224 static const struct tc_ops tc_ops_default = {
3225 NULL, /* linux_name */
3230 NULL, /* tc_destroy */
3231 NULL, /* qdisc_get */
3232 NULL, /* qdisc_set */
3233 NULL, /* class_get */
3234 NULL, /* class_set */
3235 NULL, /* class_delete */
3236 NULL, /* class_get_stats */
3237 NULL /* class_dump_stats */
3240 /* "linux-other" traffic control class. */
/* 'tc_load' operation for "linux-other": attaches a tc object initialized
 * with tc_ops_other to 'netdev'.  'nlmsg' is unused.
 *
 * 'tc' is a function-level static, so every device loaded through here points
 * at the same struct tc.  NOTE(review): the allocation is presumably guarded
 * so that it happens only once; the guard is not visible here. */
3245 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3247 struct netdev_dev_linux *netdev_dev =
3248 netdev_dev_linux_cast(netdev_get_dev(netdev));
3249 static struct tc *tc;
3252 tc = xmalloc(sizeof *tc);
3253 tc_init(tc, &tc_ops_other);
3255 netdev_dev->tc = tc;
/* Operations table for "linux-other".  It has no Linux qdisc name and cannot
 * be installed (tc_install is NULL); all of the qdisc and class operations
 * listed below are NULL as well. */
3259 static const struct tc_ops tc_ops_other = {
3260 NULL, /* linux_name */
3261 "linux-other", /* ovs_name */
3263 NULL, /* tc_install */
3265 NULL, /* tc_destroy */
3266 NULL, /* qdisc_get */
3267 NULL, /* qdisc_set */
3268 NULL, /* class_get */
3269 NULL, /* class_set */
3270 NULL, /* class_delete */
3271 NULL, /* class_get_stats */
3272 NULL /* class_dump_stats */
3275 /* Traffic control. */
3277 /* Number of kernel "tc" ticks per second.  Computed from the values in
 * /proc/net/psched as a * c / b. */
3278 static double ticks_per_s;
3280 /* Number of kernel "jiffies" per second. This is used for the purpose of
3281 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3282 * one jiffy's worth of data.
3284 * There are two possibilities here:
3286 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3287 * approximate range of 100 to 1024. That means that we really need to
3288 * make sure that the qdisc can buffer that much data.
3290 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3291 * has finely granular timers and there's no need to fudge additional room
3292 * for buffers. (There's no extra effort needed to implement that: the
3293 * large 'buffer_hz' is used as a divisor, so practically any number will
3294 * come out as 0 in the division. Small integer results in the case of
3295 * really high dividends won't have any real effect anyhow.) */
3297 static unsigned int buffer_hz;
3299 /* Returns tc handle 'major':'minor'.  'major' is shifted into the upper 16
 * bits and combined with 'minor' using TC_H_MAKE. */
3301 tc_make_handle(unsigned int major, unsigned int minor)
3303 return TC_H_MAKE(major << 16, minor);
3306 /* Returns the major number from 'handle', that is, the TC_H_MAJ part shifted
 * down by 16 bits. */
3308 tc_get_major(unsigned int handle)
3310 return TC_H_MAJ(handle) >> 16;
3313 /* Returns the minor number from 'handle', as extracted by TC_H_MIN. */
3315 tc_get_minor(unsigned int handle)
3317 return TC_H_MIN(handle);
/* Begins a traffic-control Netlink request of the given 'type' for 'netdev'.
 *
 * Initializes 'request', appends a Netlink header with NLM_F_REQUEST | 'flags'
 * and a zeroed struct tcmsg whose family is AF_UNSPEC and whose ifindex is
 * that of 'netdev'.  Returns a pointer to the tcmsg inside 'request' so that
 * the caller can fill in the handle and parent. */
3320 static struct tcmsg *
3321 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3322 struct ofpbuf *request)
3324 struct tcmsg *tcmsg;
3328 error = get_ifindex(netdev, &ifindex);
3333 ofpbuf_init(request, 512);
3334 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3335 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3336 tcmsg->tcm_family = AF_UNSPEC;
3337 tcmsg->tcm_ifindex = ifindex;
3338 /* Caller should fill in tcmsg->tcm_handle. */
3339 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' on 'rtnl_sock' with nl_sock_transact(), passing 'replyp'
 * through for the reply, and then uninitializes 'request' regardless of the
 * outcome.  Callers therefore must not reuse 'request' afterward. */
3345 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3347 int error = nl_sock_transact(rtnl_sock, request, replyp);
3348 ofpbuf_uninit(request);
3355 /* The values in psched are not individually very meaningful, but they are
3356 * important. The tables below show some values seen in the wild.
3360 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3361 * (Before that, there are hints that it was 1000000000.)
3363 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3367 * -----------------------------------
3368 * [1] 000c8000 000f4240 000f4240 00000064
3369 * [2] 000003e8 00000400 000f4240 3b9aca00
3370 * [3] 000003e8 00000400 000f4240 3b9aca00
3371 * [4] 000003e8 00000400 000f4240 00000064
3372 * [5] 000003e8 00000040 000f4240 3b9aca00
3373 * [6] 000003e8 00000040 000f4240 000000f9
3375 * a b c d ticks_per_s buffer_hz
3376 * ------- --------- ---------- ------------- ----------- -------------
3377 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3378 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3379 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3380 * [4] 1,000 1,024 1,000,000 100 976,562 100
3381 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3382 * [6] 1,000 64 1,000,000 249 15,625,000 249
3384 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3385 * [2] 2.6.26-1-686-bigmem from Debian lenny
3386 * [3] 2.6.26-2-sparc64 from Debian lenny
3387 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3388 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3389 * [6] 2.6.34 from kernel.org on KVM */
3391 static const char fn[] = "/proc/net/psched";
3392 unsigned int a, b, c, d;
3398 stream = fopen(fn, "r");
3400 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
/* The file holds four hexadecimal values, read into 'a' through 'd'. */
3404 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3405 VLOG_WARN("%s: read failed", fn);
3409 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3413 VLOG_WARN("%s: invalid scheduler parameters", fn);
/* The arithmetic is done in double to keep the fractional part. */
3417 ticks_per_s = (double) a * c / b;
3421 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3424 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3427 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3428 * rate of 'rate' bytes per second.
 *
 * NOTE(review): 'rate * ticks' is evaluated in unsigned int before the
 * division by the double 'ticks_per_s', so a large product wraps around
 * first.  Converting one operand to a wider type would avoid that. */
3430 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3435 return (rate * ticks) / ticks_per_s;
3438 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3439 * rate of 'rate' bytes per second.
 *
 * Returns 0 if 'rate' is 0.  'ticks_per_s' is truncated to an integer by the
 * cast to unsigned long long before it is multiplied by 'size'. */
3441 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3446 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3449 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3450 * a transmission rate of 'rate' bytes per second.
 *
 * This is 'rate' divided by 'buffer_hz' in integer arithmetic, so a very
 * large 'buffer_hz' yields 0. */
3452 tc_buffer_per_jiffy(unsigned int rate)
3457 return rate / buffer_hz;
3460 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3461 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3462 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3463 * stores NULL into it if it is absent.
3465 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
 * 'msg'.
3468 * Returns 0 if successful, otherwise a positive errno value. */
3470 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3471 struct nlattr **options)
/* TCA_KIND is mandatory; TCA_OPTIONS may be absent. */
3473 static const struct nl_policy tca_policy[] = {
3474 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3475 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3477 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
/* Attributes start after the Netlink header and the struct tcmsg. */
3479 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3480 tca_policy, ta, ARRAY_SIZE(ta))) {
3481 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3486 *kind = nl_attr_get_string(ta[TCA_KIND]);
3490 *options = ta[TCA_OPTIONS];
3505 /* Given Netlink 'msg' that describes a class, extracts the class handle (the
3506 * 'tcm_handle' field of its struct tcmsg) into '*handlep', its TCA_OPTIONS
3507 * attribute into '*options', and its queue statistics into '*stats'. Any of
3508 * the output arguments may be null.
3510 * Returns 0 if successful, otherwise a positive errno value. */
3512 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3513 struct nlattr **options, struct netdev_queue_stats *stats)
/* Both TCA_OPTIONS and TCA_STATS2 are required in a class message. */
3515 static const struct nl_policy tca_policy[] = {
3516 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3517 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3519 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3521 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3522 tca_policy, ta, ARRAY_SIZE(ta))) {
3523 VLOG_WARN_RL(&rl, "failed to parse class message");
/* The struct tcmsg immediately follows the Netlink header. */
3528 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3529 *handlep = tc->tcm_handle;
3533 *options = ta[TCA_OPTIONS];
3537 const struct gnet_stats_queue *gsq;
3538 struct gnet_stats_basic gsb;
3540 static const struct nl_policy stats_policy[] = {
3541 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3542 .min_len = sizeof gsb },
3543 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3544 .min_len = sizeof *gsq },
3546 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3548 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3549 sa, ARRAY_SIZE(sa))) {
3550 VLOG_WARN_RL(&rl, "failed to parse class stats");
3554 /* Alignment issues screw up the length of struct gnet_stats_basic on
3555 * some arch/bitsize combinations. Newer versions of Linux have a
3556 * struct gnet_stats_basic_packed, but we can't depend on that. The
3557 * easiest thing to do is just to make a copy. */
3558 memset(&gsb, 0, sizeof gsb);
3559 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3560 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3561 stats->tx_bytes = gsb.bytes;
3562 stats->tx_packets = gsb.packets;
/* Queue drops are reported as transmit errors. */
3564 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3565 stats->tx_errors = gsq->drops;
3575 memset(stats, 0, sizeof *stats);
3580 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
 * on 'netdev', using an RTM_GETTCLASS request with NLM_F_ECHO.  'replyp' is
 * passed on to tc_transact() to receive the reply.  A rate-limited warning
 * that names the device and both handles is logged on failure. */
3583 tc_query_class(const struct netdev *netdev,
3584 unsigned int handle, unsigned int parent,
3585 struct ofpbuf **replyp)
3587 struct ofpbuf request;
3588 struct tcmsg *tcmsg;
3591 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3595 tcmsg->tcm_handle = handle;
3596 tcmsg->tcm_parent = parent;
3598 error = tc_transact(&request, replyp);
3600 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3601 netdev_get_name(netdev),
3602 tc_get_major(handle), tc_get_minor(handle),
3603 tc_get_major(parent), tc_get_minor(parent),
3609 /* Equivalent to "tc class del dev <name> handle <handle>".
 *
 * Sends an RTM_DELTCLASS request for 'handle' with parent 0 and no reply
 * buffer.  A rate-limited warning that names the device and the handle is
 * logged on failure. */
3611 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3613 struct ofpbuf request;
3614 struct tcmsg *tcmsg;
3617 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3621 tcmsg->tcm_handle = handle;
3622 tcmsg->tcm_parent = 0;
3624 error = tc_transact(&request, NULL);
3626 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3627 netdev_get_name(netdev),
3628 tc_get_major(handle), tc_get_minor(handle),
3634 /* Equivalent to "tc qdisc del dev <name> root".
 *
 * Sends an RTM_DELQDISC request for handle 1:0 at the root.  If that succeeds
 * and the device has a cached 'tc', the cache is destroyed through its
 * 'tc_destroy' operation (when it has one) and the pointer is cleared. */
3636 tc_del_qdisc(struct netdev *netdev)
3638 struct netdev_dev_linux *netdev_dev =
3639 netdev_dev_linux_cast(netdev_get_dev(netdev));
3640 struct ofpbuf request;
3641 struct tcmsg *tcmsg;
3644 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
3648 tcmsg->tcm_handle = tc_make_handle(1, 0);
3649 tcmsg->tcm_parent = TC_H_ROOT;
3651 error = tc_transact(&request, NULL);
3652 if (error == EINVAL) {
3653 /* EINVAL probably means that the default qdisc was in use, in which
3654 * case we've accomplished our purpose. */
/* Drop the cached state so that it is queried again when next needed. */
3657 if (!error && netdev_dev->tc) {
3658 if (netdev_dev->tc->ops->tc_destroy) {
3659 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3661 netdev_dev->tc = NULL;
3666 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3667 * kernel to determine what they are. Returns 0 if successful, otherwise a
3668 * positive errno value.
 *
 * The reply selects a tc_ops implementation, whose 'tc_load' operation then
 * instantiates the cached state.  The query error takes precedence over the
 * load error in the return value. */
3670 tc_query_qdisc(const struct netdev *netdev)
3672 struct netdev_dev_linux *netdev_dev =
3673 netdev_dev_linux_cast(netdev_get_dev(netdev));
3674 struct ofpbuf request, *qdisc;
3675 const struct tc_ops *ops;
3676 struct tcmsg *tcmsg;
/* A non-null 'tc' means the qdisc is already known. */
3680 if (netdev_dev->tc) {
3684 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3685 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3686 * 2.6.35 without that fix backported to it.
3688 * To avoid the OOPS, we must not make a request that would attempt to dump
3689 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3690 * few others. There are a few ways that I can see to do this, but most of
3691 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3692 * technique chosen here is to assume that any non-default qdisc that we
3693 * create will have a class with handle 1:0. The built-in qdiscs only have
3694 * a class with handle 0:0.
3696 * We could check for Linux 2.6.35+ and use a more straightforward method
 * there. */
3698 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
3702 tcmsg->tcm_handle = tc_make_handle(1, 0);
3703 tcmsg->tcm_parent = 0;
3705 /* Figure out what tc class to instantiate. */
3706 error = tc_transact(&request, &qdisc);
3710 error = tc_parse_qdisc(qdisc, &kind, NULL);
3712 ops = &tc_ops_other;
3714 ops = tc_lookup_linux_name(kind);
3716 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3717 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3719 ops = &tc_ops_other;
3722 } else if (error == ENOENT) {
3723 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3724 * other entity that doesn't have a handle 1:0. We will assume
3725 * that it's the system default qdisc. */
3726 ops = &tc_ops_default;
3729 /* Who knows? Maybe the device got deleted. */
3730 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3731 netdev_get_name(netdev), strerror(error));
3732 ops = &tc_ops_other;
3735 /* Instantiate it. */
/* The cast drops const because 'tc_load' takes a non-const netdev.  The
 * assertion requires that 'tc' is set exactly when loading succeeded. */
3736 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
3737 assert((load_error == 0) == (netdev_dev->tc != NULL));
3738 ofpbuf_delete(qdisc);
3740 return error ? error : load_error;
3743 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3744 approximate the time to transmit packets of various lengths. For an MTU of
3745 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3746 represents two possible packet lengths; for a MTU of 513 through 1024, four
3747 possible lengths; and so on.
3749 Returns, for the specified 'mtu', the number of bits that packet lengths
3750 need to be shifted right to fit within such a 256-entry table. */
3752 tc_calc_cell_log(unsigned int mtu)
/* ETH_PAYLOAD_MAX is substituted for 'mtu' (presumably when no MTU was
 * given; the condition is not visible here). */
3757 mtu = ETH_PAYLOAD_MAX;
/* Account for the Ethernet and VLAN headers on top of the payload. */
3759 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
/* Count iterations until the remaining value is below 256. */
3761 for (cell_log = 0; mtu >= 256; cell_log++) {
3768 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
 * of 'mtu'.
 *
 * 'rate' is zeroed first, so fields that are not assigned here (including
 * the commented-out 'overhead' and 'cell_align') stay 0. */
3771 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
3773 memset(rate, 0, sizeof *rate);
3774 rate->cell_log = tc_calc_cell_log(mtu);
3775 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3776 /* rate->cell_align = 0; */ /* distro headers. */
3777 rate->mpu = ETH_TOTAL_MIN;
3781 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
3782 * attribute of the specified "type".
3784 * See tc_calc_cell_log() above for a description of "rtab"s. */
3786 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
/* Reserve TC_RTAB_SIZE bytes of attribute payload and fill each entry. */
3791 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
3792 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
/* Entry 'i' stands for packets of up to (i + 1) << cell_log bytes, but
 * never less than the minimum packet unit 'mpu'. */
3793 unsigned packet_size = (i + 1) << rate->cell_log;
3794 if (packet_size < rate->mpu) {
3795 packet_size = rate->mpu;
3797 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
3801 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
3802 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
3803 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
 * 0 is fine: the result is then based on the minimum burst alone.)
 *
 * The minimum burst is one jiffy's worth of data at 'Bps' plus one 'mtu'.
 * NOTE(review): the 64-bit maximum is passed to tc_bytes_to_ticks(), whose
 * 'size' parameter is unsigned int, so larger bursts are truncated. */
3806 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
3808 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
3809 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
3813 /* Utility functions. */
/* Retrieves statistics for the device with index 'ifindex' into 'stats' by
 * sending an RTM_GETLINK request on 'rtnl_sock' and copying the fields of
 * the IFLA_STATS attribute (a struct rtnl_link_stats) from the reply.
 *
 * The reply is deleted on every path visible here, including the case where
 * it cannot be parsed and the case where it carries no statistics. */
3816 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
3818 /* Policy for RTNLGRP_LINK messages.
3820 * There are *many* more fields in these messages, but currently we only
3821 * care about these fields. */
3822 static const struct nl_policy rtnlgrp_link_policy[] = {
3823 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
3824 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
3825 .min_len = sizeof(struct rtnl_link_stats) },
3828 struct ofpbuf request;
3829 struct ofpbuf *reply;
3830 struct ifinfomsg *ifi;
3831 const struct rtnl_link_stats *rtnl_stats;
3832 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
/* Build the RTM_GETLINK request; the device is selected by index. */
3835 ofpbuf_init(&request, 0);
3836 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
3837 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
3838 ifi->ifi_family = PF_UNSPEC;
3839 ifi->ifi_index = ifindex;
3840 error = nl_sock_transact(rtnl_sock, &request, &reply);
3841 ofpbuf_uninit(&request);
3846 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
3847 rtnlgrp_link_policy,
3848 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
3849 ofpbuf_delete(reply);
/* IFLA_STATS is optional in the policy, so check for it explicitly. */
3853 if (!attrs[IFLA_STATS]) {
3854 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
3855 ofpbuf_delete(reply);
/* Copy each counter into the field of the same name in 'stats'. */
3859 rtnl_stats = nl_attr_get(attrs[IFLA_STATS]);
3860 stats->rx_packets = rtnl_stats->rx_packets;
3861 stats->tx_packets = rtnl_stats->tx_packets;
3862 stats->rx_bytes = rtnl_stats->rx_bytes;
3863 stats->tx_bytes = rtnl_stats->tx_bytes;
3864 stats->rx_errors = rtnl_stats->rx_errors;
3865 stats->tx_errors = rtnl_stats->tx_errors;
3866 stats->rx_dropped = rtnl_stats->rx_dropped;
3867 stats->tx_dropped = rtnl_stats->tx_dropped;
3868 stats->multicast = rtnl_stats->multicast;
3869 stats->collisions = rtnl_stats->collisions;
3870 stats->rx_length_errors = rtnl_stats->rx_length_errors;
3871 stats->rx_over_errors = rtnl_stats->rx_over_errors;
3872 stats->rx_crc_errors = rtnl_stats->rx_crc_errors;
3873 stats->rx_frame_errors = rtnl_stats->rx_frame_errors;
3874 stats->rx_fifo_errors = rtnl_stats->rx_fifo_errors;
3875 stats->rx_missed_errors = rtnl_stats->rx_missed_errors;
3876 stats->tx_aborted_errors = rtnl_stats->tx_aborted_errors;
3877 stats->tx_carrier_errors = rtnl_stats->tx_carrier_errors;
3878 stats->tx_fifo_errors = rtnl_stats->tx_fifo_errors;
3879 stats->tx_heartbeat_errors = rtnl_stats->tx_heartbeat_errors;
3880 stats->tx_window_errors = rtnl_stats->tx_window_errors;
3882 ofpbuf_delete(reply);
/* Fills in '*stats' for the device named 'netdev_name' by reading the text
 * file /proc/net/dev one line at a time with fgets().
 *
 * Rate-limited warnings are logged when the file cannot be opened, when a
 * line fails to parse, and via the "no stats for" message at the end.
 *
 * NOTE(review): the return statements are not visible here; confirm the
 * return convention (presumably 0 or an errno value) against the callers. */
3888 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
3890 static const char fn[] = "/proc/net/dev";
3895 stream = fopen(fn, "r");
3897 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
3902 while (fgets(line, sizeof line, stream)) {
/* X64 is the scanf conversion for one uint64_t value.  Each of the two
 * format rows below reads seven such counters and then skips one further
 * unsigned field with "%*u". */
3905 #define X64 "%"SCNu64
3908 X64 X64 X64 X64 X64 X64 X64 "%*u"
3909 X64 X64 X64 X64 X64 X64 X64 "%*u",
3915 &stats->rx_fifo_errors,
3916 &stats->rx_frame_errors,
3922 &stats->tx_fifo_errors,
3924 &stats->tx_carrier_errors) != 15) {
/* Anything other than 15 successful conversions is reported as a parse
 * error, tagged with the file name and the line counter 'ln'. */
3925 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
3926 } else if (!strcmp(devname, netdev_name)) {
/* This line describes the requested device.  The counters below are set to
 * UINT64_MAX rather than to a parsed value.
 * NOTE(review): presumably UINT64_MAX means "not available" because
 * /proc/net/dev does not report these counters -- confirm against the
 * documentation of struct netdev_stats. */
3927 stats->rx_length_errors = UINT64_MAX;
3928 stats->rx_over_errors = UINT64_MAX;
3929 stats->rx_crc_errors = UINT64_MAX;
3930 stats->rx_missed_errors = UINT64_MAX;
3931 stats->tx_aborted_errors = UINT64_MAX;
3932 stats->tx_heartbeat_errors = UINT64_MAX;
3933 stats->tx_window_errors = UINT64_MAX;
/* NOTE(review): presumably reached only when the loop finished without
 * finding a line for 'netdev_name' -- confirm. */
3939 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Reads the interface flags of 'netdev' with the SIOCGIFFLAGS ioctl, issued
 * through netdev_linux_do_ioctl() using the device's name, and stores
 * 'ifr.ifr_flags' into '*flags'.
 *
 * NOTE(review): presumably 'error' is what gets returned to the caller;
 * confirm, and confirm whether '*flags' is meaningful when the ioctl
 * fails. */
3945 get_flags(const struct netdev *netdev, int *flags)
3950 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
3952 *flags = ifr.ifr_flags;
/* Sets the interface flags of 'netdev' to 'flags': the value is placed in
 * 'ifr.ifr_flags' and submitted with the SIOCSIFFLAGS ioctl.  The return value
 * is whatever netdev_linux_do_ioctl() returns. */
3957 set_flags(struct netdev *netdev, int flags)
3961 ifr.ifr_flags = flags;
3962 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
/* Looks up the interface index of the device named 'netdev_name' with the
 * SIOCGIFINDEX ioctl on 'af_inet_sock'.  On success returns
 * 'ifr.ifr_ifindex'.  On failure a rate-limited warning is logged.
 *
 * NOTE(review): the value returned on failure is not visible here;
 * presumably a negative errno value -- confirm against get_ifindex(). */
3967 do_get_ifindex(const char *netdev_name)
/* NOTE(review): strncpy() does not NUL-terminate 'ifr.ifr_name' when
 * 'netdev_name' is at least sizeof ifr.ifr_name bytes long, and no memset() of
 * 'ifr' is visible before this copy -- confirm that names are always short
 * enough. */
3971 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
3972 COVERAGE_INC(netdev_get_ifindex);
3973 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
3974 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
3975 netdev_name, strerror(errno));
3978 return ifr.ifr_ifindex;
/* Stores the interface index of 'netdev_' into '*ifindexp', using the cache
 * kept in the underlying netdev_dev_linux.
 *
 * do_get_ifindex() is only called while VALID_IFINDEX is not set in
 * 'cache_valid'; the result is then saved in 'netdev_dev->ifindex' and the
 * bit is set so that later calls reuse the stored value. */
3982 get_ifindex(const struct netdev *netdev_, int *ifindexp)
3984 struct netdev_dev_linux *netdev_dev =
3985 netdev_dev_linux_cast(netdev_get_dev(netdev_));
3987 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
3988 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
/* NOTE(review): presumably a failed lookup is handled before the cache is
 * updated below -- confirm that an error value is never cached. */
3992 netdev_dev->cache_valid |= VALID_IFINDEX;
3993 netdev_dev->ifindex = ifindex;
3995 *ifindexp = netdev_dev->ifindex;
/* Retrieves the hardware address of the device named 'netdev_name' with the
 * SIOCGIFHWADDR ioctl on 'af_inet_sock' and copies ETH_ADDR_LEN bytes of it
 * into 'ea'.
 *
 * An error is logged when the ioctl fails.  A warning is logged when the
 * reported address family is neither AF_UNSPEC nor ARPHRD_ETHER.
 *
 * NOTE(review): the return statements are not visible here; confirm the
 * return convention against the callers. */
4000 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4005 memset(&ifr, 0, sizeof ifr);
/* NOTE(review): strncpy() leaves 'ifr.ifr_name' without a terminating NUL
 * when 'netdev_name' is at least sizeof ifr.ifr_name bytes long. */
4006 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4007 COVERAGE_INC(netdev_get_hwaddr);
4008 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4009 VLOG_ERR("ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4010 netdev_name, strerror(errno));
4013 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4014 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4015 VLOG_WARN("%s device has unknown hardware address family %d",
4016 netdev_name, hwaddr_family);
/* NOTE(review): the warning above does not appear to prevent this copy --
 * confirm that an unknown family is meant to be tolerated. */
4018 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware address of the device named 'netdev_name' to the
 * ETH_ADDR_LEN bytes in 'mac', tagged with address family 'hwaddr_family',
 * using the SIOCSIFHWADDR ioctl on 'af_inet_sock'.  An error is logged when
 * the ioctl fails.
 *
 * NOTE(review): the return statements are not visible here; confirm the
 * return convention against the callers. */
4023 set_etheraddr(const char *netdev_name, int hwaddr_family,
4024 const uint8_t mac[ETH_ADDR_LEN])
4028 memset(&ifr, 0, sizeof ifr);
/* NOTE(review): strncpy() leaves 'ifr.ifr_name' without a terminating NUL
 * when 'netdev_name' is at least sizeof ifr.ifr_name bytes long. */
4029 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4030 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4031 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4032 COVERAGE_INC(netdev_set_hwaddr);
4033 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4034 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4035 netdev_name, strerror(errno));
/* Issues a SIOCETHTOOL ioctl on 'af_inet_sock' for the device named 'name',
 * passing 'ecmd' to the kernel through 'ifr.ifr_data'.  'cmd_name' is only
 * used in the log message.
 *
 * A failure is logged (rate-limited) unless errno is EOPNOTSUPP.
 *
 * NOTE(review): 'cmd' is not used on any line visible here; presumably it is
 * stored into 'ecmd' before the ioctl -- confirm.  The return statements are
 * likewise not visible; confirm the return convention against the callers. */
4042 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4043 int cmd, const char *cmd_name)
4047 memset(&ifr, 0, sizeof ifr);
/* NOTE(review): strncpy() leaves 'ifr.ifr_name' without a terminating NUL
 * when 'name' is at least sizeof ifr.ifr_name bytes long. */
4048 strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4049 ifr.ifr_data = (caddr_t) ecmd;
4052 COVERAGE_INC(netdev_ethtool);
4053 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4056 if (errno != EOPNOTSUPP) {
4057 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4058 "failed: %s", cmd_name, name, strerror(errno));
4060 /* The device doesn't support this operation. That's pretty
4061 * common, so there's no point in logging anything. */
/* Copies 'name' into 'ifr->ifr_name' and issues ioctl 'cmd' on 'af_inet_sock'
 * with 'ifr' as its argument.  When the ioctl returns -1, a rate-limited
 * debug message naming 'cmd_name' is logged.
 *
 * Apart from 'ifr_name', 'ifr' is passed through as the caller prepared it.
 *
 * NOTE(review): the return statements are not visible here; presumably 0 on
 * success and an errno value on failure -- confirm.  Also note that strncpy()
 * leaves 'ifr->ifr_name' without a terminating NUL when 'name' is at least
 * sizeof ifr->ifr_name bytes long. */
4068 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4069 const char *cmd_name)
4071 strncpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4072 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4073 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
/* Runs ioctl 'cmd' (described by 'cmd_name' for logging) on 'netdev' through
 * netdev_linux_do_ioctl(), with the request's address family preset to
 * AF_INET, and stores the IPv4 address found in 'ifr.ifr_addr' into '*ip'.
 *
 * NOTE(review): presumably '*ip' is only written when the ioctl succeeded and
 * 'error' is returned to the caller -- confirm. */
4081 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4082 int cmd, const char *cmd_name)
4087 ifr.ifr_addr.sa_family = AF_INET;
4088 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
/* The generic sockaddr in the reply is reinterpreted as a sockaddr_in to
 * reach its 'sin_addr' member. */
4090 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4091 *ip = sin->sin_addr;