2 * Copyright (c) 2009, 2010 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
21 #include <arpa/inet.h>
23 #include <linux/gen_stats.h>
24 #include <linux/if_tun.h>
26 #include <linux/types.h>
27 #include <linux/ethtool.h>
28 #include <linux/pkt_sched.h>
29 #include <linux/rtnetlink.h>
30 #include <linux/sockios.h>
31 #include <linux/version.h>
32 #include <sys/types.h>
33 #include <sys/ioctl.h>
34 #include <sys/socket.h>
35 #include <netpacket/packet.h>
36 #include <net/ethernet.h>
38 #include <linux/if_tunnel.h>
39 #include <net/if_arp.h>
40 #include <net/if_packet.h>
41 #include <net/route.h>
42 #include <netinet/in.h>
49 #include "dynamic-string.h"
50 #include "fatal-signal.h"
53 #include "netdev-provider.h"
54 #include "netdev-vport.h"
56 #include "netlink-socket.h"
58 #include "openflow/openflow.h"
60 #include "poll-loop.h"
61 #include "rtnetlink.h"
62 #include "socket-util.h"
67 VLOG_DEFINE_THIS_MODULE(netdev_linux);
69 COVERAGE_DEFINE(netdev_get_vlan_vid);
70 COVERAGE_DEFINE(netdev_set_policing);
71 COVERAGE_DEFINE(netdev_arp_lookup);
72 COVERAGE_DEFINE(netdev_get_ifindex);
73 COVERAGE_DEFINE(netdev_get_hwaddr);
74 COVERAGE_DEFINE(netdev_set_hwaddr);
75 COVERAGE_DEFINE(netdev_ethtool);
77 /* These were introduced in Linux 2.6.14, so they might be missing if we have
79 #ifndef ADVERTISED_Pause
80 #define ADVERTISED_Pause (1 << 13)
82 #ifndef ADVERTISED_Asym_Pause
83 #define ADVERTISED_Asym_Pause (1 << 14)
86 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
89 #define TC_RTAB_SIZE 1024
92 static struct rtnetlink_notifier netdev_linux_cache_notifier;
93 static int cache_notifier_refcount;
96 VALID_IFINDEX = 1 << 0,
97 VALID_ETHERADDR = 1 << 1,
101 VALID_CARRIER = 1 << 5,
102 VALID_IS_PSEUDO = 1 << 6, /* Represents is_internal and is_tap. */
103 VALID_POLICING = 1 << 7,
104 VALID_HAVE_VPORT_STATS = 1 << 8
112 /* Traffic control. */
114 /* An instance of a traffic control class. Always associated with a particular
117 * Each TC implementation subclasses this with whatever additional data it
120 const struct tc_ops *ops;
121 struct hmap queues; /* Contains "struct tc_queue"s.
122 * Read by generic TC layer.
123 * Written only by TC implementation. */
126 /* One traffic control queue.
128 * Each TC implementation subclasses this with whatever additional data it
131 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
132 unsigned int queue_id; /* OpenFlow queue ID. */
135 /* A particular kind of traffic control. Each implementation generally maps to
136 * one particular Linux qdisc class.
138 * The functions below return 0 if successful or a positive errno value on
139 * failure, except where otherwise noted. All of them must be provided, except
140 * where otherwise noted. */
142 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
143 * This is null for tc_ops_default and tc_ops_other, for which there are no
144 * appropriate values. */
145 const char *linux_name;
147 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
148 const char *ovs_name;
150 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
151 * queues. The queues are numbered 0 through n_queues - 1. */
152 unsigned int n_queues;
154 /* Called to install this TC class on 'netdev'. The implementation should
155 * make the Netlink calls required to set up 'netdev' with the right qdisc
156 * and configure it according to 'details'. The implementation may assume
157 * that the current qdisc is the default; that is, there is no need for it
158 * to delete the current qdisc before installing itself.
160 * The contents of 'details' should be documented as valid for 'ovs_name'
161 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
162 * (which is built as ovs-vswitchd.conf.db(8)).
164 * This function must return 0 if and only if it sets 'netdev->tc' to an
165 * initialized 'struct tc'.
167 * (This function is null for tc_ops_other, which cannot be installed. For
168 * other TC classes it should always be nonnull.) */
169 int (*tc_install)(struct netdev *netdev, const struct shash *details);
171 /* Called when the netdev code determines (through a Netlink query) that
172 * this TC class's qdisc is installed on 'netdev', but we didn't install
173 * it ourselves and so don't know any of the details.
175 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
176 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
177 * implementation should parse the other attributes of 'nlmsg' as
178 * necessary to determine its configuration. If necessary it should also
179 * use Netlink queries to determine the configuration of queues on
182 * This function must return 0 if and only if it sets 'netdev->tc' to an
183 * initialized 'struct tc'. */
184 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
186 /* Destroys the data structures allocated by the implementation as part of
187 * 'tc'. (This includes destroying 'tc->queues' by calling
190 * The implementation should not need to perform any Netlink calls. If
191 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
192 * (But it may not be desirable.)
194 * This function may be null if 'tc' is trivial. */
195 void (*tc_destroy)(struct tc *tc);
197 /* Retrieves details of 'netdev->tc' configuration into 'details'.
199 * The implementation should not need to perform any Netlink calls, because
200 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
201 * cached the configuration.
203 * The contents of 'details' should be documented as valid for 'ovs_name'
204 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
205 * (which is built as ovs-vswitchd.conf.db(8)).
207 * This function may be null if 'tc' is not configurable.
209 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
211 /* Reconfigures 'netdev->tc' according to 'details', performing any
212 * required Netlink calls to complete the reconfiguration.
214 * The contents of 'details' should be documented as valid for 'ovs_name'
215 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
216 * (which is built as ovs-vswitchd.conf.db(8)).
218 * This function may be null if 'tc' is not configurable.
220 int (*qdisc_set)(struct netdev *, const struct shash *details);
222 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
223 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
225 * The contents of 'details' should be documented as valid for 'ovs_name'
226 * in the "other_config" column in the "Queue" table in
227 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
229 * The implementation should not need to perform any Netlink calls, because
230 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
231 * cached the queue configuration.
233 * This function may be null if 'tc' does not have queues ('n_queues' is
235 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
236 struct shash *details);
238 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
239 * 'details', performing any required Netlink calls to complete the
240 * reconfiguration. The caller ensures that 'queue_id' is less than
243 * The contents of 'details' should be documented as valid for 'ovs_name'
244 * in the "other_config" column in the "Queue" table in
245 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
247 * This function may be null if 'tc' does not have queues or its queues are
248 * not configurable. */
249 int (*class_set)(struct netdev *, unsigned int queue_id,
250 const struct shash *details);
252 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
253 * tc_queue's within 'netdev->tc->queues'.
255 * This function may be null if 'tc' does not have queues or its queues
256 * cannot be deleted. */
257 int (*class_delete)(struct netdev *, struct tc_queue *queue);
259 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
260 * 'struct tc_queue's within 'netdev->tc->queues'.
262 * On success, initializes '*stats'.
264 * This function may be null if 'tc' does not have queues or if it cannot
265 * report queue statistics. */
266 int (*class_get_stats)(const struct netdev *netdev,
267 const struct tc_queue *queue,
268 struct netdev_queue_stats *stats);
270 /* Extracts queue stats from 'nlmsg', which is a response to a
271 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
273 * This function may be null if 'tc' does not have queues or if it cannot
274 * report queue statistics. */
275 int (*class_dump_stats)(const struct netdev *netdev,
276 const struct ofpbuf *nlmsg,
277 netdev_dump_queue_stats_cb *cb, void *aux);
281 tc_init(struct tc *tc, const struct tc_ops *ops)
284 hmap_init(&tc->queues);
288 tc_destroy(struct tc *tc)
290 hmap_destroy(&tc->queues);
293 static const struct tc_ops tc_ops_htb;
294 static const struct tc_ops tc_ops_hfsc;
295 static const struct tc_ops tc_ops_default;
296 static const struct tc_ops tc_ops_other;
298 static const struct tc_ops *tcs[] = {
299 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
300 &tc_ops_hfsc, /* Hierarchical fair service curve. */
301 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
302 &tc_ops_other, /* Some other qdisc. */
306 static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
307 static unsigned int tc_get_major(unsigned int handle);
308 static unsigned int tc_get_minor(unsigned int handle);
310 static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
311 static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
312 static unsigned int tc_buffer_per_jiffy(unsigned int rate);
314 static struct tcmsg *tc_make_request(const struct netdev *, int type,
315 unsigned int flags, struct ofpbuf *);
316 static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
318 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
319 struct nlattr **options);
320 static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
321 struct nlattr **options,
322 struct netdev_queue_stats *);
323 static int tc_query_class(const struct netdev *,
324 unsigned int handle, unsigned int parent,
325 struct ofpbuf **replyp);
326 static int tc_delete_class(const struct netdev *, unsigned int handle);
328 static int tc_del_qdisc(struct netdev *netdev);
329 static int tc_query_qdisc(const struct netdev *netdev);
331 static int tc_calc_cell_log(unsigned int mtu);
332 static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
333 static void tc_put_rtab(struct ofpbuf *, uint16_t type,
334 const struct tc_ratespec *rate);
335 static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
337 struct netdev_dev_linux {
338 struct netdev_dev netdev_dev;
340 struct shash_node *shash_node;
341 unsigned int cache_valid;
343 /* The following are figured out "on demand" only. They are only valid
344 * when the corresponding VALID_* bit in 'cache_valid' is set. */
346 uint8_t etheraddr[ETH_ADDR_LEN];
347 struct in_addr address, netmask;
351 bool is_internal; /* Is this an openvswitch internal device? */
352 bool is_tap; /* Is this a tuntap device? */
353 uint32_t kbits_rate; /* Policing data. */
354 uint32_t kbits_burst;
355 bool have_vport_stats;
359 struct tap_state tap;
363 struct netdev_linux {
364 struct netdev netdev;
368 /* An AF_INET socket (used for ioctl operations). */
369 static int af_inet_sock = -1;
371 /* A Netlink routing socket that is not subscribed to any multicast groups. */
372 static struct nl_sock *rtnl_sock;
374 struct netdev_linux_notifier {
375 struct netdev_notifier notifier;
379 static struct shash netdev_linux_notifiers =
380 SHASH_INITIALIZER(&netdev_linux_notifiers);
381 static struct rtnetlink_notifier netdev_linux_poll_notifier;
383 /* This is set pretty low because we probably won't learn anything from the
384 * additional log messages. */
385 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
387 static int netdev_linux_init(void);
389 static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
390 int cmd, const char *cmd_name);
391 static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
392 const char *cmd_name);
393 static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
394 int cmd, const char *cmd_name);
395 static int get_flags(const struct netdev *, int *flagsp);
396 static int set_flags(struct netdev *, int flags);
397 static int do_get_ifindex(const char *netdev_name);
398 static int get_ifindex(const struct netdev *, int *ifindexp);
399 static int do_set_addr(struct netdev *netdev,
400 int ioctl_nr, const char *ioctl_name,
401 struct in_addr addr);
402 static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
403 static int set_etheraddr(const char *netdev_name, int hwaddr_family,
404 const uint8_t[ETH_ADDR_LEN]);
405 static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
406 static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
409 is_netdev_linux_class(const struct netdev_class *netdev_class)
411 return netdev_class->init == netdev_linux_init;
414 static struct netdev_dev_linux *
415 netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
417 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
418 assert(is_netdev_linux_class(netdev_class));
420 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
423 static struct netdev_linux *
424 netdev_linux_cast(const struct netdev *netdev)
426 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
427 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
428 assert(is_netdev_linux_class(netdev_class));
430 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
434 netdev_linux_init(void)
436 static int status = -1;
438 /* Create AF_INET socket. */
439 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
440 status = af_inet_sock >= 0 ? 0 : errno;
442 VLOG_ERR("failed to create inet socket: %s", strerror(status));
445 /* Create rtnetlink socket. */
447 status = nl_sock_create(NETLINK_ROUTE, 0, 0, 0, &rtnl_sock);
449 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
458 netdev_linux_run(void)
460 rtnetlink_notifier_run();
464 netdev_linux_wait(void)
466 rtnetlink_notifier_wait();
470 netdev_linux_cache_cb(const struct rtnetlink_change *change,
471 void *aux OVS_UNUSED)
473 struct netdev_dev_linux *dev;
475 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
477 const struct netdev_class *netdev_class =
478 netdev_dev_get_class(base_dev);
480 if (is_netdev_linux_class(netdev_class)) {
481 dev = netdev_dev_linux_cast(base_dev);
482 dev->cache_valid = 0;
486 struct shash device_shash;
487 struct shash_node *node;
489 shash_init(&device_shash);
490 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
491 SHASH_FOR_EACH (node, &device_shash) {
493 dev->cache_valid = 0;
495 shash_destroy(&device_shash);
499 /* Creates system and internal devices. */
501 netdev_linux_create(const struct netdev_class *class,
502 const char *name, const struct shash *args,
503 struct netdev_dev **netdev_devp)
505 struct netdev_dev_linux *netdev_dev;
508 if (!shash_is_empty(args)) {
509 VLOG_WARN("%s: arguments for %s devices should be empty",
513 if (!cache_notifier_refcount) {
514 error = rtnetlink_notifier_register(&netdev_linux_cache_notifier,
515 netdev_linux_cache_cb, NULL);
520 cache_notifier_refcount++;
522 netdev_dev = xzalloc(sizeof *netdev_dev);
523 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
525 *netdev_devp = &netdev_dev->netdev_dev;
529 /* For most types of netdevs we open the device for each call of
530 * netdev_open(). However, this is not the case with tap devices,
531 * since it is only possible to open the device once. In this
532 * situation we share a single file descriptor, and consequently
533 * buffers, across all readers. Therefore once data is read it will
534 * be unavailable to other reads for tap devices. */
536 netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
537 const char *name, const struct shash *args,
538 struct netdev_dev **netdev_devp)
540 struct netdev_dev_linux *netdev_dev;
541 struct tap_state *state;
542 static const char tap_dev[] = "/dev/net/tun";
546 if (!shash_is_empty(args)) {
547 VLOG_WARN("%s: arguments for TAP devices should be empty", name);
550 netdev_dev = xzalloc(sizeof *netdev_dev);
551 state = &netdev_dev->state.tap;
553 /* Open tap device. */
554 state->fd = open(tap_dev, O_RDWR);
557 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
561 /* Create tap device. */
562 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
563 strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
564 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
565 VLOG_WARN("%s: creating tap device failed: %s", name,
571 /* Make non-blocking. */
572 error = set_nonblocking(state->fd);
577 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
578 *netdev_devp = &netdev_dev->netdev_dev;
587 destroy_tap(struct netdev_dev_linux *netdev_dev)
589 struct tap_state *state = &netdev_dev->state.tap;
591 if (state->fd >= 0) {
596 /* Destroys the netdev device 'netdev_dev_'. */
598 netdev_linux_destroy(struct netdev_dev *netdev_dev_)
600 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
601 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
603 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
604 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
607 if (class == &netdev_linux_class || class == &netdev_internal_class) {
608 cache_notifier_refcount--;
610 if (!cache_notifier_refcount) {
611 rtnetlink_notifier_unregister(&netdev_linux_cache_notifier);
613 } else if (class == &netdev_tap_class) {
614 destroy_tap(netdev_dev);
623 netdev_linux_open(struct netdev_dev *netdev_dev_, int ethertype,
624 struct netdev **netdevp)
626 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
627 struct netdev_linux *netdev;
628 enum netdev_flags flags;
631 /* Allocate network device. */
632 netdev = xzalloc(sizeof *netdev);
634 netdev_init(&netdev->netdev, netdev_dev_);
636 /* Verify that the device really exists, by attempting to read its flags.
637 * (The flags might be cached, in which case this won't actually do an
640 * Don't do this for "internal" netdevs, though, because those have to be
641 * created as netdev objects before they exist in the kernel, because
642 * creating them in the kernel happens by passing a netdev object to
643 * dpif_port_add(). */
644 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
645 error = netdev_get_flags(&netdev->netdev, &flags);
646 if (error == ENODEV) {
651 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
652 !netdev_dev->state.tap.opened) {
654 /* We assume that the first user of the tap device is the primary user
655 * and give them the tap FD. Subsequent users probably just expect
656 * this to be a system device so open it normally to avoid send/receive
657 * directions appearing to be reversed. */
658 netdev->fd = netdev_dev->state.tap.fd;
659 netdev_dev->state.tap.opened = true;
660 } else if (ethertype != NETDEV_ETH_TYPE_NONE) {
661 struct sockaddr_ll sll;
665 /* Create file descriptor. */
666 protocol = (ethertype == NETDEV_ETH_TYPE_ANY ? ETH_P_ALL
667 : ethertype == NETDEV_ETH_TYPE_802_2 ? ETH_P_802_2
669 netdev->fd = socket(PF_PACKET, SOCK_RAW, htons(protocol));
670 if (netdev->fd < 0) {
675 /* Set non-blocking mode. */
676 error = set_nonblocking(netdev->fd);
681 /* Get ethernet device index. */
682 error = get_ifindex(&netdev->netdev, &ifindex);
687 /* Bind to specific ethernet device. */
688 memset(&sll, 0, sizeof sll);
689 sll.sll_family = AF_PACKET;
690 sll.sll_ifindex = ifindex;
692 (struct sockaddr *) &sll, sizeof sll) < 0) {
694 VLOG_ERR("bind to %s failed: %s", netdev_dev_get_name(netdev_dev_),
699 /* Between the socket() and bind() calls above, the socket receives all
700 * packets of the requested type on all system interfaces. We do not
701 * want to receive that data, but there is no way to avoid it. So we
702 * must now drain out the receive queue. */
703 error = drain_rcvbuf(netdev->fd);
709 *netdevp = &netdev->netdev;
713 netdev_uninit(&netdev->netdev, true);
717 /* Closes and destroys 'netdev'. */
719 netdev_linux_close(struct netdev *netdev_)
721 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
723 if (netdev->fd > 0 && strcmp(netdev_get_type(netdev_), "tap")) {
729 /* Initializes 'svec' with a list of the names of all known network devices. */
731 netdev_linux_enumerate(struct svec *svec)
733 struct if_nameindex *names;
735 names = if_nameindex();
739 for (i = 0; names[i].if_name != NULL; i++) {
740 svec_add(svec, names[i].if_name);
742 if_freenameindex(names);
745 VLOG_WARN("could not obtain list of network device names: %s",
752 netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
754 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
756 if (netdev->fd < 0) {
757 /* Device was opened with NETDEV_ETH_TYPE_NONE. */
762 ssize_t retval = read(netdev->fd, data, size);
765 } else if (errno != EINTR) {
766 if (errno != EAGAIN) {
767 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
768 netdev_get_name(netdev_), strerror(errno)); /* Device name first, then error text, matching the format string. */
775 /* Registers with the poll loop to wake up from the next call to poll_block()
776 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
778 netdev_linux_recv_wait(struct netdev *netdev_)
780 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
781 if (netdev->fd >= 0) {
782 poll_fd_wait(netdev->fd, POLLIN);
786 /* Discards all packets waiting to be received from 'netdev'. */
788 netdev_linux_drain(struct netdev *netdev_)
790 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
791 if (netdev->fd < 0) {
793 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
795 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
796 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
800 drain_fd(netdev->fd, ifr.ifr_qlen);
803 return drain_rcvbuf(netdev->fd);
807 /* Sends 'data' on 'netdev'. Returns 0 if successful, otherwise a positive
808 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
809 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
810 * the packet is too big or too small to transmit on the device.
812 * The caller retains ownership of 'data' in all cases.
814 * The kernel maintains a packet transmission queue, so the caller is not
815 * expected to do additional queuing of packets. */
817 netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
819 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
821 /* XXX should support sending even if 'ethertype' was NETDEV_ETH_TYPE_NONE.
823 if (netdev->fd < 0) {
828 ssize_t retval = write(netdev->fd, data, size);
830 /* The Linux AF_PACKET implementation never blocks waiting for room
831 * for packets, instead returning ENOBUFS. Translate this into
832 * EAGAIN for the caller. */
833 if (errno == ENOBUFS) {
835 } else if (errno == EINTR) {
837 } else if (errno != EAGAIN) {
838 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
839 netdev_get_name(netdev_), strerror(errno));
842 } else if (retval != size) {
843 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
844 "%zu) on %s", retval, size, netdev_get_name(netdev_));
852 /* Registers with the poll loop to wake up from the next call to poll_block()
853 * when the packet transmission queue has sufficient room to transmit a packet
854 * with netdev_send().
856 * The kernel maintains a packet transmission queue, so the client is not
857 * expected to do additional queuing of packets. Thus, this function is
858 * unlikely to ever be used. It is included for completeness. */
860 netdev_linux_send_wait(struct netdev *netdev_)
862 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
863 if (netdev->fd < 0) {
865 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
866 poll_fd_wait(netdev->fd, POLLOUT);
868 /* TAP device always accepts packets.*/
869 poll_immediate_wake();
873 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
874 * otherwise a positive errno value. */
876 netdev_linux_set_etheraddr(struct netdev *netdev_,
877 const uint8_t mac[ETH_ADDR_LEN])
879 struct netdev_dev_linux *netdev_dev =
880 netdev_dev_linux_cast(netdev_get_dev(netdev_));
883 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
884 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
885 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
887 netdev_dev->cache_valid |= VALID_ETHERADDR;
888 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
896 /* Copies 'netdev''s MAC address into the caller-provided buffer 'mac', which
897 * must have room for ETH_ADDR_LEN bytes. */
899 netdev_linux_get_etheraddr(const struct netdev *netdev_,
900 uint8_t mac[ETH_ADDR_LEN])
902 struct netdev_dev_linux *netdev_dev =
903 netdev_dev_linux_cast(netdev_get_dev(netdev_));
904 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
905 int error = get_etheraddr(netdev_get_name(netdev_),
906 netdev_dev->etheraddr);
910 netdev_dev->cache_valid |= VALID_ETHERADDR;
912 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
916 /* Stores in '*mtup' the maximum size of transmitted (and received) packets on
917 * 'netdev', in bytes, not including the hardware header; thus, this is
918 * typically 1500 bytes for Ethernet devices. */
920 netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
922 struct netdev_dev_linux *netdev_dev =
923 netdev_dev_linux_cast(netdev_get_dev(netdev_));
924 if (!(netdev_dev->cache_valid & VALID_MTU)) {
928 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
929 SIOCGIFMTU, "SIOCGIFMTU");
933 netdev_dev->mtu = ifr.ifr_mtu;
934 netdev_dev->cache_valid |= VALID_MTU;
936 *mtup = netdev_dev->mtu;
940 /* Returns the ifindex of 'netdev', if successful, as a positive number.
941 * On failure, returns a negative errno value. */
943 netdev_linux_get_ifindex(const struct netdev *netdev)
947 error = get_ifindex(netdev, &ifindex);
948 return error ? -error : ifindex;
952 netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
954 struct netdev_dev_linux *netdev_dev =
955 netdev_dev_linux_cast(netdev_get_dev(netdev_));
960 if (!(netdev_dev->cache_valid & VALID_CARRIER)) {
964 fn = xasprintf("/sys/class/net/%s/carrier",
965 netdev_get_name(netdev_));
966 fd = open(fn, O_RDONLY);
969 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
973 retval = read(fd, line, sizeof line);
976 if (error == EINVAL) {
977 /* This is the normal return value when we try to check carrier
978 * if the network device is not up. */
980 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
983 } else if (retval == 0) {
985 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
989 if (line[0] != '0' && line[0] != '1') {
991 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)",
995 netdev_dev->carrier = line[0] != '0';
996 netdev_dev->cache_valid |= VALID_CARRIER;
998 *carrier = netdev_dev->carrier;
1009 /* Check whether we can use RTM_GETLINK to get network device statistics.
1010 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1013 check_for_working_netlink_stats(void)
1015 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1016 * preferable, so if that works, we'll use it. */
1017 int ifindex = do_get_ifindex("lo");
1019 VLOG_WARN("failed to get ifindex for lo, "
1020 "obtaining netdev stats from proc");
1023 struct netdev_stats stats;
1024 int error = get_stats_via_netlink(ifindex, &stats);
1026 VLOG_DBG("obtaining netdev stats via rtnetlink");
1029 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1030 "via proc (you are probably running a pre-2.6.19 "
1031 "kernel)", strerror(error));
1037 /* Brings the 'is_internal' and 'is_tap' members of 'netdev_dev' up-to-date. */
1039 netdev_linux_update_is_pseudo(struct netdev_dev_linux *netdev_dev)
1041 if (!(netdev_dev->cache_valid & VALID_IS_PSEUDO)) {
1042 const char *name = netdev_dev_get_name(&netdev_dev->netdev_dev);
1043 const char *type = netdev_dev_get_type(&netdev_dev->netdev_dev);
1045 netdev_dev->is_tap = !strcmp(type, "tap");
1046 netdev_dev->is_internal = false;
1047 if (!netdev_dev->is_tap) {
1048 struct ethtool_drvinfo drvinfo;
1051 memset(&drvinfo, 0, sizeof drvinfo);
1052 error = netdev_linux_do_ethtool(name,
1053 (struct ethtool_cmd *)&drvinfo,
1055 "ETHTOOL_GDRVINFO");
1057 if (!error && !strcmp(drvinfo.driver, "openvswitch")) {
1058 netdev_dev->is_internal = true;
1062 netdev_dev->cache_valid |= VALID_IS_PSEUDO;
1067 swap_uint64(uint64_t *a, uint64_t *b)
1074 /* Retrieves current device stats for 'netdev'. */
1076 netdev_linux_get_stats(const struct netdev *netdev_,
1077 struct netdev_stats *stats)
1079 struct netdev_dev_linux *netdev_dev =
1080 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1081 static int use_netlink_stats = -1;
1084 if (netdev_dev->have_vport_stats ||
1085 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1087 error = netdev_vport_get_stats(netdev_, stats);
1088 netdev_dev->have_vport_stats = !error;
1089 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
1092 if (!netdev_dev->have_vport_stats) {
1093 if (use_netlink_stats < 0) {
1094 use_netlink_stats = check_for_working_netlink_stats();
1096 if (use_netlink_stats) {
1099 error = get_ifindex(netdev_, &ifindex);
1101 error = get_stats_via_netlink(ifindex, stats);
1104 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1108 /* If this port is an internal port then the transmit and receive stats
1109 * will appear to be swapped relative to the other ports since we are the
1110 * one sending the data, not a remote computer. For consistency, we swap
1111 * them back here. This does not apply if we are getting stats from the
1112 * vport layer because it always tracks stats from the perspective of the
1114 netdev_linux_update_is_pseudo(netdev_dev);
1115 if (!error && !netdev_dev->have_vport_stats &&
1116 (netdev_dev->is_internal || netdev_dev->is_tap)) {
1117 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1118 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1119 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1120 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1121 stats->rx_length_errors = 0;
1122 stats->rx_over_errors = 0;
1123 stats->rx_crc_errors = 0;
1124 stats->rx_frame_errors = 0;
1125 stats->rx_fifo_errors = 0;
1126 stats->rx_missed_errors = 0;
1127 stats->tx_aborted_errors = 0;
1128 stats->tx_carrier_errors = 0;
1129 stats->tx_fifo_errors = 0;
1130 stats->tx_heartbeat_errors = 0;
1131 stats->tx_window_errors = 0;
1137 /* Stores the features supported by 'netdev' into each of '*current',
1138 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1139 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
1140 * successful, otherwise a positive errno value. */
/* NOTE(review): the statements visible here store through current,
 * advertised, supported and peer without a null test; presumably callers
 * always supply all four pointers -- confirm against the wording above. */
1142 netdev_linux_get_features(struct netdev *netdev,
1143 uint32_t *current, uint32_t *advertised,
1144 uint32_t *supported, uint32_t *peer)
1146 struct ethtool_cmd ecmd;
/* Fetch the link settings with ETHTOOL_GSET; every bitmap below is derived
 * from the ethtool_cmd that this call fills in. */
1149 memset(&ecmd, 0, sizeof ecmd);
1150 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1151 ETHTOOL_GSET, "ETHTOOL_GSET");
1156 /* Supported features. */
/* Each SUPPORTED_* bit of ecmd.supported maps to one OFPPF_* bit. */
1158 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1159 *supported |= OFPPF_10MB_HD;
1161 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1162 *supported |= OFPPF_10MB_FD;
1164 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1165 *supported |= OFPPF_100MB_HD;
1167 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1168 *supported |= OFPPF_100MB_FD;
1170 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1171 *supported |= OFPPF_1GB_HD;
1173 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1174 *supported |= OFPPF_1GB_FD;
1176 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1177 *supported |= OFPPF_10GB_FD;
1179 if (ecmd.supported & SUPPORTED_TP) {
1180 *supported |= OFPPF_COPPER;
1182 if (ecmd.supported & SUPPORTED_FIBRE) {
1183 *supported |= OFPPF_FIBER;
1185 if (ecmd.supported & SUPPORTED_Autoneg) {
1186 *supported |= OFPPF_AUTONEG;
1188 if (ecmd.supported & SUPPORTED_Pause) {
1189 *supported |= OFPPF_PAUSE;
1191 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1192 *supported |= OFPPF_PAUSE_ASYM;
1195 /* Advertised features. */
/* Same mapping as above, applied to the ADVERTISED_* bits of
 * ecmd.advertising. */
1197 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1198 *advertised |= OFPPF_10MB_HD;
1200 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1201 *advertised |= OFPPF_10MB_FD;
1203 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1204 *advertised |= OFPPF_100MB_HD;
1206 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1207 *advertised |= OFPPF_100MB_FD;
1209 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1210 *advertised |= OFPPF_1GB_HD;
1212 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1213 *advertised |= OFPPF_1GB_FD;
1215 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1216 *advertised |= OFPPF_10GB_FD;
1218 if (ecmd.advertising & ADVERTISED_TP) {
1219 *advertised |= OFPPF_COPPER;
1221 if (ecmd.advertising & ADVERTISED_FIBRE) {
1222 *advertised |= OFPPF_FIBER;
1224 if (ecmd.advertising & ADVERTISED_Autoneg) {
1225 *advertised |= OFPPF_AUTONEG;
1227 if (ecmd.advertising & ADVERTISED_Pause) {
1228 *advertised |= OFPPF_PAUSE;
1230 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1231 *advertised |= OFPPF_PAUSE_ASYM;
1234 /* Current settings. */
/* Speed and duplex together pick a single rate bit; 10G is reported as full
 * duplex regardless of ecmd.duplex. */
1235 if (ecmd.speed == SPEED_10) {
1236 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1237 } else if (ecmd.speed == SPEED_100) {
1238 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1239 } else if (ecmd.speed == SPEED_1000) {
1240 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1241 } else if (ecmd.speed == SPEED_10000) {
1242 *current = OFPPF_10GB_FD;
/* The medium is taken from ecmd.port. */
1247 if (ecmd.port == PORT_TP) {
1248 *current |= OFPPF_COPPER;
1249 } else if (ecmd.port == PORT_FIBRE) {
1250 *current |= OFPPF_FIBER;
1254 *current |= OFPPF_AUTONEG;
1257 /* Peer advertisements. */
1258 *peer = 0; /* XXX */
1263 /* Set the features advertised by 'netdev' to 'advertise'. */
/* advertise is tested against OFPPF_* bits.  The current settings are read
 * with ETHTOOL_GSET, only the advertising mask is replaced, and the result is
 * written back with ETHTOOL_SSET, whose return value is returned. */
1265 netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1267 struct ethtool_cmd ecmd;
1270 memset(&ecmd, 0, sizeof ecmd);
1271 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1272 ETHTOOL_GSET, "ETHTOOL_GSET");
/* Rebuild the mask from scratch so that bits absent from advertise are
 * cleared rather than inherited from the driver. */
1277 ecmd.advertising = 0;
1278 if (advertise & OFPPF_10MB_HD) {
1279 ecmd.advertising |= ADVERTISED_10baseT_Half;
1281 if (advertise & OFPPF_10MB_FD) {
1282 ecmd.advertising |= ADVERTISED_10baseT_Full;
1284 if (advertise & OFPPF_100MB_HD) {
1285 ecmd.advertising |= ADVERTISED_100baseT_Half;
1287 if (advertise & OFPPF_100MB_FD) {
1288 ecmd.advertising |= ADVERTISED_100baseT_Full;
1290 if (advertise & OFPPF_1GB_HD) {
1291 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1293 if (advertise & OFPPF_1GB_FD) {
1294 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1296 if (advertise & OFPPF_10GB_FD) {
1297 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1299 if (advertise & OFPPF_COPPER) {
1300 ecmd.advertising |= ADVERTISED_TP;
1302 if (advertise & OFPPF_FIBER) {
1303 ecmd.advertising |= ADVERTISED_FIBRE;
1305 if (advertise & OFPPF_AUTONEG) {
1306 ecmd.advertising |= ADVERTISED_Autoneg;
1308 if (advertise & OFPPF_PAUSE) {
1309 ecmd.advertising |= ADVERTISED_Pause;
1311 if (advertise & OFPPF_PAUSE_ASYM) {
1312 ecmd.advertising |= ADVERTISED_Asym_Pause;
1314 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1315 ETHTOOL_SSET, "ETHTOOL_SSET");
1318 /* If 'netdev_name' is the name of a VLAN network device (e.g. one created with
1319 * vconfig(8)), sets '*vlan_vid' to the VLAN VID associated with that device
1320 * and returns 0. Otherwise returns a errno value (specifically ENOENT if
1321 * 'netdev_name' is the name of a network device that is not a VLAN device) and
1322 * sets '*vlan_vid' to -1. */
1324 netdev_linux_get_vlan_vid(const struct netdev *netdev, int *vlan_vid)
1326 const char *netdev_name = netdev_get_name(netdev);
1327 struct ds line = DS_EMPTY_INITIALIZER;
1328 FILE *stream = NULL;
1332 COVERAGE_INC(netdev_get_vlan_vid);
1333 fn = xasprintf("/proc/net/vlan/%s", netdev_name);
1334 stream = fopen(fn, "r");
1340 if (ds_get_line(&line, stream)) {
1341 if (ferror(stream)) {
1343 VLOG_ERR_RL(&rl, "error reading \"%s\": %s", fn, strerror(errno));
1346 VLOG_ERR_RL(&rl, "unexpected end of file reading \"%s\"", fn);
1351 if (!sscanf(ds_cstr(&line), "%*s VID: %d", vlan_vid)) {
1353 VLOG_ERR_RL(&rl, "parse error reading \"%s\" line 1: \"%s\"",
1354 fn, ds_cstr(&line));
/* printf-style templates for tc(8) command lines.  POLICE_ADD_CMD takes the
 * device name (%s).  POLICE_CONFIG_CMD takes the device name (%s) followed by
 * two %d arguments that fill in the "rate ...kbit" and "burst ...k" fields. */
1372 #define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress"
1373 #define POLICE_CONFIG_CMD "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 u32 match ip src 0.0.0.0/0 police rate %dkbit burst %dk mtu 65535 drop flowid :1"
1375 /* Remove ingress policing from 'netdev'. Returns 0 if successful, otherwise a
1376 * positive errno value.
1378 * This function is equivalent to running
1379 * /sbin/tc qdisc del dev %s handle ffff: ingress
1380 * but it is much, much faster.
1383 netdev_linux_remove_policing(struct netdev *netdev)
1385 struct netdev_dev_linux *netdev_dev =
1386 netdev_dev_linux_cast(netdev_get_dev(netdev));
1387 const char *netdev_name = netdev_get_name(netdev);
1389 struct ofpbuf request;
1390 struct tcmsg *tcmsg;
/* Build an RTM_DELQDISC request that names the ingress qdisc with handle
 * ffff:, the same handle that POLICE_ADD_CMD installs. */
1393 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
1397 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
1398 tcmsg->tcm_parent = TC_H_INGRESS;
1399 nl_msg_put_string(&request, TCA_KIND, "ingress");
1400 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
1402 error = tc_transact(&request, NULL);
/* ENOENT and EINVAL are exempt from the warning; presumably they mean that no
 * ingress qdisc was installed in the first place -- TODO confirm. */
1403 if (error && error != ENOENT && error != EINVAL) {
1404 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1405 netdev_name, strerror(error));
/* Record "no policing" in the cache that netdev_linux_set_policing() checks. */
1409 netdev_dev->kbits_rate = 0;
1410 netdev_dev->kbits_burst = 0;
1411 netdev_dev->cache_valid |= VALID_POLICING;
1415 /* Attempts to set input rate limiting (policing) policy. */
/* kbits_rate and kbits_burst are stored in netdev_dev together with the
 * VALID_POLICING bit so that a repeated request with unchanged values can be
 * recognized.  Existing policing is removed before the new settings are
 * applied by running the POLICE_ADD_CMD and POLICE_CONFIG_CMD command lines. */
1417 netdev_linux_set_policing(struct netdev *netdev,
1418 uint32_t kbits_rate, uint32_t kbits_burst)
1420 struct netdev_dev_linux *netdev_dev =
1421 netdev_dev_linux_cast(netdev_get_dev(netdev));
1422 const char *netdev_name = netdev_get_name(netdev);
1425 COVERAGE_INC(netdev_set_policing);
1427 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1428 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1429 : kbits_burst); /* Stick with user-specified value. */
1431 if (netdev_dev->cache_valid & VALID_POLICING
1432 && netdev_dev->kbits_rate == kbits_rate
1433 && netdev_dev->kbits_burst == kbits_burst) {
1434 /* Assume that settings haven't changed since we last set them. */
1438 netdev_linux_remove_policing(netdev);
/* NOTE(review): the device name is interpolated into a string that system()
 * hands to a shell, and the uint32_t rate and burst are passed for the %d
 * conversions of POLICE_CONFIG_CMD.  Both deserve a follow-up. */
1440 snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
1441 if (system(command) != 0) {
1442 VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
1446 snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
1447 kbits_rate, kbits_burst);
1448 if (system(command) != 0) {
1449 VLOG_WARN_RL(&rl, "%s: problem configuring policing",
/* Remember what was installed for the cache check near the top. */
1454 netdev_dev->kbits_rate = kbits_rate;
1455 netdev_dev->kbits_burst = kbits_burst;
1456 netdev_dev->cache_valid |= VALID_POLICING;
/* Adds to types the ovs_name of every entry of the NULL-terminated tcs array
 * that has a tc_install callback and a nonempty ovs_name.  The netdev argument
 * is unused. */
1463 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1466 const struct tc_ops **opsp;
1468 for (opsp = tcs; *opsp != NULL; opsp++) {
1469 const struct tc_ops *ops = *opsp;
1470 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1471 svec_add(types, ops->ovs_name);
/* Scans the NULL-terminated tcs array for an entry whose ovs_name equals name.
 * Presumably returns that entry, or NULL if there is none -- the return
 * statements are not visible here. */
1477 static const struct tc_ops *
1478 tc_lookup_ovs_name(const char *name)
1480 const struct tc_ops **opsp;
1482 for (opsp = tcs; *opsp != NULL; opsp++) {
1483 const struct tc_ops *ops = *opsp;
1484 if (!strcmp(name, ops->ovs_name)) {
/* Scans the NULL-terminated tcs array for an entry whose linux_name equals
 * name.  Entries with a null linux_name are skipped.  Presumably returns the
 * match, or NULL if there is none -- the return statements are not visible
 * here. */
1491 static const struct tc_ops *
1492 tc_lookup_linux_name(const char *name)
1494 const struct tc_ops **opsp;
1496 for (opsp = tcs; *opsp != NULL; opsp++) {
1497 const struct tc_ops *ops = *opsp;
1498 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
/* Searches the hash bucket selected by hash in the queues map of the traffic
 * control state of netdev for a queue whose queue_id equals queue_id.  The
 * caller supplies hash; tc_find_queue() computes it as hash_int(queue_id, 0). */
1505 static struct tc_queue *
1506 tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1509 struct netdev_dev_linux *netdev_dev =
1510 netdev_dev_linux_cast(netdev_get_dev(netdev));
1511 struct tc_queue *queue;
1513 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1514 if (queue->queue_id == queue_id) {
/* Convenience wrapper around tc_find_queue__() that derives the hash from
 * queue_id with hash_int(queue_id, 0). */
1521 static struct tc_queue *
1522 tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
1524 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
/* Looks up the tc_ops registered under the OVS name type and copies its
 * n_queues into caps->n_queues.  The netdev argument is unused.
 * NOTE(review): how a failed lookup is handled is not visible here. */
1528 netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1530 struct netdev_qos_capabilities *caps)
1532 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1536 caps->n_queues = ops->n_queues;
/* Queries the qdisc of netdev, stores the OVS name of its tc_ops in *typep,
 * and, when the ops provide a qdisc_get callback, lets it fill in details. */
1541 netdev_linux_get_qos(const struct netdev *netdev,
1542 const char **typep, struct shash *details)
1544 struct netdev_dev_linux *netdev_dev =
1545 netdev_dev_linux_cast(netdev_get_dev(netdev));
1548 error = tc_query_qdisc(netdev);
1553 *typep = netdev_dev->tc->ops->ovs_name;
/* qdisc_get is optional; the alternative taken when it is null is not
 * visible here. */
1554 return (netdev_dev->tc->ops->qdisc_get
1555 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
/* Configures QoS of the given type on netdev.  If the requested tc_ops are the
 * ones already in use, only their optional qdisc_set callback is invoked;
 * otherwise the existing qdisc is deleted and the new one installed through
 * tc_install. */
1560 netdev_linux_set_qos(struct netdev *netdev,
1561 const char *type, const struct shash *details)
1563 struct netdev_dev_linux *netdev_dev =
1564 netdev_dev_linux_cast(netdev_get_dev(netdev));
1565 const struct tc_ops *new_ops;
/* Unknown types and types that cannot be installed are rejected up front. */
1568 new_ops = tc_lookup_ovs_name(type);
1569 if (!new_ops || !new_ops->tc_install) {
1573 error = tc_query_qdisc(netdev);
1578 if (new_ops == netdev_dev->tc->ops) {
1579 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1581 /* Delete existing qdisc. */
1582 error = tc_del_qdisc(netdev);
1586 assert(netdev_dev->tc == NULL);
1588 /* Install new qdisc. */
1589 error = new_ops->tc_install(netdev, details);
/* tc_install must set netdev_dev->tc exactly when it succeeds. */
1590 assert((error == 0) == (netdev_dev->tc != NULL));
/* Queries the qdisc of netdev, looks up the queue numbered queue_id, and asks
 * the class_get callback of the tc_ops to describe it in details.  The
 * condition that guards the class_get call is not visible here. */
1597 netdev_linux_get_queue(const struct netdev *netdev,
1598 unsigned int queue_id, struct shash *details)
1600 struct netdev_dev_linux *netdev_dev =
1601 netdev_dev_linux_cast(netdev_get_dev(netdev));
1604 error = tc_query_qdisc(netdev);
1608 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1610 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
/* Queries the qdisc of netdev and forwards the request to the class_set
 * callback of its tc_ops.  A queue_id at or beyond n_queues, or tc_ops without
 * class_set, takes a separate branch whose body is not visible here. */
1616 netdev_linux_set_queue(struct netdev *netdev,
1617 unsigned int queue_id, const struct shash *details)
1619 struct netdev_dev_linux *netdev_dev =
1620 netdev_dev_linux_cast(netdev_get_dev(netdev));
1623 error = tc_query_qdisc(netdev);
1626 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1627 || !netdev_dev->tc->ops->class_set) {
1631 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
/* Queries the qdisc of netdev, looks up the queue numbered queue_id, and hands
 * it to the class_delete callback of the tc_ops.  tc_ops without class_delete
 * take a separate branch whose body is not visible here. */
1635 netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1637 struct netdev_dev_linux *netdev_dev =
1638 netdev_dev_linux_cast(netdev_get_dev(netdev));
1641 error = tc_query_qdisc(netdev);
1644 } else if (!netdev_dev->tc->ops->class_delete) {
1647 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1649 ? netdev_dev->tc->ops->class_delete(netdev, queue)
/* Queries the qdisc of netdev, looks up the queue numbered queue_id, and asks
 * the class_get_stats callback of the tc_ops to fill in stats.  tc_ops without
 * class_get_stats take a separate branch whose body is not visible here. */
1655 netdev_linux_get_queue_stats(const struct netdev *netdev,
1656 unsigned int queue_id,
1657 struct netdev_queue_stats *stats)
1659 struct netdev_dev_linux *netdev_dev =
1660 netdev_dev_linux_cast(netdev_get_dev(netdev));
1663 error = tc_query_qdisc(netdev);
1666 } else if (!netdev_dev->tc->ops->class_get_stats) {
1669 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1671 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
/* Starts a Netlink dump of the traffic classes of netdev: builds an
 * RTM_GETTCLASS request with tcm_parent 0, begins the dump on rtnl_sock, and
 * releases the request buffer.  Callers in this file treat a zero (false)
 * result as failure. */
1677 start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1679 struct ofpbuf request;
1680 struct tcmsg *tcmsg;
1682 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1686 tcmsg->tcm_parent = 0;
1687 nl_dump_start(dump, rtnl_sock, &request);
1688 ofpbuf_uninit(&request);
/* Iterates over every queue cached for netdev and, for each one, obtains its
 * details through the class_get callback of the tc_ops and passes the queue
 * id, the details and aux to cb. */
1693 netdev_linux_dump_queues(const struct netdev *netdev,
1694 netdev_dump_queues_cb *cb, void *aux)
1696 struct netdev_dev_linux *netdev_dev =
1697 netdev_dev_linux_cast(netdev_get_dev(netdev));
1698 struct tc_queue *queue;
1699 struct shash details;
1703 error = tc_query_qdisc(netdev);
1706 } else if (!netdev_dev->tc->ops->class_get) {
/* A single shash is reused for all queues and emptied before each one. */
1711 shash_init(&details);
1712 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
1713 shash_clear(&details);
1715 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
1717 (*cb)(queue->queue_id, &details, aux);
1722 shash_destroy(&details);
/* Dumps the traffic classes of netdev over Netlink and feeds each reply to the
 * class_dump_stats callback of the tc_ops together with cb and aux.  An error
 * from finishing the dump takes precedence over last_error in the return
 * value. */
1728 netdev_linux_dump_queue_stats(const struct netdev *netdev,
1729 netdev_dump_queue_stats_cb *cb, void *aux)
1731 struct netdev_dev_linux *netdev_dev =
1732 netdev_dev_linux_cast(netdev_get_dev(netdev));
1733 struct nl_dump dump;
1738 error = tc_query_qdisc(netdev);
1741 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1746 if (!start_queue_dump(netdev, &dump)) {
1749 while (nl_dump_next(&dump, &msg)) {
1750 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1756 error = nl_dump_done(&dump);
1757 return error ? error : last_error;
/* Copies the IPv4 address and netmask of netdev_ into *address and *netmask.
 * The values are fetched with the SIOCGIFADDR and SIOCGIFNETMASK ioctls the
 * first time and served from the netdev_dev cache while VALID_IN4 is set.
 * Returns EADDRNOTAVAIL when the address is INADDR_ANY, otherwise 0 on the
 * path visible here. */
1761 netdev_linux_get_in4(const struct netdev *netdev_,
1762 struct in_addr *address, struct in_addr *netmask)
1764 struct netdev_dev_linux *netdev_dev =
1765 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1767 if (!(netdev_dev->cache_valid & VALID_IN4)) {
1770 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
1771 SIOCGIFADDR, "SIOCGIFADDR");
1776 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
1777 SIOCGIFNETMASK, "SIOCGIFNETMASK");
1782 netdev_dev->cache_valid |= VALID_IN4;
1784 *address = netdev_dev->address;
1785 *netmask = netdev_dev->netmask;
1786 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
/* Assigns address to netdev_ with SIOCSIFADDR, records address and netmask in
 * the netdev_dev cache, and, unless address is INADDR_ANY, also sets the
 * netmask with SIOCSIFNETMASK. */
1790 netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
1791 struct in_addr netmask)
1793 struct netdev_dev_linux *netdev_dev =
1794 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1797 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
1799 netdev_dev->cache_valid |= VALID_IN4;
1800 netdev_dev->address = address;
1801 netdev_dev->netmask = netmask;
/* The netmask ioctl is skipped when the address is being cleared. */
1802 if (address.s_addr != INADDR_ANY) {
1803 error = do_set_addr(netdev_, SIOCSIFNETMASK,
1804 "SIOCSIFNETMASK", netmask);
/* Parses 'line', one line of /proc/net/if_inet6: 32 hexadecimal digits of
 * IPv6 address, four hexadecimal fields that are skipped, and an interface
 * name of at most 16 characters.
 *
 * Stores the 16 address bytes into 'in6' and the interface name into 'ifname'
 * and returns true only if every field was parsed.  On a false return the
 * contents of 'in6' and 'ifname' are unspecified. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *s6 = in6->s6_addr;
    int n_fields;

    /* X8 converts two hexadecimal digits into one uint8_t.  It is only
     * meaningful inside this function, so it is undefined again below instead
     * of leaking into the rest of the file. */
#define X8 "%2"SCNx8
    n_fields = sscanf(line,
                      " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                      "%*x %*x %*x %*x %16s\n",
                      &s6[0], &s6[1], &s6[2], &s6[3],
                      &s6[4], &s6[5], &s6[6], &s6[7],
                      &s6[8], &s6[9], &s6[10], &s6[11],
                      &s6[12], &s6[13], &s6[14], &s6[15],
                      ifname);
#undef X8

    /* 16 address bytes plus the interface name; the suppressed "%*x" fields
     * do not count toward the result of sscanf(). */
    return n_fields == 17;
}
1826 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
1827 * 'in6' is non-null) and returns true. Otherwise, returns false. */
/* The address is found by scanning /proc/net/if_inet6 for a line whose
 * interface name matches and is cached in netdev_dev under VALID_IN6; the
 * cached value starts out as in6addr_any.
 * NOTE(review): the store through in6 visible below has no null test, unlike
 * the wording above -- confirm. */
1829 netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
1831 struct netdev_dev_linux *netdev_dev =
1832 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1833 if (!(netdev_dev->cache_valid & VALID_IN6)) {
1837 netdev_dev->in6 = in6addr_any;
1839 file = fopen("/proc/net/if_inet6", "r");
1841 const char *name = netdev_get_name(netdev_);
1842 while (fgets(line, sizeof line, file)) {
1843 struct in6_addr in6_tmp;
1844 char ifname[16 + 1];
1845 if (parse_if_inet6_line(line, &in6_tmp, ifname)
1846 && !strcmp(name, ifname))
1848 netdev_dev->in6 = in6_tmp;
1854 netdev_dev->cache_valid |= VALID_IN6;
1856 *in6 = netdev_dev->in6;
/* Fills *sa with an AF_INET socket address that carries addr.  A zeroed
 * sockaddr_in is built first, then *sa is cleared and the sockaddr_in is
 * copied over it, so no stale bytes remain in *sa. */
1861 make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
1863 struct sockaddr_in sin;
1864 memset(&sin, 0, sizeof sin);
1865 sin.sin_family = AF_INET;
1866 sin.sin_addr = addr;
/* NOTE(review): the copy assumes that sizeof sin does not exceed
 * sizeof *sa -- confirm for the target platform. */
1869 memset(sa, 0, sizeof *sa);
1870 memcpy(sa, &sin, sizeof sin);
/* Issues the address-setting ioctl ioctl_nr (named ioctl_name for logging) on
 * netdev with addr as its IPv4 argument and returns the result of
 * netdev_linux_do_ioctl().
 * NOTE(review): strncpy() leaves ifr_name unterminated if the device name
 * fills the whole field -- confirm that names are always shorter. */
1874 do_set_addr(struct netdev *netdev,
1875 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
1878 strncpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
1879 make_in4_sockaddr(&ifr.ifr_addr, addr);
1881 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
1885 /* Adds 'router' as a default IP gateway. */
/* The route is added with the SIOCADDRT ioctl on af_inet_sock.  Destination
 * and netmask are both INADDR_ANY, which makes it a default route; netdev is
 * unused. */
1887 netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
1889 struct in_addr any = { INADDR_ANY };
1893 memset(&rt, 0, sizeof rt);
1894 make_in4_sockaddr(&rt.rt_dst, any);
1895 make_in4_sockaddr(&rt.rt_gateway, router);
1896 make_in4_sockaddr(&rt.rt_genmask, any);
1897 rt.rt_flags = RTF_UP | RTF_GATEWAY;
1898 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
1900 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
/* Scans /proc/net/route for a route that is up and whose masked destination
 * matches host.  For such a route, stores the next hop in *next_hop and a
 * copy of the interface name, allocated with xstrdup(), in *netdev_name.
 * *netdev_name is set to NULL before the scan starts. */
1906 netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
1909 static const char fn[] = "/proc/net/route";
1914 *netdev_name = NULL;
1915 stream = fopen(fn, "r");
1916 if (stream == NULL) {
1917 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
1922 while (fgets(line, sizeof line, stream)) {
1925 uint32_t dest, gateway, mask;
1926 int refcnt, metric, mtu;
1927 unsigned int flags, use, window, irtt;
/* A line is accepted only if all eleven fields convert. */
1930 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
1932 iface, &dest, &gateway, &flags, &refcnt,
1933 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
1935 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
1939 if (!(flags & RTF_UP)) {
1940 /* Skip routes that aren't up. */
1944 /* The output of 'dest', 'mask', and 'gateway' were given in
1945 * network byte order, so we don't need any endian
1946 * conversions here. */
1947 if ((dest & mask) == (host->s_addr & mask)) {
1949 /* The host is directly reachable. */
1950 next_hop->s_addr = 0;
1952 /* To reach the host, we must go through a gateway. */
1953 next_hop->s_addr = gateway;
1955 *netdev_name = xstrdup(iface);
1966 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
1967 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
1968 * returns 0. Otherwise, it returns a positive errno value; in particular,
1969 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
1971 netdev_linux_arp_lookup(const struct netdev *netdev,
1972 uint32_t ip, uint8_t mac[ETH_ADDR_LEN])
1975 struct sockaddr_in sin;
1978 memset(&r, 0, sizeof r);
1979 sin.sin_family = AF_INET;
1980 sin.sin_addr.s_addr = ip;
1982 memcpy(&r.arp_pa, &sin, sizeof sin);
1983 r.arp_ha.sa_family = ARPHRD_ETHER;
1985 strncpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
1986 COVERAGE_INC(netdev_arp_lookup);
1987 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
1989 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
1990 } else if (retval != ENXIO) {
1991 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
1992 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
/* Translates a set of netdev_flags into interface flags.  NETDEV_UP and
 * NETDEV_PROMISC are the bits tested here; presumably they map to IFF_UP and
 * IFF_PROMISC, mirroring iff_to_nd_flags() -- the assignments are not visible
 * here. */
1998 nd_to_iff_flags(enum netdev_flags nd)
2001 if (nd & NETDEV_UP) {
2004 if (nd & NETDEV_PROMISC) {
/* Translates interface flags into a set of netdev_flags, starting from an
 * empty set.  IFF_PROMISC maps to NETDEV_PROMISC; other mappings are not
 * visible here. */
2011 iff_to_nd_flags(int iff)
2013 enum netdev_flags nd = 0;
2017 if (iff & IFF_PROMISC) {
2018 nd |= NETDEV_PROMISC;
/* Reads the interface flags of netdev, reports them through *old_flagsp as
 * netdev_flags, then clears the bits named by off and sets the bits named by
 * on.  set_flags() is called only when the resulting value differs from the
 * old one. */
2024 netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2025 enum netdev_flags on, enum netdev_flags *old_flagsp)
2027 int old_flags, new_flags;
2030 error = get_flags(netdev, &old_flags);
2032 *old_flagsp = iff_to_nd_flags(old_flags);
/* Bits in on are applied after those in off, so on wins where they overlap. */
2033 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2034 if (new_flags != old_flags) {
2035 error = set_flags(netdev, new_flags);
2042 poll_notify(struct list *list)
2044 struct netdev_linux_notifier *notifier;
2045 LIST_FOR_EACH (notifier, node, list) {
2046 struct netdev_notifier *n = ¬ifier->notifier;
/* rtnetlink callback registered by netdev_linux_poll_add().  It either looks
 * up a single notifier list in netdev_linux_notifiers or walks all of them and
 * calls poll_notify() on each; the condition that selects between the two
 * paths is not visible here.  aux is unused. */
2052 netdev_linux_poll_cb(const struct rtnetlink_change *change,
2053 void *aux OVS_UNUSED)
2056 struct list *list = shash_find_data(&netdev_linux_notifiers,
2062 struct shash_node *node;
2063 SHASH_FOR_EACH (node, &netdev_linux_notifiers) {
2064 poll_notify(node->data);
2070 netdev_linux_poll_add(struct netdev *netdev,
2071 void (*cb)(struct netdev_notifier *), void *aux,
2072 struct netdev_notifier **notifierp)
2074 const char *netdev_name = netdev_get_name(netdev);
2075 struct netdev_linux_notifier *notifier;
2078 if (shash_is_empty(&netdev_linux_notifiers)) {
2079 int error = rtnetlink_notifier_register(&netdev_linux_poll_notifier,
2080 netdev_linux_poll_cb, NULL);
2086 list = shash_find_data(&netdev_linux_notifiers, netdev_name);
2088 list = xmalloc(sizeof *list);
2090 shash_add(&netdev_linux_notifiers, netdev_name, list);
2093 notifier = xmalloc(sizeof *notifier);
2094 netdev_notifier_init(¬ifier->notifier, netdev, cb, aux);
2095 list_push_back(list, ¬ifier->node);
2096 *notifierp = ¬ifier->notifier;
2101 netdev_linux_poll_remove(struct netdev_notifier *notifier_)
2103 struct netdev_linux_notifier *notifier =
2104 CONTAINER_OF(notifier_, struct netdev_linux_notifier, notifier);
2107 /* Remove 'notifier' from its list. */
2108 list = list_remove(¬ifier->node);
2109 if (list_is_empty(list)) {
2110 /* The list is now empty. Remove it from the hash and free it. */
2111 const char *netdev_name = netdev_get_name(notifier->notifier.netdev);
2112 shash_delete(&netdev_linux_notifiers,
2113 shash_find(&netdev_linux_notifiers, netdev_name));
2118 /* If that was the last notifier, unregister. */
2119 if (shash_is_empty(&netdev_linux_notifiers)) {
2120 rtnetlink_notifier_unregister(&netdev_linux_poll_notifier);
/* Expands to the list of callbacks shared by the netdev classes defined in
 * this file.  NAME, CREATE, ENUMERATE and SET_STATS are the per-class parts;
 * presumably they fill the slots that are not listed by name below -- the
 * lines that use them are not visible here. */
2124 #define NETDEV_LINUX_CLASS(NAME, CREATE, ENUMERATE, SET_STATS) \
2128 netdev_linux_init, \
2130 netdev_linux_wait, \
2133 netdev_linux_destroy, \
2134 NULL, /* reconfigure */ \
2136 netdev_linux_open, \
2137 netdev_linux_close, \
2141 netdev_linux_recv, \
2142 netdev_linux_recv_wait, \
2143 netdev_linux_drain, \
2145 netdev_linux_send, \
2146 netdev_linux_send_wait, \
2148 netdev_linux_set_etheraddr, \
2149 netdev_linux_get_etheraddr, \
2150 netdev_linux_get_mtu, \
2151 netdev_linux_get_ifindex, \
2152 netdev_linux_get_carrier, \
2153 netdev_linux_get_stats, \
2156 netdev_linux_get_features, \
2157 netdev_linux_set_advertisements, \
2158 netdev_linux_get_vlan_vid, \
2160 netdev_linux_set_policing, \
2161 netdev_linux_get_qos_types, \
2162 netdev_linux_get_qos_capabilities, \
2163 netdev_linux_get_qos, \
2164 netdev_linux_set_qos, \
2165 netdev_linux_get_queue, \
2166 netdev_linux_set_queue, \
2167 netdev_linux_delete_queue, \
2168 netdev_linux_get_queue_stats, \
2169 netdev_linux_dump_queues, \
2170 netdev_linux_dump_queue_stats, \
2172 netdev_linux_get_in4, \
2173 netdev_linux_set_in4, \
2174 netdev_linux_get_in6, \
2175 netdev_linux_add_router, \
2176 netdev_linux_get_next_hop, \
2177 netdev_linux_arp_lookup, \
2179 netdev_linux_update_flags, \
2181 netdev_linux_poll_add, \
2182 netdev_linux_poll_remove \
/* Class that uses netdev_linux_create and netdev_linux_enumerate and provides
 * no set_stats callback. */
2185 const struct netdev_class netdev_linux_class =
2188 netdev_linux_create,
2189 netdev_linux_enumerate,
2190 NULL); /* set_stats */
/* Class for tap devices: created with netdev_linux_create_tap, with neither an
 * enumerate nor a set_stats callback. */
2192 const struct netdev_class netdev_tap_class =
2195 netdev_linux_create_tap,
2196 NULL, /* enumerate */
2197 NULL); /* set_stats */
/* Class for internal devices: created with netdev_linux_create, without an
 * enumerate callback, and with netdev_vport_set_stats as its set_stats
 * callback. */
2199 const struct netdev_class netdev_internal_class =
2202 netdev_linux_create,
2203 NULL, /* enumerate */
2204 netdev_vport_set_stats);
2206 /* HTB traffic control class. */
/* htb_parse_tcmsg__() accepts class minor numbers from 1 through HTB_N_QUEUES
 * and maps minor number N to queue id N - 1. */
2208 #define HTB_N_QUEUES 0xf000
/* Qdisc-wide maximum rate; htb_qdisc_get() reports it multiplied by 8. */
2212 unsigned int max_rate; /* In bytes/s. */
/* Per-queue settings.  tc_queue is the embedded generic queue that
 * htb_class_cast__() converts back from with CONTAINER_OF. */
2216 struct tc_queue tc_queue;
2217 unsigned int min_rate; /* In bytes/s. */
2218 unsigned int max_rate; /* In bytes/s. */
2219 unsigned int burst; /* In bytes. */
2220 unsigned int priority; /* Lower values are higher priorities. */
/* Returns the struct htb that contains the tc member pointed to by the tc
 * field of the netdev_dev of netdev.
 * NOTE(review): valid only while the installed tc really is embedded in a
 * struct htb; no check is made here. */
2224 htb_get__(const struct netdev *netdev)
2226 struct netdev_dev_linux *netdev_dev =
2227 netdev_dev_linux_cast(netdev_get_dev(netdev));
2228 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
/* Allocates a struct htb, initializes its tc member with tc_ops_htb, records
 * max_rate in it, and makes it the tc of the netdev_dev of netdev.
 * NOTE(review): max_rate arrives as uint64_t but the struct field is commented
 * as an unsigned int in bytes/s, so large values would be narrowed --
 * confirm. */
2232 htb_install__(struct netdev *netdev, uint64_t max_rate)
2234 struct netdev_dev_linux *netdev_dev =
2235 netdev_dev_linux_cast(netdev_get_dev(netdev));
2238 htb = xmalloc(sizeof *htb);
2239 tc_init(&htb->tc, &tc_ops_htb);
2240 htb->max_rate = max_rate;
2242 netdev_dev->tc = &htb->tc;
2247 /* Create an HTB qdisc.
2249 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2251 htb_setup_qdisc__(struct netdev *netdev)
2254 struct tc_htb_glob opt;
2255 struct ofpbuf request;
2256 struct tcmsg *tcmsg;
/* Any existing qdisc is deleted first; the result of that call is not used on
 * this line. */
2258 tc_del_qdisc(netdev);
/* NLM_F_EXCL | NLM_F_CREATE asks for creation and, presumably, failure if a
 * qdisc already exists -- TODO confirm. */
2260 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2261 NLM_F_EXCL | NLM_F_CREATE, &request);
2265 tcmsg->tcm_handle = tc_make_handle(1, 0);
2266 tcmsg->tcm_parent = TC_H_ROOT;
2268 nl_msg_put_string(&request, TCA_KIND, "htb");
2270 memset(&opt, 0, sizeof opt);
2271 opt.rate2quantum = 10;
/* The global HTB options travel as TCA_HTB_INIT nested inside TCA_OPTIONS. */
2275 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2276 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2277 nl_msg_end_nested(&request, opt_offset);
2279 return tc_transact(&request, NULL);
2282 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2283 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2285 htb_setup_class__(struct netdev *netdev, unsigned int handle,
2286 unsigned int parent, struct htb_class *class)
2289 struct tc_htb_opt opt;
2290 struct ofpbuf request;
2291 struct tcmsg *tcmsg;
/* NOTE(review): the result of netdev_get_mtu() is not checked on this line;
 * confirm that mtu cannot be used unset. */
2295 netdev_get_mtu(netdev, &mtu);
/* rate comes from min_rate and ceil from max_rate; both buffers are sized from
 * the same burst value. */
2297 memset(&opt, 0, sizeof opt);
2298 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2299 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2300 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2301 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2302 opt.prio = class->priority;
2304 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2308 tcmsg->tcm_handle = handle;
2309 tcmsg->tcm_parent = parent;
/* The class options and the two rate tables are nested inside TCA_OPTIONS. */
2311 nl_msg_put_string(&request, TCA_KIND, "htb");
2312 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2313 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2314 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2315 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2316 nl_msg_end_nested(&request, opt_offset);
2318 error = tc_transact(&request, NULL);
2320 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2321 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2322 netdev_get_name(netdev),
2323 tc_get_major(handle), tc_get_minor(handle),
2324 tc_get_major(parent), tc_get_minor(parent),
2325 class->min_rate, class->max_rate,
2326 class->burst, class->priority, strerror(error));
2331 /* Parses Netlink attributes in 'nl_options' for HTB parameters and stores a
2332 * description of them into '*class'. The description complies with the
2333 * specification given in the vswitch database documentation for linux-htb
/* Extracts the mandatory TCA_HTB_PARMS attribute from nl_options and copies
 * its rate, ceil, buffer and prio values into class. */
2336 htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
/* TCA_HTB_PARMS is required and must be at least as large as a
 * struct tc_htb_opt. */
2338 static const struct nl_policy tca_htb_policy[] = {
2339 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2340 .min_len = sizeof(struct tc_htb_opt) },
2343 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2344 const struct tc_htb_opt *htb;
2346 if (!nl_parse_nested(nl_options, tca_htb_policy,
2347 attrs, ARRAY_SIZE(tca_htb_policy))) {
2348 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
/* The burst is derived from the rate and the kernel buffer value with
 * tc_ticks_to_bytes(). */
2352 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2353 class->min_rate = htb->rate.rate;
2354 class->max_rate = htb->ceil.rate;
2355 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2356 class->priority = htb->prio;
/* Parses the traffic class message tcmsg.  queue_id and options are each
 * optional: when queue_id is nonnull the class handle is translated into a
 * queue id, and when options is nonnull the HTB attributes are parsed into
 * it.  stats is passed straight through to tc_parse_class(). */
2361 htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2362 struct htb_class *options,
2363 struct netdev_queue_stats *stats)
2365 struct nlattr *nl_options;
2366 unsigned int handle;
2369 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2370 if (!error && queue_id) {
2371 unsigned int major = tc_get_major(handle);
2372 unsigned int minor = tc_get_minor(handle);
/* Only classes 1:1 through 1:HTB_N_QUEUES are queues; queue ids are zero-based
 * while minor numbers start at 1. */
2373 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2374 *queue_id = minor - 1;
2379 if (!error && options) {
2380 error = htb_parse_tca_options__(nl_options, options);
2386 htb_parse_qdisc_details__(struct netdev *netdev,
2387 const struct shash *details, struct htb_class *hc)
2389 const char *max_rate_s;
2391 max_rate_s = shash_find_data(details, "max-rate");
2392 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2393 if (!hc->max_rate) {
2396 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2397 hc->max_rate = netdev_features_to_bps(current) / 8;
2399 hc->min_rate = hc->max_rate;
/* Fills in hc from the per-queue configuration in details.  The keys read are
 * "min-rate", "max-rate", "burst" and "priority".  Rates and burst are divided
 * by 8 after parsing, and both rates are capped at the max_rate of the
 * qdisc. */
2405 htb_parse_class_details__(struct netdev *netdev,
2406 const struct shash *details, struct htb_class *hc)
2408 const struct htb *htb = htb_get__(netdev);
2409 const char *min_rate_s = shash_find_data(details, "min-rate");
2410 const char *max_rate_s = shash_find_data(details, "max-rate");
2411 const char *burst_s = shash_find_data(details, "burst");
2412 const char *priority_s = shash_find_data(details, "priority");
2415 /* min-rate. Don't allow a min-rate below 1500 bytes/s. */
2417 /* min-rate is required. */
2420 hc->min_rate = strtoull(min_rate_s, NULL, 10) / 8;
2421 hc->min_rate = MAX(hc->min_rate, 1500);
2422 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2425 hc->max_rate = (max_rate_s
2426 ? strtoull(max_rate_s, NULL, 10) / 8
2428 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2429 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2433 * According to hints in the documentation that I've read, it is important
2434 * that 'burst' be at least as big as the largest frame that might be
2435 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2436 * but having it a bit too small is a problem. Since netdev_get_mtu()
2437 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2438 * the MTU. We actually add 64, instead of 14, as a guard against
2439 * additional headers get tacked on somewhere that we're not aware of. */
2440 netdev_get_mtu(netdev, &mtu);
2441 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2442 hc->burst = MAX(hc->burst, mtu + 64);
2445 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Queries the kernel for the class identified by handle and parent on netdev
 * and parses the reply into options and stats with htb_parse_tcmsg__().  The
 * reply buffer is deleted after parsing. */
2451 htb_query_class__(const struct netdev *netdev, unsigned int handle,
2452 unsigned int parent, struct htb_class *options,
2453 struct netdev_queue_stats *stats)
2455 struct ofpbuf *reply;
2458 error = tc_query_class(netdev, handle, parent, &reply);
/* queue_id is passed as NULL: the caller already knows which class it asked
 * for. */
2460 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2461 ofpbuf_delete(reply);
/* Installs HTB on netdev: creates the qdisc, creates class 1:fffe under
 * qdisc 1: with the rates parsed from details, and records the resulting
 * max_rate with htb_install__(). */
2467 htb_tc_install(struct netdev *netdev, const struct shash *details)
2471 error = htb_setup_qdisc__(netdev);
2473 struct htb_class hc;
2475 htb_parse_qdisc_details__(netdev, details, &hc);
2476 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2477 tc_make_handle(1, 0), &hc);
2479 htb_install__(netdev, hc.max_rate);
/* Returns the struct htb_class that embeds queue as its tc_queue member.
 * NOTE(review): valid only for queues that really are embedded in a
 * struct htb_class; no check is made here. */
2485 static struct htb_class *
2486 htb_class_cast__(const struct tc_queue *queue)
2488 return CONTAINER_OF(queue, struct htb_class, tc_queue);
/* Records the settings in hc for the queue numbered queue_id in the cached
 * queue map of netdev.  An existing entry is reused; otherwise a new
 * htb_class is allocated and inserted -- the branch keywords between the two
 * paths are not visible here. */
2492 htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2493 const struct htb_class *hc)
2495 struct htb *htb = htb_get__(netdev);
/* Same hash as tc_find_queue() uses, computed once for lookup and insert. */
2496 size_t hash = hash_int(queue_id, 0);
2497 struct tc_queue *queue;
2498 struct htb_class *hcp;
2500 queue = tc_find_queue__(netdev, queue_id, hash);
2502 hcp = htb_class_cast__(queue);
2504 hcp = xmalloc(sizeof *hcp);
2505 queue = &hcp->tc_queue;
2506 queue->queue_id = queue_id;
2507 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
/* Only the four HTB settings are copied; the embedded tc_queue is left
 * alone. */
2510 hcp->min_rate = hc->min_rate;
2511 hcp->max_rate = hc->max_rate;
2512 hcp->burst = hc->burst;
2513 hcp->priority = hc->priority;
/* Rebuilds the cached HTB state of netdev from the kernel: reads class 1:fffe
 * for the qdisc-wide max_rate, installs a struct htb with it, then dumps all
 * classes and records every one that parses as a queue.  nlmsg is unused. */
2517 htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2520 struct nl_dump dump;
2521 struct htb_class hc;
2524 /* Get qdisc options. */
/* NOTE(review): the result of htb_query_class__() is not checked on this
 * line; confirm that hc.max_rate is defined when the query fails. */
2526 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2527 htb = htb_install__(netdev, hc.max_rate);
2530 if (!start_queue_dump(netdev, &dump)) {
2533 while (nl_dump_next(&dump, &msg)) {
2534 unsigned int queue_id;
/* Replies that fail to parse are skipped rather than aborting the load. */
2536 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2537 htb_update_queue__(netdev, queue_id, &hc);
2540 nl_dump_done(&dump);
/* Tears down the htb that embeds 'tc' by removing every htb_class from its
 * queue map.  (Any freeing of the entries or of the htb itself would be on
 * lines not visible in this excerpt.) */
2546 htb_tc_destroy(struct tc *tc)
2548 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2549 struct htb_class *hc, *next;
/* The _SAFE iterator is used because entries are removed while iterating. */
2551 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2552 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Adds a "max-rate" entry to 'details': the htb's stored max_rate times 8,
 * formatted as a decimal string allocated by xasprintf(). */
2560 htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2562 const struct htb *htb = htb_get__(netdev);
2563 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
/* Applies new qdisc-level 'details' to 'netdev': re-parses them, reconfigures
 * class 1:fffe (parent 1:0) accordingly, and stores the new max_rate in the
 * netdev's htb. */
2568 htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2570 struct htb_class hc;
2573 htb_parse_qdisc_details__(netdev, details, &hc);
2574 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2575 tc_make_handle(1, 0), &hc);
2577 htb_get__(netdev)->max_rate = hc.max_rate;
/* Reports the stored settings of 'queue' in 'details' as decimal strings.
 * Rates and burst are multiplied by 8 on output; "max-rate" is added under
 * the min_rate != max_rate test below.  'netdev' is unused. */
2583 htb_class_get(const struct netdev *netdev OVS_UNUSED,
2584 const struct tc_queue *queue, struct shash *details)
2586 const struct htb_class *hc = htb_class_cast__(queue);
2588 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2589 if (hc->min_rate != hc->max_rate) {
2590 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2592 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
/* Priority is reported as-is, without the factor of 8. */
2594 shash_add(details, "priority", xasprintf("%u", hc->priority));
/* Configures queue 'queue_id' on 'netdev' from 'details': parses the details
 * into an htb_class, installs it in the kernel as class 1:(queue_id + 1)
 * under parent 1:fffe, and mirrors it in the local queue map. */
2600 htb_class_set(struct netdev *netdev, unsigned int queue_id,
2601 const struct shash *details)
2603 struct htb_class hc;
2606 error = htb_parse_class_details__(netdev, details, &hc);
2611 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2612 tc_make_handle(1, 0xfffe), &hc);
2617 htb_update_queue__(netdev, queue_id, &hc);
/* Deletes kernel class 1:(queue_id + 1) for 'queue' and removes the queue
 * from the htb's queue map. */
2622 htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2624 struct htb_class *hc = htb_class_cast__(queue);
2625 struct htb *htb = htb_get__(netdev);
2628 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2630 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches statistics for 'queue' into 'stats' by querying class
 * 1:(queue_id + 1) with parent 1:fffe.  No class options are requested. */
2637 htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2638 struct netdev_queue_stats *stats)
2640 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2641 tc_make_handle(1, 0xfffe), NULL, stats);
/* Parses the class message 'nlmsg' and, when its handle is 1:minor with
 * 0 < minor <= HTB_N_QUEUES, invokes 'cb' with queue ID minor - 1, the parsed
 * statistics, and 'aux'.  'netdev' is unused. */
2645 htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2646 const struct ofpbuf *nlmsg,
2647 netdev_dump_queue_stats_cb *cb, void *aux)
2649 struct netdev_queue_stats stats;
2650 unsigned int handle, major, minor;
2653 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2658 major = tc_get_major(handle);
2659 minor = tc_get_minor(handle);
/* Handles outside 1:1 .. 1:HTB_N_QUEUES are not reported to 'cb'. */
2660 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2661 (*cb)(minor - 1, &stats, aux);
/* Operations table that binds the kernel "htb" qdisc to the OVS QoS type
 * "linux-htb".  Only part of the initializer list is visible in this
 * excerpt. */
2666 static const struct tc_ops tc_ops_htb = {
2667 "htb", /* linux_name */
2668 "linux-htb", /* ovs_name */
2669 HTB_N_QUEUES, /* n_queues */
2678 htb_class_get_stats, /* class_get_stats */
2679 htb_class_dump_stats /* class_dump_stats */
2682 /* "linux-hfsc" traffic control class. */
/* Largest class minor number accepted as an HFSC queue (queue ID is
 * minor - 1); also published as 'n_queues' in tc_ops_hfsc. */
2684 #define HFSC_N_QUEUES 0xf000
2692 struct tc_queue tc_queue;
/* Returns the struct hfsc that embeds the 'tc' currently attached to
 * 'netdev''s device.  NOTE(review): nothing visible here checks that the
 * attached 'tc' really is an HFSC one -- callers must guarantee it. */
2697 static struct hfsc *
2698 hfsc_get__(const struct netdev *netdev)
2700 struct netdev_dev_linux *netdev_dev;
2701 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2702 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
/* Returns the struct hfsc_class whose 'tc_queue' member is 'queue'. */
2705 static struct hfsc_class *
2706 hfsc_class_cast__(const struct tc_queue *queue)
2708 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
/* Allocates a struct hfsc, initializes its 'tc' with tc_ops_hfsc, records
 * 'max_rate' in it, and attaches it as the 'tc' of 'netdev''s device. */
2711 static struct hfsc *
2712 hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2714 struct netdev_dev_linux * netdev_dev;
2717 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2718 hfsc = xmalloc(sizeof *hfsc);
2719 tc_init(&hfsc->tc, &tc_ops_hfsc);
2720 hfsc->max_rate = max_rate;
/* Overwrites the device's 'tc' pointer without examining the old value. */
2721 netdev_dev->tc = &hfsc->tc;
/* Records the rates in 'hc' for queue 'queue_id' in 'netdev''s HFSC queue
 * map.  The queue is looked up by ID; the allocation and hmap_insert() below
 * presumably run only when the lookup finds nothing (the branch itself is on
 * lines not visible here -- confirm). */
2727 hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2728 const struct hfsc_class *hc)
2732 struct hfsc_class *hcp;
2733 struct tc_queue *queue;
2735 hfsc = hfsc_get__(netdev);
/* The same hash is used for the lookup and for the insertion. */
2736 hash = hash_int(queue_id, 0);
2738 queue = tc_find_queue__(netdev, queue_id, hash);
2740 hcp = hfsc_class_cast__(queue);
2742 hcp = xmalloc(sizeof *hcp);
2743 queue = &hcp->tc_queue;
2744 queue->queue_id = queue_id;
2745 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
/* Only the two rates are stored for an HFSC class. */
2748 hcp->min_rate = hc->min_rate;
2749 hcp->max_rate = hc->max_rate;
/* Decodes the nested HFSC class options in 'nl_options' into 'class'.
 *
 * The TCA_HFSC_RSC, TCA_HFSC_FSC and TCA_HFSC_USC attributes are read as
 * struct tc_service_curve.  A rate-limited warning is logged when the
 * attributes cannot be parsed, when any curve has a nonzero 'm1' or 'd',
 * when the RSC and FSC 'm2' values differ, or when the RSC 'm2' exceeds the
 * USC 'm2'.  (What is returned in those cases is on lines not visible in
 * this excerpt.)  Otherwise 'class->min_rate' is taken from the FSC 'm2' and
 * 'class->max_rate' from the USC 'm2'. */
2753 hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2755 const struct tc_service_curve *rsc, *fsc, *usc;
2756 static const struct nl_policy tca_hfsc_policy[] = {
2758 .type = NL_A_UNSPEC,
2760 .min_len = sizeof(struct tc_service_curve),
2763 .type = NL_A_UNSPEC,
2765 .min_len = sizeof(struct tc_service_curve),
2768 .type = NL_A_UNSPEC,
2770 .min_len = sizeof(struct tc_service_curve),
2773 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
2775 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
2776 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
2777 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
2781 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
2782 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
2783 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
/* Only curves described by 'm2' alone are accepted. */
2785 if (rsc->m1 != 0 || rsc->d != 0 ||
2786 fsc->m1 != 0 || fsc->d != 0 ||
2787 usc->m1 != 0 || usc->d != 0) {
2788 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2789 "Non-linear service curves are not supported.");
/* hfsc_setup_class__() writes the same curve to RSC and FSC, so a mismatch
 * means the class was not configured by this code. */
2793 if (rsc->m2 != fsc->m2) {
2794 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2795 "Real-time service curves are not supported ");
2799 if (rsc->m2 > usc->m2) {
2800 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2801 "Min-rate service curve is greater than "
2802 "the max-rate service curve.");
2806 class->min_rate = fsc->m2;
2807 class->max_rate = usc->m2;
/* Parses the class message 'tcmsg' with tc_parse_class(), obtaining its
 * handle, nested options, and 'stats'.  When the handle is 1:minor with
 * 0 < minor <= HFSC_N_QUEUES, minor - 1 is stored in '*queue_id'.  The nested
 * options are decoded into 'options' by hfsc_parse_tca_options__().
 *
 * Callers in this file pass NULL for 'queue_id' or 'options'; the guards for
 * those cases are presumably on lines not visible here -- confirm. */
2812 hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2813 struct hfsc_class *options,
2814 struct netdev_queue_stats *stats)
2817 unsigned int handle;
2818 struct nlattr *nl_options;
2820 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2826 unsigned int major, minor;
2828 major = tc_get_major(handle);
2829 minor = tc_get_minor(handle);
2830 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
2831 *queue_id = minor - 1;
2838 error = hfsc_parse_tca_options__(nl_options, options);
/* Queries the kernel for the class identified by 'handle' and 'parent' with
 * tc_query_class(), passes the reply to hfsc_parse_tcmsg__() to fill in
 * 'options' and 'stats', and then deletes the reply buffer. */
2845 hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
2846 unsigned int parent, struct hfsc_class *options,
2847 struct netdev_queue_stats *stats)
2850 struct ofpbuf *reply;
2852 error = tc_query_class(netdev, handle, parent, &reply);
/* No queue ID is requested from the parser here (NULL is passed). */
2857 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
2858 ofpbuf_delete(reply);
2863 hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
2864 struct hfsc_class *class)
2867 const char *max_rate_s;
2869 max_rate_s = shash_find_data(details, "max-rate");
2870 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2875 netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2876 max_rate = netdev_features_to_bps(current) / 8;
2879 class->min_rate = max_rate;
2880 class->max_rate = max_rate;
/* Derives the rates of an HFSC class from the "min-rate" and "max-rate"
 * entries of 'details' and stores them in 'class'.
 *
 * Parsed values are divided by 8.  'min_rate' is raised to at least 1500 and
 * then capped at the qdisc's max_rate; 'max_rate' is raised to at least
 * 'min_rate' and then capped at the qdisc's max_rate.  The handling of a
 * missing "min-rate", and the fallback used when "max-rate" is missing, are
 * on lines not visible in this excerpt. */
2884 hfsc_parse_class_details__(struct netdev *netdev,
2885 const struct shash *details,
2886 struct hfsc_class * class)
2888 const struct hfsc *hfsc;
2889 uint32_t min_rate, max_rate;
2890 const char *min_rate_s, *max_rate_s;
2892 hfsc = hfsc_get__(netdev);
2893 min_rate_s = shash_find_data(details, "min-rate");
2894 max_rate_s = shash_find_data(details, "max-rate");
2900 min_rate = strtoull(min_rate_s, NULL, 10) / 8;
2901 min_rate = MAX(min_rate, 1500);
2902 min_rate = MIN(min_rate, hfsc->max_rate);
2904 max_rate = (max_rate_s
2905 ? strtoull(max_rate_s, NULL, 10) / 8
2907 max_rate = MAX(max_rate, min_rate);
2908 max_rate = MIN(max_rate, hfsc->max_rate);
2910 class->min_rate = min_rate;
2911 class->max_rate = max_rate;
2916 /* Create an HFSC qdisc.
2918 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
/* Any existing root qdisc is removed first with tc_del_qdisc().  The new
 * qdisc is requested with NLM_F_EXCL | NLM_F_CREATE, handle 1:0 and parent
 * TC_H_ROOT; the function's result is that of tc_transact(). */
2920 hfsc_setup_qdisc__(struct netdev * netdev)
2922 struct tcmsg *tcmsg;
2923 struct ofpbuf request;
2924 struct tc_hfsc_qopt opt;
2926 tc_del_qdisc(netdev);
2928 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2929 NLM_F_EXCL | NLM_F_CREATE, &request);
2935 tcmsg->tcm_handle = tc_make_handle(1, 0);
2936 tcmsg->tcm_parent = TC_H_ROOT;
/* Start from all-zero qdisc options. */
2938 memset(&opt, 0, sizeof opt);
2941 nl_msg_put_string(&request, TCA_KIND, "hfsc");
2942 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
2944 return tc_transact(&request, NULL);
2947 /* Create an HFSC class.
2949 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
2950 * sc rate <min_rate> ul rate <max_rate>" */
/* The request is an RTM_NEWTCLASS with NLM_F_CREATE.  The 'min' curve is
 * sent as both TCA_HFSC_RSC and TCA_HFSC_FSC, and the 'max' curve as
 * TCA_HFSC_USC.  A failed transaction is reported through a rate-limited
 * warning that names the device, class, parent and both rates. */
2952 hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
2953 unsigned int parent, struct hfsc_class *class)
2957 struct tcmsg *tcmsg;
2958 struct ofpbuf request;
2959 struct tc_service_curve min, max;
2961 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2967 tcmsg->tcm_handle = handle;
2968 tcmsg->tcm_parent = parent;
2972 min.m2 = class->min_rate;
2976 max.m2 = class->max_rate;
2978 nl_msg_put_string(&request, TCA_KIND, "hfsc");
2979 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2980 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
2981 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
2982 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
2983 nl_msg_end_nested(&request, opt_offset);
2985 error = tc_transact(&request, NULL);
2987 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2988 "min-rate %ubps, max-rate %ubps (%s)",
2989 netdev_get_name(netdev),
2990 tc_get_major(handle), tc_get_minor(handle),
2991 tc_get_major(parent), tc_get_minor(parent),
2992 class->min_rate, class->max_rate, strerror(error));
/* Sets up HFSC on 'netdev' from 'details': creates the qdisc with
 * hfsc_setup_qdisc__(), configures class 1:fffe under parent 1:0 from the
 * parsed qdisc details, and records the resulting max_rate through
 * hfsc_install__().  The error checks between these steps are on lines that
 * are not visible in this excerpt. */
2999 hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3002 struct hfsc_class class;
3004 error = hfsc_setup_qdisc__(netdev);
3010 hfsc_parse_qdisc_details__(netdev, details, &class);
3011 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3012 tc_make_handle(1, 0), &class);
3018 hfsc_install__(netdev, class.max_rate);
/* Rebuilds 'netdev''s in-memory HFSC state from the kernel.  'nlmsg' is
 * unused. */
3023 hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3027 struct nl_dump dump;
3028 struct hfsc_class hc;
/* Class 1:fffe is queried with parent 0; its max_rate seeds the new hfsc. */
3031 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3032 hfsc = hfsc_install__(netdev, hc.max_rate);
/* Walk the queue dump; only messages that parse without error are stored. */
3034 if (!start_queue_dump(netdev, &dump)) {
3038 while (nl_dump_next(&dump, &msg)) {
3039 unsigned int queue_id;
3041 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3042 hfsc_update_queue__(netdev, queue_id, &hc);
3046 nl_dump_done(&dump);
/* Tears down the hfsc that embeds 'tc' by removing every hfsc_class from its
 * queue map.  (Any freeing of the entries or of the hfsc itself would be on
 * lines not visible in this excerpt.) */
3051 hfsc_tc_destroy(struct tc *tc)
3054 struct hfsc_class *hc, *next;
3056 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
/* The _SAFE iterator is used because entries are removed while iterating. */
3058 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3059 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Adds a "max-rate" entry to 'details': the hfsc's stored max_rate times 8,
 * formatted as a decimal string allocated by xasprintf(). */
3068 hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3070 const struct hfsc *hfsc;
3071 hfsc = hfsc_get__(netdev);
3072 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
/* Applies new qdisc-level 'details' to 'netdev': re-parses them, reconfigures
 * class 1:fffe (parent 1:0) accordingly, and stores the new max_rate in the
 * netdev's hfsc. */
3077 hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3080 struct hfsc_class class;
3082 hfsc_parse_qdisc_details__(netdev, details, &class);
3083 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3084 tc_make_handle(1, 0), &class);
3087 hfsc_get__(netdev)->max_rate = class.max_rate;
/* Reports the stored rates of 'queue' in 'details' as decimal strings, each
 * multiplied by 8.  "max-rate" is added under the min_rate != max_rate test
 * below.  'netdev' is unused. */
3094 hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3095 const struct tc_queue *queue, struct shash *details)
3097 const struct hfsc_class *hc;
3099 hc = hfsc_class_cast__(queue);
3100 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3101 if (hc->min_rate != hc->max_rate) {
3102 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
/* Configures queue 'queue_id' on 'netdev' from 'details': parses the details
 * into an hfsc_class, installs it in the kernel as class 1:(queue_id + 1)
 * under parent 1:fffe, and mirrors it in the local queue map. */
3108 hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3109 const struct shash *details)
3112 struct hfsc_class class;
3114 error = hfsc_parse_class_details__(netdev, details, &class);
3119 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3120 tc_make_handle(1, 0xfffe), &class);
3125 hfsc_update_queue__(netdev, queue_id, &class);
/* Deletes kernel class 1:(queue_id + 1) for 'queue' and removes the queue
 * from the hfsc's queue map. */
3130 hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3134 struct hfsc_class *hc;
3136 hc = hfsc_class_cast__(queue);
3137 hfsc = hfsc_get__(netdev);
3139 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3141 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
/* Fetches statistics for 'queue' into 'stats' by querying class
 * 1:(queue_id + 1) with parent 1:fffe.  No class options are requested. */
3148 hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3149 struct netdev_queue_stats *stats)
3151 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3152 tc_make_handle(1, 0xfffe), NULL, stats);
/* Parses the class message 'nlmsg' and, when its handle is 1:minor with
 * 0 < minor <= HFSC_N_QUEUES, invokes 'cb' with queue ID minor - 1, the
 * parsed statistics, and 'aux'.  'netdev' is unused. */
3156 hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3157 const struct ofpbuf *nlmsg,
3158 netdev_dump_queue_stats_cb *cb, void *aux)
3160 struct netdev_queue_stats stats;
3161 unsigned int handle, major, minor;
3164 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3169 major = tc_get_major(handle);
3170 minor = tc_get_minor(handle);
/* Handles outside 1:1 .. 1:HFSC_N_QUEUES are not reported to 'cb'. */
3171 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3172 (*cb)(minor - 1, &stats, aux);
/* Operations table that binds the kernel "hfsc" qdisc to the OVS QoS type
 * "linux-hfsc".  Every callback slot is populated. */
3177 static const struct tc_ops tc_ops_hfsc = {
3178 "hfsc", /* linux_name */
3179 "linux-hfsc", /* ovs_name */
3180 HFSC_N_QUEUES, /* n_queues */
3181 hfsc_tc_install, /* tc_install */
3182 hfsc_tc_load, /* tc_load */
3183 hfsc_tc_destroy, /* tc_destroy */
3184 hfsc_qdisc_get, /* qdisc_get */
3185 hfsc_qdisc_set, /* qdisc_set */
3186 hfsc_class_get, /* class_get */
3187 hfsc_class_set, /* class_set */
3188 hfsc_class_delete, /* class_delete */
3189 hfsc_class_get_stats, /* class_get_stats */
3190 hfsc_class_dump_stats /* class_dump_stats */
3193 /* "linux-default" traffic control class.
3195 * This class represents the default, unnamed Linux qdisc. It corresponds to
3196 * the "" (empty string) QoS type in the OVS database. */
/* Attaches a 'tc' that uses tc_ops_default to 'netdev''s device.
 *
 * 'tc' is a function-level static, so one object is shared by every device
 * that goes through here; the allocation below presumably runs only on first
 * use (the guard is on a line not visible in this excerpt -- confirm). */
3199 default_install__(struct netdev *netdev)
3201 struct netdev_dev_linux *netdev_dev =
3202 netdev_dev_linux_cast(netdev_get_dev(netdev));
3203 static struct tc *tc;
3206 tc = xmalloc(sizeof *tc);
3207 tc_init(tc, &tc_ops_default);
3209 netdev_dev->tc = tc;
/* 'tc_install' callback for the default qdisc: ignores 'details' and simply
 * attaches the default 'tc' via default_install__(). */
3213 default_tc_install(struct netdev *netdev,
3214 const struct shash *details OVS_UNUSED)
3216 default_install__(netdev);
/* 'tc_load' callback for the default qdisc: ignores 'nlmsg' and attaches the
 * default 'tc' via default_install__(). */
3221 default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3223 default_install__(netdev);
/* Operations table for the default Linux qdisc.  It has no kernel qdisc name
 * and none of the callbacks visible below are implemented; some initializers
 * fall on lines not visible in this excerpt. */
3227 static const struct tc_ops tc_ops_default = {
3228 NULL, /* linux_name */
3233 NULL, /* tc_destroy */
3234 NULL, /* qdisc_get */
3235 NULL, /* qdisc_set */
3236 NULL, /* class_get */
3237 NULL, /* class_set */
3238 NULL, /* class_delete */
3239 NULL, /* class_get_stats */
3240 NULL /* class_dump_stats */
3243 /* "linux-other" traffic control class.
/* 'tc_load' callback for "linux-other": ignores 'nlmsg' and attaches a 'tc'
 * that uses tc_ops_other to 'netdev''s device.
 *
 * 'tc' is a function-level static, so one object is shared by every device
 * that goes through here; the allocation below presumably runs only on first
 * use (the guard is on a line not visible in this excerpt -- confirm). */
3248 other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3250 struct netdev_dev_linux *netdev_dev =
3251 netdev_dev_linux_cast(netdev_get_dev(netdev));
3252 static struct tc *tc;
3255 tc = xmalloc(sizeof *tc);
3256 tc_init(tc, &tc_ops_other);
3258 netdev_dev->tc = tc;
/* Operations table for the OVS QoS type "linux-other".  It has no kernel
 * qdisc name and cannot be installed (tc_install is NULL); none of the
 * qdisc/class callbacks visible below are implemented. */
3262 static const struct tc_ops tc_ops_other = {
3263 NULL, /* linux_name */
3264 "linux-other", /* ovs_name */
3266 NULL, /* tc_install */
3268 NULL, /* tc_destroy */
3269 NULL, /* qdisc_get */
3270 NULL, /* qdisc_set */
3271 NULL, /* class_get */
3272 NULL, /* class_set */
3273 NULL, /* class_delete */
3274 NULL, /* class_get_stats */
3275 NULL /* class_dump_stats */
3278 /* Traffic control. */
3280 /* Number of kernel "tc" ticks per second. */
/* Computed from the first three /proc/net/psched fields as a * c / b. */
3281 static double ticks_per_s;
3283 /* Number of kernel "jiffies" per second. This is used for the purpose of
3284 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3285 * one jiffy's worth of data.
3287 * There are two possibilities here:
3289 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3290 * approximate range of 100 to 1024. That means that we really need to
3291 * make sure that the qdisc can buffer that much data.
3293 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3294 * has finely granular timers and there's no need to fudge additional room
3295 * for buffers. (There's no extra effort needed to implement that: the
3296 * large 'buffer_hz' is used as a divisor, so practically any number will
3297 * come out as 0 in the division. Small integer results in the case of
3298 * really high dividends won't have any real effect anyhow.)
/* Divisor used by tc_buffer_per_jiffy(); logged together with ticks_per_s
 * once /proc/net/psched has been parsed. */
3300 static unsigned int buffer_hz;
3302 /* Returns tc handle 'major':'minor'. */
/* 'major' is shifted left by 16 bits before TC_H_MAKE combines it with
 * 'minor'; this is the inverse of tc_get_major() / tc_get_minor(). */
3304 tc_make_handle(unsigned int major, unsigned int minor)
3306 return TC_H_MAKE(major << 16, minor);
3309 /* Returns the major number from 'handle'. */
/* TC_H_MAJ isolates the major part in place, so it is shifted right by 16
 * bits to yield a plain number. */
3311 tc_get_major(unsigned int handle)
3313 return TC_H_MAJ(handle) >> 16;
3316 /* Returns the minor number from 'handle'. */
/* No shift is applied to the TC_H_MIN result. */
3318 tc_get_minor(unsigned int handle)
3320 return TC_H_MIN(handle);
/* Starts a traffic-control Netlink request for 'netdev' in 'request'.
 *
 * Looks up the device's ifindex, initializes 'request' with a 512-byte
 * initial allocation, appends a Netlink header of the given 'type' with
 * NLM_F_REQUEST | 'flags', and appends a zeroed struct tcmsg whose
 * tcm_family is AF_UNSPEC and whose tcm_ifindex is the device's ifindex.
 *
 * Presumably returns the tcmsg for the caller to finish, or NULL when the
 * ifindex lookup fails (the return statements are on lines not visible
 * here -- confirm). */
3323 static struct tcmsg *
3324 tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3325 struct ofpbuf *request)
3327 struct tcmsg *tcmsg;
3331 error = get_ifindex(netdev, &ifindex);
3336 ofpbuf_init(request, 512);
3337 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3338 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3339 tcmsg->tcm_family = AF_UNSPEC;
3340 tcmsg->tcm_ifindex = ifindex;
3341 /* Caller should fill in tcmsg->tcm_handle. */
3342 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' over 'rtnl_sock' with nl_sock_transact(), storing any reply
 * through 'replyp', and then uninitializes 'request' regardless of the
 * result, so callers must not reuse or free it afterwards. */
3348 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3350 int error = nl_sock_transact(rtnl_sock, request, replyp);
3351 ofpbuf_uninit(request);
3358 /* The values in psched are not individually very meaningful, but they are
3359 * important. The tables below show some values seen in the wild.
3363 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3364 * (Before that, there are hints that it was 1000000000.)
3366 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3370 * -----------------------------------
3371 * [1] 000c8000 000f4240 000f4240 00000064
3372 * [2] 000003e8 00000400 000f4240 3b9aca00
3373 * [3] 000003e8 00000400 000f4240 3b9aca00
3374 * [4] 000003e8 00000400 000f4240 00000064
3375 * [5] 000003e8 00000040 000f4240 3b9aca00
3376 * [6] 000003e8 00000040 000f4240 000000f9
3378 * a b c d ticks_per_s buffer_hz
3379 * ------- --------- ---------- ------------- ----------- -------------
3380 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3381 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3382 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3383 * [4] 1,000 1,024 1,000,000 100 976,562 100
3384 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3385 * [6] 1,000 64 1,000,000 249 15,625,000 249
3387 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3388 * [2] 2.6.26-1-686-bigmem from Debian lenny
3389 * [3] 2.6.26-2-sparc64 from Debian lenny
3390 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3391 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3392 * [6] 2.6.34 from kernel.org on KVM
/* Reads the four hexadecimal fields 'a', 'b', 'c', 'd' from /proc/net/psched
 * and derives 'ticks_per_s' from them.  Open and read failures, and parameters
 * judged invalid or unexpected, are logged as warnings; the conditions for the
 * latter two and the assignment of 'buffer_hz' are on lines not visible in
 * this excerpt. */
3394 static const char fn[] = "/proc/net/psched";
3395 unsigned int a, b, c, d;
3401 stream = fopen(fn, "r");
3403 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
/* All four fields must be present for the file to be usable. */
3407 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3408 VLOG_WARN("%s: read failed", fn);
3412 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3416 VLOG_WARN("%s: invalid scheduler parameters", fn);
/* The cast makes the whole expression floating-point, so a * c cannot wrap. */
3420 ticks_per_s = (double) a * c / b;
3424 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3427 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3430 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3431 * rate of 'rate' bytes per second. */
/* NOTE(review): 'rate * ticks' is evaluated in unsigned int before the
 * division by the double 'ticks_per_s', so a large product wraps around
 * first -- confirm whether callers can reach that range. */
3433 tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3438 return (rate * ticks) / ticks_per_s;
3441 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
3442 * rate of 'rate' bytes per second. */
/* A 'rate' of 0 yields 0 instead of dividing by zero.  'ticks_per_s' is
 * truncated to an integer by the cast, and the multiplication is carried out
 * in unsigned long long. */
3444 tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3449 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3452 /* Returns the number of bytes that need to be reserved for qdisc buffering at
3453 * a transmission rate of 'rate' bytes per second. */
/* Integer division: a 'buffer_hz' larger than 'rate' gives 0.
 * NOTE(review): this relies on 'buffer_hz' being nonzero by the time the
 * division runs; whatever ensures that is on lines not visible here. */
3455 tc_buffer_per_jiffy(unsigned int rate)
3460 return rate / buffer_hz;
3463 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3464 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3465 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3466 * stores NULL into it if it is absent.
3468 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3471 * Returns 0 if successful, otherwise a positive errno value. */
/* Attributes are parsed starting just past the Netlink header and the struct
 * tcmsg.  TCA_KIND is mandatory and TCA_OPTIONS is optional; a message that
 * does not satisfy the policy is reported with a rate-limited warning. */
3473 tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3474 struct nlattr **options)
3476 static const struct nl_policy tca_policy[] = {
3477 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3478 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3480 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3482 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3483 tca_policy, ta, ARRAY_SIZE(ta))) {
3484 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3489 *kind = nl_attr_get_string(ta[TCA_KIND]);
3493 *options = ta[TCA_OPTIONS];
3508 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3509 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3510 * into '*options', and its queue statistics into '*stats'. Any of the output
3511 * arguments may be null.
3513 * Returns 0 if successful, otherwise a positive errno value. */
/* NOTE(review): the parameter is actually named 'handlep', and what is stored
 * through it is the message's whole tcm_handle, not just the minor number;
 * callers split it with tc_get_major() / tc_get_minor().
 *
 * Both TCA_OPTIONS and TCA_STATS2 are mandatory here.  From the nested stats,
 * 'tx_bytes' and 'tx_packets' come from TCA_STATS_BASIC and 'tx_errors' from
 * the 'drops' field of TCA_STATS_QUEUE. */
3515 tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3516 struct nlattr **options, struct netdev_queue_stats *stats)
3518 static const struct nl_policy tca_policy[] = {
3519 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3520 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3522 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3524 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3525 tca_policy, ta, ARRAY_SIZE(ta))) {
3526 VLOG_WARN_RL(&rl, "failed to parse class message");
3531 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3532 *handlep = tc->tcm_handle;
3536 *options = ta[TCA_OPTIONS];
3540 const struct gnet_stats_queue *gsq;
3541 struct gnet_stats_basic gsb;
3543 static const struct nl_policy stats_policy[] = {
3544 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3545 .min_len = sizeof gsb },
3546 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3547 .min_len = sizeof *gsq },
3549 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3551 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3552 sa, ARRAY_SIZE(sa))) {
3553 VLOG_WARN_RL(&rl, "failed to parse class stats");
3557 /* Alignment issues screw up the length of struct gnet_stats_basic on
3558 * some arch/bitsize combinations. Newer versions of Linux have a
3559 * struct gnet_stats_basic_packed, but we can't depend on that. The
3560 * easiest thing to do is just to make a copy. */
/* The copy is zero-filled first and bounded by the smaller of the attribute
 * size and sizeof gsb, so it can neither overrun nor leave stale bytes. */
3561 memset(&gsb, 0, sizeof gsb);
3562 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3563 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3564 stats->tx_bytes = gsb.bytes;
3565 stats->tx_packets = gsb.packets;
3567 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3568 stats->tx_errors = gsq->drops;
/* Presumably the failure path: '*stats' is cleared to all zeroes (the
 * surrounding control flow is not visible here -- confirm). */
3578 memset(stats, 0, sizeof *stats);
3583 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
/* Sends an RTM_GETTCLASS request (with NLM_F_ECHO) for class 'handle' under
 * 'parent' on 'netdev' and stores the kernel's reply through 'replyp'.  A
 * failed transaction is reported with a rate-limited warning that names the
 * device, class and parent. */
3586 tc_query_class(const struct netdev *netdev,
3587 unsigned int handle, unsigned int parent,
3588 struct ofpbuf **replyp)
3590 struct ofpbuf request;
3591 struct tcmsg *tcmsg;
3594 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3598 tcmsg->tcm_handle = handle;
3599 tcmsg->tcm_parent = parent;
3601 error = tc_transact(&request, replyp);
3603 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3604 netdev_get_name(netdev),
3605 tc_get_major(handle), tc_get_minor(handle),
3606 tc_get_major(parent), tc_get_minor(parent),
3612 /* Equivalent to "tc class del dev <name> handle <handle>". */
/* Sends an RTM_DELTCLASS request with no extra flags and a parent of 0, and
 * expects no reply payload.  A failed transaction is reported with a
 * rate-limited warning that names the device and class. */
3614 tc_delete_class(const struct netdev *netdev, unsigned int handle)
3616 struct ofpbuf request;
3617 struct tcmsg *tcmsg;
3620 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3624 tcmsg->tcm_handle = handle;
3625 tcmsg->tcm_parent = 0;
3627 error = tc_transact(&request, NULL);
3629 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3630 netdev_get_name(netdev),
3631 tc_get_major(handle), tc_get_minor(handle),
3637 /* Equivalent to "tc qdisc del dev <name> root". */
/* Sends an RTM_DELQDISC request for handle 1:0 under TC_H_ROOT.  When the
 * deletion ends without error and the device has a cached 'tc', that 'tc' is
 * destroyed through its 'tc_destroy' callback (if it has one) and the
 * device's 'tc' pointer is cleared, so the next query re-reads the kernel
 * state. */
3639 tc_del_qdisc(struct netdev *netdev)
3641 struct netdev_dev_linux *netdev_dev =
3642 netdev_dev_linux_cast(netdev_get_dev(netdev));
3643 struct ofpbuf request;
3644 struct tcmsg *tcmsg;
3647 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
3651 tcmsg->tcm_handle = tc_make_handle(1, 0);
3652 tcmsg->tcm_parent = TC_H_ROOT;
3654 error = tc_transact(&request, NULL);
3655 if (error == EINVAL) {
3656 /* EINVAL probably means that the default qdisc was in use, in which
3657 * case we've accomplished our purpose. */
3660 if (!error && netdev_dev->tc) {
/* 'tc_destroy' is optional: the default and "other" tables leave it NULL. */
3661 if (netdev_dev->tc->ops->tc_destroy) {
3662 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3664 netdev_dev->tc = NULL;
3669 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
3670 * kernel to determine what they are. Returns 0 if successful, otherwise a
3671 * positive errno value. */
/* The device's cached 'tc' is consulted first.  Otherwise the qdisc with
 * handle 1:0 is queried and an ops table is chosen from the outcome: the
 * table matching the reported kind, tc_ops_other for an unparsable or unknown
 * kind or for any other query failure, and tc_ops_default for ENOENT.  The
 * chosen table's 'tc_load' callback then attaches a 'tc' to the device. */
3673 tc_query_qdisc(const struct netdev *netdev)
3675 struct netdev_dev_linux *netdev_dev =
3676 netdev_dev_linux_cast(netdev_get_dev(netdev));
3677 struct ofpbuf request, *qdisc;
3678 const struct tc_ops *ops;
3679 struct tcmsg *tcmsg;
3683 if (netdev_dev->tc) {
3687 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3688 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3689 * 2.6.35 without that fix backported to it.
3691 * To avoid the OOPS, we must not make a request that would attempt to dump
3692 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3693 * few others. There are a few ways that I can see to do this, but most of
3694 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3695 * technique chosen here is to assume that any non-default qdisc that we
3696 * create will have a class with handle 1:0. The built-in qdiscs only have
3697 * a class with handle 0:0.
3699 * We could check for Linux 2.6.35+ and use a more straightforward method
3701 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
3705 tcmsg->tcm_handle = tc_make_handle(1, 0);
3706 tcmsg->tcm_parent = 0;
3708 /* Figure out what tc class to instantiate. */
3709 error = tc_transact(&request, &qdisc);
3713 error = tc_parse_qdisc(qdisc, &kind, NULL);
3715 ops = &tc_ops_other;
3717 ops = tc_lookup_linux_name(kind);
3719 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3720 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3722 ops = &tc_ops_other;
3725 } else if (error == ENOENT) {
3726 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3727 * other entity that doesn't have a handle 1:0. We will assume
3728 * that it's the system default qdisc. */
3729 ops = &tc_ops_default;
3732 /* Who knows? Maybe the device got deleted. */
3733 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3734 netdev_get_name(netdev), strerror(error));
3735 ops = &tc_ops_other;
3738 /* Instantiate it. */
/* The const qualifier is cast away because 'tc_load' takes a non-const
 * netdev.  The assertion ties success of 'tc_load' to the device having a
 * 'tc' attached afterwards. */
3739 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
3740 assert((load_error == 0) == (netdev_dev->tc != NULL));
3741 ofpbuf_delete(qdisc);
/* An error from the query takes precedence over one from 'tc_load'. */
3743 return error ? error : load_error;
3746 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3747 approximate the time to transmit packets of various lengths. For an MTU of
3748 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3749 represents two possible packet lengths; for an MTU of 513 through 1024, four
3750 possible lengths; and so on.
3752 Returns, for the specified 'mtu', the number of bits that packet lengths
3753 need to be shifted right to fit within such a 256-entry table. */
/* 'mtu' may first be replaced by ETH_PAYLOAD_MAX (the condition is on a line
 * not visible here); the Ethernet and VLAN header lengths are then added.
 * 'cell_log' counts the loop iterations performed while 'mtu' is still at
 * least 256; the loop body is likewise not visible in this excerpt. */
3755 tc_calc_cell_log(unsigned int mtu)
3760 mtu = ETH_PAYLOAD_MAX;
3762 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
3764 for (cell_log = 0; mtu >= 256; cell_log++) {
3771 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
/* Prepares 'rate' for use in a tc request: zeroes the whole structure, sets
 * 'cell_log' from 'mtu' via tc_calc_cell_log(), and sets 'mpu' to
 * ETH_TOTAL_MIN.  The line that stores 'Bps' is not visible in this
 * excerpt. */
3774 tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
3776 memset(rate, 0, sizeof *rate);
3777 rate->cell_log = tc_calc_cell_log(mtu);
/* These two fields stay at the zero written by the memset() above. */
3778 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3779 /* rate->cell_align = 0; */ /* distro headers. */
3780 rate->mpu = ETH_TOTAL_MIN;
3784 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
3785 * attribute of the specified "type".
3787 * See tc_calc_cell_log() above for a description of "rtab"s. */
/* The attribute payload is TC_RTAB_SIZE bytes, reserved uninitialized and
 * then filled entry by entry.  Entry 'i' holds the tick count for a packet of
 * (i + 1) << cell_log bytes, with sizes below 'rate->mpu' rounded up to
 * 'rate->mpu'. */
3789 tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
3794 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
3795 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
3796 unsigned packet_size = (i + 1) << rate->cell_log;
3797 if (packet_size < rate->mpu) {
3798 packet_size = rate->mpu;
3800 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
3804 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
3805 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
3806 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
/* Converts a burst size to ticks at 'Bps' bytes per second.  The burst used
 * is the larger of 'burst_bytes' and a floor of one jiffy's worth of buffering
 * at 'Bps' plus 'mtu'. */
3809 tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
3811 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
3812 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
3816 /* Utility functions. */
/* Retrieves the statistics of the interface with index 'ifindex' into 'stats'
 * by sending an RTM_GETLINK request over 'rtnl_sock' and copying the fields
 * of the IFLA_STATS attribute (a struct rtnl_link_stats) from the reply.  The
 * reply buffer is deleted on every visible path, including the parse-failure
 * and missing-stats paths. */
3819 get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
3821 /* Policy for RTNLGRP_LINK messages.
3823 * There are *many* more fields in these messages, but currently we only
3824 * care about these fields. */
3825 static const struct nl_policy rtnlgrp_link_policy[] = {
3826 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
3827 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
3828 .min_len = sizeof(struct rtnl_link_stats) },
3831 struct ofpbuf request;
3832 struct ofpbuf *reply;
3833 struct ifinfomsg *ifi;
3834 const struct rtnl_link_stats *rtnl_stats;
3835 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
3838 ofpbuf_init(&request, 0);
3839 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
3840 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
3841 ifi->ifi_family = PF_UNSPEC;
3842 ifi->ifi_index = ifindex;
3843 error = nl_sock_transact(rtnl_sock, &request, &reply);
3844 ofpbuf_uninit(&request);
3849 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
3850 rtnlgrp_link_policy,
3851 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
3852 ofpbuf_delete(reply);
/* IFLA_STATS is optional in the policy, so its absence is checked for
 * separately. */
3856 if (!attrs[IFLA_STATS]) {
3857 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
3858 ofpbuf_delete(reply);
/* Field-by-field copy: every counter keeps its name. */
3862 rtnl_stats = nl_attr_get(attrs[IFLA_STATS]);
3863 stats->rx_packets = rtnl_stats->rx_packets;
3864 stats->tx_packets = rtnl_stats->tx_packets;
3865 stats->rx_bytes = rtnl_stats->rx_bytes;
3866 stats->tx_bytes = rtnl_stats->tx_bytes;
3867 stats->rx_errors = rtnl_stats->rx_errors;
3868 stats->tx_errors = rtnl_stats->tx_errors;
3869 stats->rx_dropped = rtnl_stats->rx_dropped;
3870 stats->tx_dropped = rtnl_stats->tx_dropped;
3871 stats->multicast = rtnl_stats->multicast;
3872 stats->collisions = rtnl_stats->collisions;
3873 stats->rx_length_errors = rtnl_stats->rx_length_errors;
3874 stats->rx_over_errors = rtnl_stats->rx_over_errors;
3875 stats->rx_crc_errors = rtnl_stats->rx_crc_errors;
3876 stats->rx_frame_errors = rtnl_stats->rx_frame_errors;
3877 stats->rx_fifo_errors = rtnl_stats->rx_fifo_errors;
3878 stats->rx_missed_errors = rtnl_stats->rx_missed_errors;
3879 stats->tx_aborted_errors = rtnl_stats->tx_aborted_errors;
3880 stats->tx_carrier_errors = rtnl_stats->tx_carrier_errors;
3881 stats->tx_fifo_errors = rtnl_stats->tx_fifo_errors;
3882 stats->tx_heartbeat_errors = rtnl_stats->tx_heartbeat_errors;
3883 stats->tx_window_errors = rtnl_stats->tx_window_errors;
3885 ofpbuf_delete(reply);
/* Looks up the statistics line for 'netdev_name' in /proc/net/dev and stores
 * the parsed counters in '*stats'.
 *
 * The file is read one line at a time.  Each line is scanned with a format
 * built from two groups of seven 64-bit unsigned conversions, each group
 * followed by one skipped "%*u" field.  A line counts as parsed only when the
 * scan yields exactly 15 conversions (presumably the device name plus the 14
 * counters -- the name conversion is not visible in this excerpt; confirm).
 *
 * Once the line whose device name equals 'netdev_name' is found, the counters
 * listed at the end of this function are set to UINT64_MAX instead of being
 * scanned (presumably meaning "not available" -- confirm against the
 * definition of struct netdev_stats).
 *
 * Failure to open the file, a line that does not parse, and the absence of
 * any line for 'netdev_name' are each reported with a rate-limited warning.
 * The values returned in those cases are not visible in this excerpt. */
3891 get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
3893 static const char fn[] = "/proc/net/dev";
/* Open the statistics file read-only. */
3898 stream = fopen(fn, "r");
/* The open failed: log the file name and the reason taken from errno. */
3900 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
/* Walk the file line by line; 'line' is a fixed-size buffer, so fgets() never
 * writes more than 'sizeof line' bytes per call. */
3905 while (fgets(line, sizeof line, stream)) {
/* Shorthand for a single 64-bit unsigned scanf conversion. */
3908 #define X64 "%"SCNu64
3911 X64 X64 X64 X64 X64 X64 X64 "%*u"
3912 X64 X64 X64 X64 X64 X64 X64 "%*u",
3918 &stats->rx_fifo_errors,
3919 &stats->rx_frame_errors,
3925 &stats->tx_fifo_errors,
3927 &stats->tx_carrier_errors) != 15) {
/* Fewer (or more) than 15 conversions: report the offending line number. */
3928 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
3929 } else if (!strcmp(devname, netdev_name)) {
/* This is the requested device.  The counters below are not among the
 * visible scan targets, so they are filled with a sentinel instead. */
3930 stats->rx_length_errors = UINT64_MAX;
3931 stats->rx_over_errors = UINT64_MAX;
3932 stats->rx_crc_errors = UINT64_MAX;
3933 stats->rx_missed_errors = UINT64_MAX;
3934 stats->tx_aborted_errors = UINT64_MAX;
3935 stats->tx_heartbeat_errors = UINT64_MAX;
3936 stats->tx_window_errors = UINT64_MAX;
/* Presumably reached only when no line matched 'netdev_name' -- confirm. */
3942 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
/* Fetches the interface flags of 'netdev' with the SIOCGIFFLAGS ioctl, issued
 * through netdev_linux_do_ioctl() under the device's name, and stores
 * 'ifr.ifr_flags' in '*flags'.
 *
 * NOTE(review): no check of 'error' is visible between the ioctl and the
 * store into '*flags'.  If the ioctl fails, 'ifr.ifr_flags' may hold an
 * indeterminate value that is then handed to the caller -- confirm, and
 * consider storing the flags only on success. */
3948 get_flags(const struct netdev *netdev, int *flags)
3953 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
3955 *flags = ifr.ifr_flags;
/* Applies 'flags' to the interface behind 'netdev' using the SIOCSIFFLAGS
 * ioctl and returns whatever netdev_linux_do_ioctl() reports.
 *
 * 'flags' is written to the request as the complete new flag word; this
 * function does not read the current flags first. */
3960 set_flags(struct netdev *netdev, int flags)
3964 ifr.ifr_flags = flags;
3965 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
/* Queries the kernel for the interface index of the device named
 * 'netdev_name' via ioctl(SIOCGIFINDEX) on 'af_inet_sock'.
 *
 * Returns 'ifr.ifr_ifindex' when the ioctl succeeds.  A failed ioctl is
 * logged as a rate-limited warning; the value returned in that case is not
 * visible in this excerpt.
 *
 * NOTE(review): strncpy() leaves 'ifr.ifr_name' without a terminating NUL
 * when 'netdev_name' is at least 'sizeof ifr.ifr_name' bytes long -- confirm
 * that such names cannot reach this function.  Also, no zeroing of 'ifr' is
 * visible here, unlike in get_etheraddr() and set_etheraddr(). */
3970 do_get_ifindex(const char *netdev_name)
3974 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
/* Count every index lookup for coverage statistics. */
3975 COVERAGE_INC(netdev_get_ifindex);
3976 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
3977 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
3978 netdev_name, strerror(errno));
/* Success: the kernel filled in the index. */
3981 return ifr.ifr_ifindex;
/* Stores the interface index of 'netdev_' in '*ifindexp', using the value
 * cached in the underlying netdev_dev_linux when one is present.
 *
 * The cache is treated as populated when VALID_IFINDEX is set in
 * 'cache_valid'.  Otherwise the index is looked up by name with
 * do_get_ifindex() and recorded in 'netdev_dev->ifindex' together with that
 * flag.  How a failed lookup is handled between the call and the cache update
 * is not visible in this excerpt. */
3985 get_ifindex(const struct netdev *netdev_, int *ifindexp)
3987 struct netdev_dev_linux *netdev_dev =
3988 netdev_dev_linux_cast(netdev_get_dev(netdev_));
/* Only ask the kernel when no index has been cached yet. */
3990 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
3991 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
/* Remember the result so later calls skip the ioctl. */
3995 netdev_dev->cache_valid |= VALID_IFINDEX;
3996 netdev_dev->ifindex = ifindex;
/* Hand out the cached value. */
3998 *ifindexp = netdev_dev->ifindex;
/* Reads the hardware address of the device named 'netdev_name' with
 * ioctl(SIOCGIFHWADDR) on 'af_inet_sock' and copies the first ETH_ADDR_LEN
 * bytes of the returned address into 'ea'.
 *
 * A failed ioctl is logged with VLOG_ERR (not rate-limited); the value
 * returned in that case is not visible in this excerpt.  An address family
 * other than AF_UNSPEC or ARPHRD_ETHER is reported with a warning.
 *
 * NOTE(review): strncpy() does not NUL-terminate 'ifr.ifr_name' when
 * 'netdev_name' is at least 'sizeof ifr.ifr_name' bytes long -- confirm that
 * such names cannot reach this function. */
4003 get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
/* Start from an all-zero request so no stale bytes reach the kernel. */
4008 memset(&ifr, 0, sizeof ifr);
4009 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4010 COVERAGE_INC(netdev_get_hwaddr);
4011 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4012 VLOG_ERR("ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4013 netdev_name, strerror(errno));
/* Check which kind of hardware address the kernel reported. */
4016 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4017 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4018 VLOG_WARN("%s device has unknown hardware address family %d",
4019 netdev_name, hwaddr_family);
/* Copy out exactly one Ethernet address worth of bytes. */
4021 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
/* Sets the hardware address of the device named 'netdev_name' to 'mac' with
 * ioctl(SIOCSIFHWADDR) on 'af_inet_sock'.
 *
 * 'hwaddr_family' is placed in the request's 'sa_family' field unchanged.
 * A failed ioctl is logged with VLOG_ERR (not rate-limited); the values
 * returned on success and failure are not visible in this excerpt.
 *
 * NOTE(review): strncpy() does not NUL-terminate 'ifr.ifr_name' when
 * 'netdev_name' is at least 'sizeof ifr.ifr_name' bytes long -- confirm that
 * such names cannot reach this function. */
4026 set_etheraddr(const char *netdev_name, int hwaddr_family,
4027 const uint8_t mac[ETH_ADDR_LEN])
/* Build the request from an all-zero ifreq: name, family, then address. */
4031 memset(&ifr, 0, sizeof ifr);
4032 strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4033 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4034 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4035 COVERAGE_INC(netdev_set_hwaddr);
4036 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4037 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4038 netdev_name, strerror(errno));
/* Issues an ethtool request for the device named 'name' through
 * ioctl(SIOCETHTOOL) on 'af_inet_sock', with 'ecmd' attached to the request
 * as its data pointer.
 *
 * In the visible code 'cmd_name' is used only to label the warning logged on
 * failure.  How 'cmd' is applied to 'ecmd', and the values returned on
 * success and failure, are not visible in this excerpt.
 *
 * A failure with errno other than EOPNOTSUPP is logged as a rate-limited
 * warning; EOPNOTSUPP is deliberately not logged.
 *
 * NOTE(review): strncpy() does not NUL-terminate 'ifr.ifr_name' when 'name'
 * is at least 'sizeof ifr.ifr_name' bytes long -- confirm that such names
 * cannot reach this function. */
4045 netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4046 int cmd, const char *cmd_name)
/* Zeroed request carrying the device name and a pointer to the command. */
4050 memset(&ifr, 0, sizeof ifr);
4051 strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4052 ifr.ifr_data = (caddr_t) ecmd;
4055 COVERAGE_INC(netdev_ethtool);
4056 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
/* The ioctl failed: only unexpected errors are worth a log line. */
4059 if (errno != EOPNOTSUPP) {
4060 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4061 "failed: %s", cmd_name, name, strerror(errno));
4063 /* The device doesn't support this operation. That's pretty
4064 * common, so there's no point in logging anything. */
/* Runs ioctl 'cmd' on 'af_inet_sock' for the device named 'name', using the
 * caller-provided request '*ifr'.
 *
 * Only 'ifr->ifr_name' is written here; the request is not zeroed, so any
 * other input fields must be set by the caller before the call (set_flags()
 * fills in 'ifr_flags' first, for example).  'cmd_name' is used only in the
 * debug-level, rate-limited message logged when the ioctl returns -1.  The
 * values returned on success and failure are not visible in this excerpt.
 *
 * NOTE(review): strncpy() does not NUL-terminate 'ifr->ifr_name' when 'name'
 * is at least 'sizeof ifr->ifr_name' bytes long -- confirm that such names
 * cannot reach this function. */
4071 netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4072 const char *cmd_name)
4074 strncpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4075 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4076 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4084 netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4085 int cmd, const char *cmd_name)
4090 ifr.ifr_addr.sa_family = AF_INET;
4091 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4093 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4094 *ip = sin->sin_addr;