#include <linux/ip.h>
#include <linux/types.h>
#include <linux/ethtool.h>
+#include <linux/pkt_sched.h>
#include <linux/rtnetlink.h>
#include <linux/sockios.h>
#include <linux/version.h>
VALID_IN6 = 1 << 3,
VALID_MTU = 1 << 4,
VALID_CARRIER = 1 << 5,
- VALID_IS_PSEUDO = 1 << 6 /* Represents is_internal and is_tap. */
+ VALID_IS_PSEUDO = 1 << 6, /* Represents is_internal and is_tap. */
+ VALID_POLICING = 1 << 7
};
struct tap_state {
int fd;
-};
-
-struct patch_state {
- char *peer;
+ bool opened;
};
struct netdev_dev_linux {
int carrier;
bool is_internal; /* Is this an openvswitch internal device? */
bool is_tap; /* Is this a tuntap device? */
+ uint32_t kbits_rate; /* Policing data. */
+ uint32_t kbits_burst;
union {
struct tap_state tap;
- struct patch_state patch;
} state;
};
}
}
-static int
-if_up(const char *name)
-{
- struct ifreq ifr;
-
- strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
- ifr.ifr_flags = IFF_UP;
-
- if (ioctl(af_inet_sock, SIOCSIFFLAGS, &ifr) == -1) {
- VLOG_DBG_RL(&rl, "%s: failed to bring device up: %s",
- name, strerror(errno));
- return errno;
- }
-
- return 0;
-}
-
-/* A veth may be created using the 'command' "+<name>,<peer>". A veth may
- * be destroyed by using the 'command' "-<name>", where <name> can be
- * either side of the device.
- */
-static int
-modify_veth(const char *format, ...)
-{
- FILE *veth_file;
- va_list args;
- int retval;
-
- veth_file = fopen("/sys/class/net/veth_pairs", "w");
- if (!veth_file) {
- VLOG_WARN_RL(&rl, "could not open veth device. Are you running a "
- "supported XenServer with the kernel module loaded?");
- return ENODEV;
- }
- setvbuf(veth_file, NULL, _IONBF, 0);
-
- va_start(args, format);
- retval = vfprintf(veth_file, format, args);
- va_end(args);
-
- fclose(veth_file);
- if (retval < 0) {
- VLOG_WARN_RL(&rl, "could not destroy patch: %s", strerror(errno));
- return errno;
- }
-
- return 0;
-}
-
-static int
-create_patch(const char *name, const char *peer)
-{
- int retval;
- struct netdev_dev *peer_nd;
-
-
- /* Only create the veth if the peer didn't already do it. */
- peer_nd = netdev_dev_from_name(peer);
- if (peer_nd) {
- if (!strcmp("patch", netdev_dev_get_type(peer_nd))) {
- struct netdev_dev_linux *ndl = netdev_dev_linux_cast(peer_nd);
- if (!strcmp(name, ndl->state.patch.peer)) {
- return 0;
- } else {
- VLOG_WARN_RL(&rl, "peer '%s' already paired with '%s'",
- peer, ndl->state.patch.peer);
- return EINVAL;
- }
- } else {
- VLOG_WARN_RL(&rl, "peer '%s' exists and is not a patch", peer);
- return EINVAL;
- }
- }
-
- retval = modify_veth("+%s,%s", name, peer);
- if (retval) {
- return retval;
- }
-
- retval = if_up(name);
- if (retval) {
- return retval;
- }
-
- retval = if_up(peer);
- if (retval) {
- return retval;
- }
-
- return 0;
-}
-
-static int
-setup_patch(const char *name, const struct shash *args, char **peer_)
-{
- const char *peer;
-
- peer = shash_find_data(args, "peer");
- if (!peer) {
- VLOG_WARN("patch type requires valid 'peer' argument");
- return EINVAL;
- }
-
- if (shash_count(args) > 1) {
- VLOG_WARN("patch type takes only a 'peer' argument");
- return EINVAL;
- }
-
- if (strlen(peer) >= IFNAMSIZ) {
- VLOG_WARN_RL(&rl, "patch 'peer' arg too long");
- return EINVAL;
- }
-
- *peer_ = xstrdup(peer);
- return create_patch(name, peer);
-}
-
/* Creates the netdev device of 'type' with 'name'. */
static int
netdev_linux_create_system(const char *name, const char *type OVS_UNUSED,
return error;
}
-static int
-netdev_linux_create_patch(const char *name, const char *type OVS_UNUSED,
- const struct shash *args, struct netdev_dev **netdev_devp)
-{
- struct netdev_dev_linux *netdev_dev;
- char *peer = NULL;
- int error;
-
- error = setup_patch(name, args, &peer);
- if (error) {
- free(peer);
- return error;
- }
-
- netdev_dev = xzalloc(sizeof *netdev_dev);
- netdev_dev->state.patch.peer = peer;
- netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_patch_class);
- *netdev_devp = &netdev_dev->netdev_dev;
-
- return 0;
-}
-
static void
destroy_tap(struct netdev_dev_linux *netdev_dev)
{
}
}
-static void
-destroy_patch(struct netdev_dev_linux *netdev_dev)
-{
- const char *name = netdev_dev_get_name(&netdev_dev->netdev_dev);
- struct patch_state *state = &netdev_dev->state.patch;
-
- /* Only destroy veth if 'peer' doesn't exist as an existing netdev. */
- if (!netdev_dev_from_name(state->peer)) {
- modify_veth("-%s", name);
- }
- free(state->peer);
-}
-
/* Destroys the netdev device 'netdev_dev_'. */
static void
netdev_linux_destroy(struct netdev_dev *netdev_dev_)
}
} else if (!strcmp(type, "tap")) {
destroy_tap(netdev_dev);
- } else if (!strcmp(type, "patch")) {
- destroy_patch(netdev_dev);
}
free(netdev_dev);
goto error;
}
- if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap")) {
+ if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
+ !netdev_dev->state.tap.opened) {
+
+ /* We assume that the first user of the tap device is the primary user
+ * and give them the tap FD. Subsequent users probably just expect
+ * this to be a system device so open it normally to avoid send/receive
+ * directions appearing to be reversed. */
netdev->fd = netdev_dev->state.tap.fd;
+ netdev_dev->state.tap.opened = true;
} else if (ethertype != NETDEV_ETH_TYPE_NONE) {
struct sockaddr_ll sll;
int protocol;
}
}
+/* Exchanges the 64-bit values stored at '*a' and '*b'.
+ *
+ * A temporary is used instead of the XOR trick so that the call is harmless
+ * when 'a' and 'b' point to the same object (an XOR swap would zero it). */
+static void
+swap_uint64(uint64_t *a, uint64_t *b)
+{
+    uint64_t tmp = *a;
+
+    *a = *b;
+    *b = tmp;
+}
+
/* Retrieves current device stats for 'netdev'.
*
* XXX All of the members of struct netdev_stats are 64 bits wide, but on
netdev_dev_linux_cast(netdev_get_dev(netdev_));
static int use_netlink_stats = -1;
int error;
- struct netdev_stats raw_stats;
- struct netdev_stats *collect_stats = stats;
COVERAGE_INC(netdev_get_stats);
- netdev_linux_update_is_pseudo(netdev_dev);
- if (netdev_dev->is_internal) {
- collect_stats = &raw_stats;
- }
-
if (use_netlink_stats < 0) {
use_netlink_stats = check_for_working_netlink_stats();
}
error = get_ifindex(netdev_, &ifindex);
if (!error) {
- error = get_stats_via_netlink(ifindex, collect_stats);
+ error = get_stats_via_netlink(ifindex, stats);
}
} else {
- error = get_stats_via_proc(netdev_get_name(netdev_), collect_stats);
+ error = get_stats_via_proc(netdev_get_name(netdev_), stats);
}
/* If this port is an internal port then the transmit and receive stats
* will appear to be swapped relative to the other ports since we are the
* one sending the data, not a remote computer. For consistency, we swap
* them back here. */
+ netdev_linux_update_is_pseudo(netdev_dev);
if (!error && (netdev_dev->is_internal || netdev_dev->is_tap)) {
- stats->rx_packets = raw_stats.tx_packets;
- stats->tx_packets = raw_stats.rx_packets;
- stats->rx_bytes = raw_stats.tx_bytes;
- stats->tx_bytes = raw_stats.rx_bytes;
- stats->rx_errors = raw_stats.tx_errors;
- stats->tx_errors = raw_stats.rx_errors;
- stats->rx_dropped = raw_stats.tx_dropped;
- stats->tx_dropped = raw_stats.rx_dropped;
- stats->multicast = raw_stats.multicast;
- stats->collisions = raw_stats.collisions;
+ swap_uint64(&stats->rx_packets, &stats->tx_packets);
+ swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
+ swap_uint64(&stats->rx_errors, &stats->tx_errors);
+ swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
stats->rx_length_errors = 0;
stats->rx_over_errors = 0;
stats->rx_crc_errors = 0;
#define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress"
#define POLICE_CONFIG_CMD "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 u32 match ip src 0.0.0.0/0 police rate %dkbit burst %dk mtu 65535 drop flowid :1"
-/* We redirect stderr to /dev/null because we often want to remove all
- * traffic control configuration on a port so its in a known state. If
- * this done when there is no such configuration, tc complains, so we just
- * always ignore it.
+
+/* Remove ingress policing from 'netdev'. Returns 0 if successful, otherwise a
+ * positive errno value.
+ *
+ * This function is equivalent to running
+ * /sbin/tc qdisc del dev %s handle ffff: ingress
+ * but it is much, much faster.
*/
-#define POLICE_DEL_CMD "/sbin/tc qdisc del dev %s handle ffff: ingress 2>/dev/null"
+static int
+netdev_linux_remove_policing(struct netdev *netdev)
+{
+    struct netdev_dev_linux *dev =
+        netdev_dev_linux_cast(netdev_get_dev(netdev));
+    const char *name = netdev_get_name(netdev);
+    struct nl_sock *sock;
+    struct ofpbuf *reply;
+    struct ofpbuf msg;
+    struct tcmsg *tc;
+    int ifindex;
+    int retval;
+
+    /* Nothing can be sent without both the interface index and the
+     * rtnetlink socket; report whichever lookup fails first. */
+    retval = get_ifindex(netdev, &ifindex);
+    if (!retval) {
+        retval = get_rtnl_sock(&sock);
+    }
+    if (retval) {
+        return retval;
+    }
+
+    /* Build an RTM_DELQDISC request for the "ingress" qdisc with handle
+     * ffff: on this interface. */
+    ofpbuf_init(&msg, 0);
+    nl_msg_put_nlmsghdr(&msg, sock, sizeof *tc,
+                        RTM_DELQDISC, NLM_F_REQUEST);
+    tc = ofpbuf_put_zeros(&msg, sizeof *tc);
+    tc->tcm_family = AF_UNSPEC;
+    tc->tcm_ifindex = ifindex;
+    tc->tcm_handle = 0xffff0000;
+    tc->tcm_parent = TC_H_INGRESS;
+    nl_msg_put_string(&msg, TCA_KIND, "ingress");
+    nl_msg_put_unspec(&msg, TCA_OPTIONS, NULL, 0);
+
+    /* The request and any reply are released regardless of the outcome. */
+    retval = nl_sock_transact(sock, &msg, &reply);
+    ofpbuf_uninit(&msg);
+    ofpbuf_delete(reply);
+
+    /* ENOENT and EINVAL are deliberately treated like success (presumably
+     * they mean there was no ingress qdisc to delete -- TODO confirm). */
+    if (!(retval == 0 || retval == ENOENT || retval == EINVAL)) {
+        VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
+                     name, strerror(retval));
+        return retval;
+    }
+
+    /* Record in the cache that policing is now known to be off. */
+    dev->cache_valid |= VALID_POLICING;
+    dev->kbits_rate = 0;
+    dev->kbits_burst = 0;
+    return 0;
+}
/* Attempts to set input rate limiting (policing) policy. */
static int
netdev_linux_set_policing(struct netdev *netdev,
uint32_t kbits_rate, uint32_t kbits_burst)
{
+ struct netdev_dev_linux *netdev_dev =
+ netdev_dev_linux_cast(netdev_get_dev(netdev));
const char *netdev_name = netdev_get_name(netdev);
char command[1024];
COVERAGE_INC(netdev_set_policing);
- if (kbits_rate) {
- if (!kbits_burst) {
- /* Default to 1000 kilobits if not specified. */
- kbits_burst = 1000;
- }
- /* xxx This should be more careful about only adding if it
- * xxx actually exists, as opposed to always deleting it. */
- snprintf(command, sizeof(command), POLICE_DEL_CMD, netdev_name);
- if (system(command) == -1) {
- VLOG_WARN_RL(&rl, "%s: problem removing policing", netdev_name);
- }
+ kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
+ : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
+ : kbits_burst); /* Stick with user-specified value. */
+
+ if (netdev_dev->cache_valid & VALID_POLICING
+ && netdev_dev->kbits_rate == kbits_rate
+ && netdev_dev->kbits_burst == kbits_burst) {
+ /* Assume that settings haven't changed since we last set them. */
+ return 0;
+ }
+ netdev_linux_remove_policing(netdev);
+ if (kbits_rate) {
snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
if (system(command) != 0) {
VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
netdev_name);
return -1;
}
- } else {
- snprintf(command, sizeof(command), POLICE_DEL_CMD, netdev_name);
- if (system(command) == -1) {
- VLOG_WARN_RL(&rl, "%s: problem removing policing", netdev_name);
- }
+
+ netdev_dev->kbits_rate = kbits_rate;
+ netdev_dev->kbits_burst = kbits_burst;
+ netdev_dev->cache_valid |= VALID_POLICING;
}
return 0;
netdev_linux_poll_remove,
};
-const struct netdev_class netdev_patch_class = {
- "patch",
-
- netdev_linux_init,
- netdev_linux_run,
- netdev_linux_wait,
-
- netdev_linux_create_patch,
- netdev_linux_destroy,
- NULL, /* reconfigure */
-
- netdev_linux_open,
- netdev_linux_close,
-
- NULL, /* enumerate */
-
- netdev_linux_recv,
- netdev_linux_recv_wait,
- netdev_linux_drain,
-
- netdev_linux_send,
- netdev_linux_send_wait,
-
- netdev_linux_set_etheraddr,
- netdev_linux_get_etheraddr,
- netdev_linux_get_mtu,
- netdev_linux_get_ifindex,
- netdev_linux_get_carrier,
- netdev_linux_get_stats,
- NULL, /* set_stats */
-
- netdev_linux_get_features,
- netdev_linux_set_advertisements,
- netdev_linux_get_vlan_vid,
- netdev_linux_set_policing,
-
- netdev_linux_get_in4,
- netdev_linux_set_in4,
- netdev_linux_get_in6,
- netdev_linux_add_router,
- netdev_linux_get_next_hop,
- netdev_linux_arp_lookup,
-
- netdev_linux_update_flags,
-
- netdev_linux_poll_add,
- netdev_linux_poll_remove,
-};
-
\f
static int
get_stats_via_netlink(int ifindex, struct netdev_stats *stats)