X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=lib%2Fnetdev-linux.c;h=9a6d70a29dc578d0579df831b5d0220a840e787b;hb=3c442619836797b78bbb3472385a4982582cb907;hp=bcc332682f415b3c0ec31499f6eb9e6276fd2305;hpb=6f643e4946016399f0b217c2226284e3892b6267;p=openvswitch diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c index bcc33268..9a6d70a2 100644 --- a/lib/netdev-linux.c +++ b/lib/netdev-linux.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -50,6 +51,8 @@ #include "netlink.h" #include "ofpbuf.h" #include "openflow/openflow.h" +#include "openvswitch/internal_dev.h" +#include "openvswitch/gre.h" #include "packets.h" #include "poll-loop.h" #include "rtnetlink.h" @@ -79,34 +82,35 @@ enum { VALID_IN6 = 1 << 3, VALID_MTU = 1 << 4, VALID_CARRIER = 1 << 5, - VALID_IS_INTERNAL = 1 << 6 + VALID_IS_PSEUDO = 1 << 6, /* Represents is_internal and is_tap. */ + VALID_POLICING = 1 << 7 }; struct tap_state { int fd; }; -struct patch_state { - char *peer; -}; - struct netdev_dev_linux { struct netdev_dev netdev_dev; struct shash_node *shash_node; unsigned int cache_valid; + /* The following are figured out "on demand" only. They are only valid + * when the corresponding VALID_* bit in 'cache_valid' is set. */ int ifindex; uint8_t etheraddr[ETH_ADDR_LEN]; struct in_addr address, netmask; struct in6_addr in6; int mtu; int carrier; - bool is_internal; + bool is_internal; /* Is this an openvswitch internal device? */ + bool is_tap; /* Is this a tuntap device? */ + uint32_t kbits_rate; /* Policing data. */ + uint32_t kbits_burst; union { struct tap_state tap; - struct patch_state patch; } state; }; @@ -151,6 +155,7 @@ static int set_etheraddr(const char *netdev_name, int hwaddr_family, const uint8_t[ETH_ADDR_LEN]); static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats); static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats); +static int get_rtnl_sock(struct nl_sock **); static bool is_netdev_linux_class(const struct netdev_class *netdev_class) @@ -233,123 +238,6 @@ netdev_linux_cache_cb(const struct rtnetlink_change *change, } } -static int -if_up(const char *name) -{ - struct ifreq ifr; - - strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name); - ifr.ifr_flags = IFF_UP; - - if (ioctl(af_inet_sock, SIOCSIFFLAGS, &ifr) == -1) { - VLOG_DBG_RL(&rl, "%s: failed to bring device up: %s", - name, strerror(errno)); - return errno; - } - - return 0; -} - -/* A veth may be created using the 'command' "+,". A veth may - * be destroyed by using the 'command' "-", where can be - * either side of the device. - */ -static int -modify_veth(const char *format, ...) -{ - FILE *veth_file; - va_list args; - int retval; - - veth_file = fopen("/sys/class/net/veth_pairs", "w"); - if (!veth_file) { - VLOG_WARN_RL(&rl, "could not open veth device. Are you running a " - "supported XenServer with the kernel module loaded?"); - return ENODEV; - } - setvbuf(veth_file, NULL, _IONBF, 0); - - va_start(args, format); - retval = vfprintf(veth_file, format, args); - va_end(args); - - fclose(veth_file); - if (retval < 0) { - VLOG_WARN_RL(&rl, "could not destroy patch: %s", strerror(errno)); - return errno; - } - - return 0; -} - -static int -create_patch(const char *name, const char *peer) -{ - int retval; - struct netdev_dev *peer_nd; - - - /* Only create the veth if the peer didn't already do it. */ - peer_nd = netdev_dev_from_name(peer); - if (peer_nd) { - if (!strcmp("patch", netdev_dev_get_type(peer_nd))) { - struct netdev_dev_linux *ndl = netdev_dev_linux_cast(peer_nd); - if (!strcmp(name, ndl->state.patch.peer)) { - return 0; - } else { - VLOG_WARN_RL(&rl, "peer '%s' already paired with '%s'", - peer, ndl->state.patch.peer); - return EINVAL; - } - } else { - VLOG_WARN_RL(&rl, "peer '%s' exists and is not a patch", peer); - return EINVAL; - } - } - - retval = modify_veth("+%s,%s", name, peer); - if (retval) { - return retval; - } - - retval = if_up(name); - if (retval) { - return retval; - } - - retval = if_up(peer); - if (retval) { - return retval; - } - - return 0; -} - -static int -setup_patch(const char *name, const struct shash *args, char **peer_) -{ - const char *peer; - - peer = shash_find_data(args, "peer"); - if (!peer) { - VLOG_WARN("patch type requires valid 'peer' argument"); - return EINVAL; - } - - if (shash_count(args) > 1) { - VLOG_WARN("patch type takes only a 'peer' argument"); - return EINVAL; - } - - if (strlen(peer) >= IFNAMSIZ) { - VLOG_WARN_RL(&rl, "patch 'peer' arg too long"); - return EINVAL; - } - - *peer_ = xstrdup(peer); - return create_patch(name, peer); -} - /* Creates the netdev device of 'type' with 'name'. */ static int netdev_linux_create_system(const char *name, const char *type OVS_UNUSED, @@ -434,28 +322,6 @@ error: return error; } -static int -netdev_linux_create_patch(const char *name, const char *type OVS_UNUSED, - const struct shash *args, struct netdev_dev **netdev_devp) -{ - struct netdev_dev_linux *netdev_dev; - char *peer = NULL; - int error; - - error = setup_patch(name, args, &peer); - if (error) { - free(peer); - return error; - } - - netdev_dev = xzalloc(sizeof *netdev_dev); - netdev_dev->state.patch.peer = peer; - netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_patch_class); - *netdev_devp = &netdev_dev->netdev_dev; - - return 0; -} - static void destroy_tap(struct netdev_dev_linux *netdev_dev) { @@ -466,19 +332,6 @@ destroy_tap(struct netdev_dev_linux *netdev_dev) } } -static void -destroy_patch(struct netdev_dev_linux *netdev_dev) -{ - const char *name = netdev_dev_get_name(&netdev_dev->netdev_dev); - struct patch_state *state = &netdev_dev->state.patch; - - /* Only destroy veth if 'peer' doesn't exist as an existing netdev. */ - if (!netdev_dev_from_name(state->peer)) { - modify_veth("-%s", name); - } - free(state->peer); -} - /* Destroys the netdev device 'netdev_dev_'. */ static void netdev_linux_destroy(struct netdev_dev *netdev_dev_) @@ -494,8 +347,6 @@ netdev_linux_destroy(struct netdev_dev *netdev_dev_) } } else if (!strcmp(type, "tap")) { destroy_tap(netdev_dev); - } else if (!strcmp(type, "patch")) { - destroy_patch(netdev_dev); } free(netdev_dev); @@ -899,6 +750,35 @@ check_for_working_netlink_stats(void) } } +/* Brings the 'is_internal' and 'is_tap' members of 'netdev_dev' up-to-date. */ +static void +netdev_linux_update_is_pseudo(struct netdev_dev_linux *netdev_dev) +{ + if (!(netdev_dev->cache_valid & VALID_IS_PSEUDO)) { + const char *name = netdev_dev_get_name(&netdev_dev->netdev_dev); + const char *type = netdev_dev_get_type(&netdev_dev->netdev_dev); + + netdev_dev->is_tap = !strcmp(type, "tap"); + netdev_dev->is_internal = false; + if (!netdev_dev->is_tap) { + struct ethtool_drvinfo drvinfo; + int error; + + memset(&drvinfo, 0, sizeof drvinfo); + error = netdev_linux_do_ethtool(name, + (struct ethtool_cmd *)&drvinfo, + ETHTOOL_GDRVINFO, + "ETHTOOL_GDRVINFO"); + + if (!error && !strcmp(drvinfo.driver, "openvswitch")) { + netdev_dev->is_internal = true; + } + } + + netdev_dev->cache_valid |= VALID_IS_PSEUDO; + } +} + /* Retrieves current device stats for 'netdev'. * * XXX All of the members of struct netdev_stats are 64 bits wide, but on @@ -916,26 +796,7 @@ netdev_linux_get_stats(const struct netdev *netdev_, COVERAGE_INC(netdev_get_stats); - if (!(netdev_dev->cache_valid & VALID_IS_INTERNAL)) { - netdev_dev->is_internal = !strcmp(netdev_get_type(netdev_), "tap"); - if (!netdev_dev->is_internal) { - struct ethtool_drvinfo drvinfo; - - memset(&drvinfo, 0, sizeof drvinfo); - error = netdev_linux_do_ethtool(netdev_get_name(netdev_), - (struct ethtool_cmd *)&drvinfo, - ETHTOOL_GDRVINFO, - "ETHTOOL_GDRVINFO"); - - if (!error) { - netdev_dev->is_internal = !strcmp(drvinfo.driver, - "openvswitch"); - } - } - - netdev_dev->cache_valid |= VALID_IS_INTERNAL; - } - + netdev_linux_update_is_pseudo(netdev_dev); if (netdev_dev->is_internal) { collect_stats = &raw_stats; } @@ -958,7 +819,7 @@ netdev_linux_get_stats(const struct netdev *netdev_, * will appear to be swapped relative to the other ports since we are the * one sending the data, not a remote computer. For consistency, we swap * them back here. */ - if (!error && netdev_dev->is_internal) { + if (!error && (netdev_dev->is_internal || netdev_dev->is_tap)) { stats->rx_packets = raw_stats.tx_packets; stats->tx_packets = raw_stats.rx_packets; stats->rx_bytes = raw_stats.tx_bytes; @@ -985,6 +846,41 @@ netdev_linux_get_stats(const struct netdev *netdev_, return error; } +static int +netdev_linux_set_stats(struct netdev *netdev, + const struct netdev_stats *stats) +{ + struct netdev_dev_linux *netdev_dev = + netdev_dev_linux_cast(netdev_get_dev(netdev)); + struct internal_dev_stats dp_dev_stats; + struct ifreq ifr; + + /* We must reject this call if 'netdev' is not an Open vSwitch internal + * port, because the ioctl that we are about to execute is in the "device + * private ioctls" range, which means that executing it on a device that + * is not the type we expect could do any random thing. + * + * (Amusingly, these ioctl numbers are commented "THESE IOCTLS ARE + * _DEPRECATED_ AND WILL DISAPPEAR IN 2.5.X" in linux/sockios.h. I guess + * DaveM is a little behind on that.) */ + netdev_linux_update_is_pseudo(netdev_dev); + if (!netdev_dev->is_internal) { + return EOPNOTSUPP; + } + + /* This actually only sets the *offset* that the dp_dev applies, but in our + * usage for fake bond devices the dp_dev never has any traffic of it own + * so it has the same effect. */ + dp_dev_stats.rx_packets = stats->rx_packets; + dp_dev_stats.rx_bytes = stats->rx_bytes; + dp_dev_stats.tx_packets = stats->tx_packets; + dp_dev_stats.tx_bytes = stats->tx_bytes; + ifr.ifr_data = (void *) &dp_dev_stats; + return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, + INTERNAL_DEV_SET_STATS, + "INTERNAL_DEV_SET_STATS"); +} + /* Stores the features supported by 'netdev' into each of '*current', * '*advertised', '*supported', and '*peer' that are non-null. Each value is a * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if @@ -1222,35 +1118,88 @@ done: #define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress" #define POLICE_CONFIG_CMD "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 u32 match ip src 0.0.0.0/0 police rate %dkbit burst %dk mtu 65535 drop flowid :1" -/* We redirect stderr to /dev/null because we often want to remove all - * traffic control configuration on a port so its in a known state. If - * this done when there is no such configuration, tc complains, so we just - * always ignore it. + +/* Remove ingress policing from 'netdev'. Returns 0 if successful, otherwise a + * positive errno value. + * + * This function is equivalent to running + * /sbin/tc qdisc del dev %s handle ffff: ingress + * but it is much, much faster. */ -#define POLICE_DEL_CMD "/sbin/tc qdisc del dev %s handle ffff: ingress 2>/dev/null" +static int +netdev_linux_remove_policing(struct netdev *netdev) +{ + struct netdev_dev_linux *netdev_dev = + netdev_dev_linux_cast(netdev_get_dev(netdev)); + const char *netdev_name = netdev_get_name(netdev); + + struct ofpbuf request; + struct ofpbuf *reply; + struct tcmsg *tcmsg; + struct nl_sock *rtnl_sock; + int ifindex; + int error; + + error = get_ifindex(netdev, &ifindex); + if (error) { + return error; + } + + error = get_rtnl_sock(&rtnl_sock); + if (error) { + return error; + } + + ofpbuf_init(&request, 0); + nl_msg_put_nlmsghdr(&request, rtnl_sock, sizeof *tcmsg, + RTM_DELQDISC, NLM_F_REQUEST); + tcmsg = ofpbuf_put_zeros(&request, sizeof *tcmsg); + tcmsg->tcm_family = AF_UNSPEC; + tcmsg->tcm_ifindex = ifindex; + tcmsg->tcm_handle = 0xffff0000; + tcmsg->tcm_parent = TC_H_INGRESS; + nl_msg_put_string(&request, TCA_KIND, "ingress"); + nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0); + error = nl_sock_transact(rtnl_sock, &request, &reply); + ofpbuf_uninit(&request); + ofpbuf_delete(reply); + if (error && error != ENOENT && error != EINVAL) { + VLOG_WARN_RL(&rl, "%s: removing policing failed: %s", + netdev_name, strerror(error)); + return error; + } + + netdev_dev->kbits_rate = 0; + netdev_dev->kbits_burst = 0; + netdev_dev->cache_valid |= VALID_POLICING; + return 0; +} /* Attempts to set input rate limiting (policing) policy. */ static int netdev_linux_set_policing(struct netdev *netdev, uint32_t kbits_rate, uint32_t kbits_burst) { + struct netdev_dev_linux *netdev_dev = + netdev_dev_linux_cast(netdev_get_dev(netdev)); const char *netdev_name = netdev_get_name(netdev); char command[1024]; COVERAGE_INC(netdev_set_policing); - if (kbits_rate) { - if (!kbits_burst) { - /* Default to 1000 kilobits if not specified. */ - kbits_burst = 1000; - } - /* xxx This should be more careful about only adding if it - * xxx actually exists, as opposed to always deleting it. */ - snprintf(command, sizeof(command), POLICE_DEL_CMD, netdev_name); - if (system(command) == -1) { - VLOG_WARN_RL(&rl, "%s: problem removing policing", netdev_name); - } + kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */ + : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */ + : kbits_burst); /* Stick with user-specified value. */ + if (netdev_dev->cache_valid & VALID_POLICING + && netdev_dev->kbits_rate == kbits_rate + && netdev_dev->kbits_burst == kbits_burst) { + /* Assume that settings haven't changed since we last set them. */ + return 0; + } + + netdev_linux_remove_policing(netdev); + if (kbits_rate) { snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name); if (system(command) != 0) { VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name); @@ -1264,11 +1213,10 @@ netdev_linux_set_policing(struct netdev *netdev, netdev_name); return -1; } - } else { - snprintf(command, sizeof(command), POLICE_DEL_CMD, netdev_name); - if (system(command) == -1) { - VLOG_WARN_RL(&rl, "%s: problem removing policing", netdev_name); - } + + netdev_dev->kbits_rate = kbits_rate; + netdev_dev->kbits_burst = kbits_burst; + netdev_dev->cache_valid |= VALID_POLICING; } return 0; @@ -1668,6 +1616,7 @@ const struct netdev_class netdev_linux_class = { netdev_linux_get_ifindex, netdev_linux_get_carrier, netdev_linux_get_stats, + netdev_linux_set_stats, netdev_linux_get_features, netdev_linux_set_advertisements, @@ -1716,54 +1665,7 @@ const struct netdev_class netdev_tap_class = { netdev_linux_get_ifindex, netdev_linux_get_carrier, netdev_linux_get_stats, - - netdev_linux_get_features, - netdev_linux_set_advertisements, - netdev_linux_get_vlan_vid, - netdev_linux_set_policing, - - netdev_linux_get_in4, - netdev_linux_set_in4, - netdev_linux_get_in6, - netdev_linux_add_router, - netdev_linux_get_next_hop, - netdev_linux_arp_lookup, - - netdev_linux_update_flags, - - netdev_linux_poll_add, - netdev_linux_poll_remove, -}; - -const struct netdev_class netdev_patch_class = { - "patch", - - netdev_linux_init, - netdev_linux_run, - netdev_linux_wait, - - netdev_linux_create_patch, - netdev_linux_destroy, - NULL, /* reconfigure */ - - netdev_linux_open, - netdev_linux_close, - - NULL, /* enumerate */ - - netdev_linux_recv, - netdev_linux_recv_wait, - netdev_linux_drain, - - netdev_linux_send, - netdev_linux_send_wait, - - netdev_linux_set_etheraddr, - netdev_linux_get_etheraddr, - netdev_linux_get_mtu, - netdev_linux_get_ifindex, - netdev_linux_get_carrier, - netdev_linux_get_stats, + NULL, /* set_stats */ netdev_linux_get_features, netdev_linux_set_advertisements, @@ -1797,8 +1699,7 @@ get_stats_via_netlink(int ifindex, struct netdev_stats *stats) .min_len = sizeof(struct rtnl_link_stats) }, }; - - static struct nl_sock *rtnl_sock; + struct nl_sock *rtnl_sock; struct ofpbuf request; struct ofpbuf *reply; struct ifinfomsg *ifi; @@ -1806,13 +1707,9 @@ get_stats_via_netlink(int ifindex, struct netdev_stats *stats) struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)]; int error; - if (!rtnl_sock) { - error = nl_sock_create(NETLINK_ROUTE, 0, 0, 0, &rtnl_sock); - if (error) { - VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s", - strerror(error)); - return error; - } + error = get_rtnl_sock(&rtnl_sock); + if (error) { + return error; } ofpbuf_init(&request, 0); @@ -2076,3 +1973,26 @@ netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip, } return error; } + +/* Obtains a Netlink routing socket that is not subscribed to any multicast + * groups. Returns 0 if successful, otherwise a positive errno value. Stores + * the socket in '*rtnl_sockp' if successful, otherwise a null pointer. */ +static int +get_rtnl_sock(struct nl_sock **rtnl_sockp) +{ + static struct nl_sock *sock; + int error; + + if (!sock) { + error = nl_sock_create(NETLINK_ROUTE, 0, 0, 0, &sock); + if (error) { + VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s", + strerror(error)); + } + } else { + error = 0; + } + + *rtnl_sockp = sock; + return error; +}