1 /* Copyright (c) 2008, 2009 Nicira Networks
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at:
7 * http://www.apache.org/licenses/LICENSE-2.0
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License. */
24 #include <linux/genetlink.h>
25 #include <linux/rtnetlink.h>
29 #include <sys/types.h>
35 #include "command-line.h"
40 #include "fatal-signal.h"
42 #include "leak-checker.h"
46 #include "openvswitch/brcompat-netlink.h"
47 #include "poll-loop.h"
56 #define THIS_MODULE VLM_brcompatd
59 /* XXX This program just hangs if the datapath is removed and re-inserted
 * (rmmod/insmod).  Should it learn to reconnect? */
61 /* Actions to modify bridge compatibility configuration. */
/* Forward declarations for the command-line helpers; usage() is declared
 * NO_RETURN. */
69 static void parse_options(int argc, char *argv[]);
70 static void usage(void) NO_RETURN;
/* Rate limiter passed to the VLOG_*_RL() calls below. */
72 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 60);
74 /* Maximum number of milliseconds to wait for the config file to be
75 * unlocked. If set to zero, no waiting will occur. */
76 static int lock_timeout = 500;
78 /* Maximum number of milliseconds to wait before pruning port entries that
79 * no longer exist. If set to zero, ports are never pruned. */
80 static int prune_timeout = 5000;
82 /* Config file shared with ovs-vswitchd (usually ovs-vswitchd.conf). */
83 static char *config_file;
85 /* Command to run (via system()) to reload the ovs-vswitchd configuration. */
87 static char *reload_command;
89 /* Netlink socket to listen for interface changes. */
90 static struct nl_sock *rtnl_sock;
92 /* Netlink socket to bridge compatibility kernel module. */
93 static struct nl_sock *brc_sock;
95 /* The Generic Netlink family number used for bridge compatibility. */
96 static int brc_family;
/* Policy used by lookup_brc_multicast_group() to parse the reply to a
 * BRC_GENL_C_QUERY_MC request: a u32 BRC_GENL_A_MC_GROUP attribute, which is
 * mandatory because '.optional' is left at its zero default. */
98 static const struct nl_policy brc_multicast_policy[] = {
99 [BRC_GENL_A_MC_GROUP] = {.type = NL_A_U32 }
/* Policy used by rtnl_recv_update() to parse link messages: the interface
 * name is mandatory; the master ifindex (translated to a bridge name with
 * if_indextoname() by that function) is optional. */
102 static const struct nl_policy rtnlgrp_link_policy[] = {
103 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
104 [IFLA_MASTER] = { .type = NL_A_U32, .optional = true },
/* Asks the brcompat Generic Netlink family ('brc_family') for its multicast
 * group.  Opens a temporary NETLINK_GENERIC socket, sends a
 * BRC_GENL_C_QUERY_MC request, and parses the reply against
 * 'brc_multicast_policy'.  When parsing succeeds, the u32
 * BRC_GENL_A_MC_GROUP attribute is stored in '*multicast_group'.
 *
 * nl_sock_destroy() is called on the temporary socket after the transaction,
 * after a failed policy parse, and after the value has been extracted; the
 * reply is deleted in the latter two cases.
 *
 * NOTE(review): the return statements and the conditions around the first
 * nl_sock_destroy() are not visible in this excerpt.  The caller, brc_open(),
 * stores the result in 'retval', so presumably this returns 0 on success and
 * a positive errno value otherwise -- confirm. */
108 lookup_brc_multicast_group(int *multicast_group)
110 struct nl_sock *sock;
111 struct ofpbuf request, *reply;
112 struct nlattr *attrs[ARRAY_SIZE(brc_multicast_policy)];
115 retval = nl_sock_create(NETLINK_GENERIC, 0, 0, 0, &sock);
119 ofpbuf_init(&request, 0);
120 nl_msg_put_genlmsghdr(&request, sock, 0, brc_family,
121 NLM_F_REQUEST, BRC_GENL_C_QUERY_MC, 1);
122 retval = nl_sock_transact(sock, &request, &reply);
123 ofpbuf_uninit(&request);
125 nl_sock_destroy(sock);
128 if (!nl_policy_parse(reply, NLMSG_HDRLEN + GENL_HDRLEN,
129 brc_multicast_policy, attrs,
130 ARRAY_SIZE(brc_multicast_policy))) {
131 nl_sock_destroy(sock);
132 ofpbuf_delete(reply);
135 *multicast_group = nl_attr_get_u32(attrs[BRC_GENL_A_MC_GROUP]);
136 nl_sock_destroy(sock);
137 ofpbuf_delete(reply);
142 /* Opens a socket for brcompat notifications. Returns 0 if successful,
143 * otherwise a positive errno value. */
/* Steps visible here, in order:
 *   1. Resolve BRC_GENL_FAMILY_NAME into the file-level 'brc_family'.
 *   2. Look up that family's multicast group with
 *      lookup_brc_multicast_group().
 *   3. Create a NETLINK_GENERIC socket, passing the multicast group to
 *      nl_sock_create(), and store it in '*sock'.
 * NOTE(review): the error checks between the steps are not visible in this
 * excerpt. */
145 brc_open(struct nl_sock **sock)
147 int multicast_group = 0;
150 retval = nl_lookup_genl_family(BRC_GENL_FAMILY_NAME, &brc_family);
155 retval = lookup_brc_multicast_group(&multicast_group);
160 retval = nl_sock_create(NETLINK_GENERIC, multicast_group, 0, 0, sock);
/* Netlink attribute policy with a single mandatory string attribute,
 * BRC_GENL_A_DP_NAME.
 * NOTE(review): no use of this table is visible in this excerpt;
 * parse_command() defines its own local policy. */
168 static const struct nl_policy brc_dp_policy[] = {
169 [BRC_GENL_A_DP_NAME] = { .type = NL_A_STRING },
/* Returns the result of cfg_has_section() for "bridge.<name>", that is,
 * whether the configuration has a section for a bridge called 'name'. */
173 bridge_exists(const char *name)
175 return cfg_has_section("bridge.%s", name);
/* If the configuration has unsaved changes (cfg_is_dirty()), writes it out
 * with cfg_write(), re-reads it with cfg_read(), and runs 'reload_command'
 * through system(), logging how long the command took and any failure of
 * the command.
 *
 * In that case the return value reports the first failure among the three
 * steps: the cfg_write() error, else the cfg_read() error, else ECHILD if
 * system() returned non-zero, else 0.
 *
 * NOTE(review): cfg_read() and the reload command run even when cfg_write()
 * failed, because all three calls are unconditional initializers.  The
 * return value for the not-dirty case is not visible in this excerpt. */
179 rewrite_and_reload_config(void)
181 if (cfg_is_dirty()) {
182 int error1 = cfg_write();
183 int error2 = cfg_read();
184 long long int reload_start = time_msec();
185 int error3 = system(reload_command);
186 long long int elapsed = time_msec() - reload_start;
187 COVERAGE_INC(brcompatd_reload);
189 VLOG_INFO("reload command executed in %lld ms", elapsed);
192 VLOG_ERR("failed to execute reload command: %s", strerror(errno));
193 } else if (error3 != 0) {
194 char *msg = process_status_msg(error3);
195 VLOG_ERR("reload command exited with error (%s)", msg);
198 return error1 ? error1 : error2 ? error2 : error3 ? ECHILD : 0;
203 /* Get all the interfaces for 'bridge' as 'ifaces', breaking bonded interfaces
204 * down into their constituent parts. */
/* Iterates over the "bridge.<bridge>.port" keys.  When a "bonding.<port>"
 * section exists, the keys under "bonding.<port>.slave" are appended to
 * 'ifaces'; the svec_add() of the port name itself is presumably the
 * alternative branch (the 'else' line is not visible in this excerpt).
 * The temporary 'ports' and 'slaves' vectors are destroyed here; 'ifaces'
 * is only appended to. */
206 get_bridge_ifaces(const char *bridge, struct svec *ifaces)
213 cfg_get_all_keys(&ports, "bridge.%s.port", bridge);
214 for (i = 0; i < ports.n; i++) {
215 const char *port_name = ports.names[i];
216 if (cfg_has_section("bonding.%s", port_name)) {
219 cfg_get_all_keys(&slaves, "bonding.%s.slave", port_name);
220 svec_append(ifaces, &slaves);
221 svec_destroy(&slaves);
223 svec_add(ifaces, port_name);
226 svec_destroy(&ports);
229 /* Go through the configuration file and remove any ports that no longer
230 * exist associated with a bridge. */
/* Outline of the visible code:
 *   - Try to lock the configuration without waiting (timeout 0).
 *   - For every subsection of "bridge", collect its interfaces with
 *     get_bridge_ifaces() and query each one with netdev_nodev_get_flags().
 *     An ENODEV result queues the interface name in 'delete'; the other
 *     logged case reports the error number as unknown (presumably only for
 *     non-zero errors -- the branch condition is not visible here).
 *   - For every queued name, delete matching "bridge.*.port=<name>" and
 *     "bonding.*.slave=<name>" entries, then call
 *     rewrite_and_reload_config().
 * NOTE(review): the function's signature line, the initialization of the
 * svecs, and the unlock call are not visible in this excerpt. */
236 struct svec bridges, delete;
238 if (cfg_lock(NULL, 0)) {
239 /* Couldn't lock config file. */
245 cfg_get_subsections(&bridges, "bridge");
246 for (i=0; i<bridges.n; i++) {
247 const char *br_name = bridges.names[i];
250 /* Check that each bridge interface exists. */
251 get_bridge_ifaces(br_name, &ifaces);
252 for (j = 0; j < ifaces.n; j++) {
253 const char *iface_name = ifaces.names[j];
254 enum netdev_flags flags;
256 /* The local port and internal ports are created and destroyed by
257 * ovs-vswitchd itself, so don't bother checking for them at all.
258 * In practice, they might not exist if ovs-vswitchd hasn't
259 * finished reloading since the configuration file was updated. */
260 if (!strcmp(iface_name, br_name)
261 || cfg_get_bool(0, "iface.%s.internal", iface_name)) {
265 error = netdev_nodev_get_flags(iface_name, &flags);
266 if (error == ENODEV) {
267 VLOG_INFO_RL(&rl, "removing dead interface %s from %s",
268 iface_name, br_name);
269 svec_add(&delete, iface_name);
271 VLOG_INFO_RL(&rl, "unknown error %d on interface %s from %s",
272 error, iface_name, br_name);
275 svec_destroy(&ifaces);
277 svec_destroy(&bridges);
282 for (i = 0; i < delete.n; i++) {
283 cfg_del_match("bridge.*.port=%s", delete.names[i]);
284 cfg_del_match("bonding.*.slave=%s", delete.names[i]);
286 rewrite_and_reload_config();
291 svec_destroy(&delete);
295 /* Checks whether a network device named 'name' exists and returns true if so
 * (presumably false otherwise -- the rest of this sentence and the return
 * statement are not visible in this excerpt; TODO confirm).
 *
 * The visible implementation builds the path "/sys/class/net/<name>" and
 * calls stat() on it.
 *
298 * XXX it is possible that this doesn't entirely accomplish what we want in
299 * context, since ovs-vswitchd.conf may cause vswitchd to create or destroy
300 * network devices based on iface.*.internal settings.
302 * XXX may want to move this to lib/netdev.
304 * XXX why not just use netdev_nodev_get_flags() or similar function? */
306 netdev_exists(const char *name)
312 filename = xasprintf("/sys/class/net/%s", name);
313 error = stat(filename, &s);
/* Handles an "addbr" request for 'br_name' against the in-memory
 * configuration.
 *
 * Logs a warning if the configuration already has a section for the bridge,
 * or if a network device with that name already exists (with a separate
 * message when "iface.<br_name>.fake-bridge" is set).  The remaining visible
 * statements add the entry "bridge.<br_name>.port=<br_name>", i.e. a port
 * named after the bridge itself, and log success.
 *
 * Nothing is written to disk here; handle_bridge_cmd() calls
 * rewrite_and_reload_config() afterwards.
 * NOTE(review): the return statements are not visible in this excerpt. */
319 add_bridge(const char *br_name)
321 if (bridge_exists(br_name)) {
322 VLOG_WARN("addbr %s: bridge %s exists", br_name, br_name);
324 } else if (netdev_exists(br_name)) {
325 if (cfg_get_bool(0, "iface.%s.fake-bridge", br_name)) {
326 VLOG_WARN("addbr %s: %s exists as a fake bridge",
330 VLOG_WARN("addbr %s: cannot create bridge %s because a network "
331 "device named %s already exists",
332 br_name, br_name, br_name);
337 cfg_add_entry("bridge.%s.port=%s", br_name, br_name);
338 VLOG_INFO("addbr %s: success", br_name);
/* Handles a "delbr" request for 'br_name' against the in-memory
 * configuration.  Logs a warning when no "bridge.<br_name>" section exists;
 * the remaining visible statements delete that section and log success.
 * NOTE(review): the return statements are not visible in this excerpt. */
344 del_bridge(const char *br_name)
346 if (!bridge_exists(br_name)) {
347 VLOG_WARN("delbr %s: no bridge named %s", br_name, br_name);
351 cfg_del_section("bridge.%s", br_name);
352 VLOG_INFO("delbr %s: success", br_name);
/* Extracts the fields of a brcompat request held in 'buffer'.
 *
 * The message must carry a BRC_GENL_A_DP_NAME string.  BRC_GENL_A_PORT_NAME
 * is optional in the policy, but parsing is treated as failed when the
 * caller passes a non-null 'port_name' and the attribute is absent.
 *
 * After a successful parse, '*seq' receives the request's nlmsg_seq,
 * '*br_name' the datapath name and '*port_name' the port name.
 * handle_bridge_cmd() passes NULL for 'port_name'.
 *
 * NOTE(review): the guard around the '*port_name' store and the return
 * statements are not visible in this excerpt.  The strings come from
 * nl_attr_get_string() on attributes parsed out of 'buffer'; presumably
 * they stay valid only as long as 'buffer' does -- confirm. */
358 parse_command(struct ofpbuf *buffer, uint32_t *seq, const char **br_name,
359 const char **port_name)
361 static const struct nl_policy policy[] = {
362 [BRC_GENL_A_DP_NAME] = { .type = NL_A_STRING },
363 [BRC_GENL_A_PORT_NAME] = { .type = NL_A_STRING, .optional = true },
365 struct nlattr *attrs[ARRAY_SIZE(policy)];
367 if (!nl_policy_parse(buffer, NLMSG_HDRLEN + GENL_HDRLEN, policy,
368 attrs, ARRAY_SIZE(policy))
369 || (port_name && !attrs[BRC_GENL_A_PORT_NAME])) {
373 *seq = ((struct nlmsghdr *) buffer->data)->nlmsg_seq;
374 *br_name = nl_attr_get_string(attrs[BRC_GENL_A_DP_NAME]);
376 *port_name = nl_attr_get_string(attrs[BRC_GENL_A_PORT_NAME]);
/* Sends the result of a brcompat request on 'brc_sock'.
 *
 * Builds a BRC_GENL_C_DP_RESULT message for 'brc_family', overwrites its
 * nlmsg_seq with 'seq' (the callers pass the sequence number taken from the
 * request), and attaches 'error' as a u32 BRC_GENL_A_ERR_CODE attribute.
 * The visible warning reports a problem with sending the reply; it is
 * rate-limited through 'rl'. */
382 send_reply(uint32_t seq, int error)
388 ofpbuf_init(&msg, 0);
389 nl_msg_put_genlmsghdr(&msg, brc_sock, 32, brc_family, NLM_F_REQUEST,
390 BRC_GENL_C_DP_RESULT, 1);
391 ((struct nlmsghdr *) msg.data)->nlmsg_seq = seq;
392 nl_msg_put_u32(&msg, BRC_GENL_A_ERR_CODE, error);
395 retval = nl_sock_send(brc_sock, &msg, false);
397 VLOG_WARN_RL(&rl, "replying to brcompat request: %s",
/* Handles a bridge add (add == true) or delete (add == false) request held
 * in 'buffer': parses it with parse_command() (no port name expected), calls
 * add_bridge() or del_bridge(), rewrites and reloads the configuration, and
 * sends the resulting error code back with the request's sequence number.
 * NOTE(review): the conditions that chain these steps and the return
 * statement are not visible in this excerpt. */
404 handle_bridge_cmd(struct ofpbuf *buffer, bool add)
410 error = parse_command(buffer, &seq, &br_name, NULL);
412 error = add ? add_bridge(br_name) : del_bridge(br_name);
414 error = rewrite_and_reload_config();
416 send_reply(seq, error);
/* Netlink attribute policy with two mandatory string attributes:
 * BRC_GENL_A_DP_NAME and BRC_GENL_A_PORT_NAME.
 * NOTE(review): no use of this table is visible in this excerpt;
 * parse_command() defines its own local policy. */
421 static const struct nl_policy brc_port_policy[] = {
422 [BRC_GENL_A_DP_NAME] = { .type = NL_A_STRING },
423 [BRC_GENL_A_PORT_NAME] = { .type = NL_A_STRING },
/* Removes 'port_name' from the in-memory configuration: the
 * "bridge.<br_name>.port=<port_name>" entry, every entry matching
 * "bonding.*.slave=<port_name>", and everything matching
 * "vlan.<port_name>.*".  The callers in this file follow up with
 * rewrite_and_reload_config(). */
427 del_port(const char *br_name, const char *port_name)
429 cfg_del_entry("bridge.%s.port=%s", br_name, port_name);
430 cfg_del_match("bonding.*.slave=%s", port_name);
431 cfg_del_match("vlan.%s.*", port_name);
/* Handles an "add-if" (add == true) or "del-if" (add == false) request held
 * in 'buffer'.
 *
 * Parses the bridge and port names, then warns if the bridge has no
 * configuration section or if no network device named 'port_name' exists.
 * Otherwise the visible statements add the
 * "bridge.<br_name>.port=<port_name>" entry or remove the port with
 * del_port(), log success, and rewrite and reload the configuration.  The
 * outcome is sent back with send_reply() using the request's sequence
 * number.
 *
 * NOTE(review): the 'if (add)' selection, the error assignments and the
 * return statement are not visible in this excerpt. */
435 handle_port_cmd(struct ofpbuf *buffer, bool add)
437 const char *cmd_name = add ? "add-if" : "del-if";
438 const char *br_name, *port_name;
442 error = parse_command(buffer, &seq, &br_name, &port_name);
444 if (!bridge_exists(br_name)) {
445 VLOG_WARN("%s %s %s: no bridge named %s",
446 cmd_name, br_name, port_name, br_name);
448 } else if (!netdev_exists(port_name)) {
449 VLOG_WARN("%s %s %s: no network device named %s",
450 cmd_name, br_name, port_name, port_name);
454 cfg_add_entry("bridge.%s.port=%s", br_name, port_name);
456 del_port(br_name, port_name);
458 VLOG_INFO("%s %s %s: success", cmd_name, br_name, port_name);
459 error = rewrite_and_reload_config();
461 send_reply(seq, error);
/* Receives and handles one request from the brcompat kernel module on
 * 'brc_sock'.
 *
 * The receive loop repeats while nl_sock_recv() returns ENOBUFS or, per the
 * rest of the loop condition, the message is a Netlink error or NLMSG_DONE
 * message.  A receive failure other than EAGAIN is logged.  The message is
 * then checked for a Generic Netlink header and for 'brc_family' as its
 * message type, the configuration file is locked (waiting up to
 * 'lock_timeout' milliseconds), and the command is dispatched:
 *
 *   BRC_GENL_C_DP_ADD / BRC_GENL_C_DP_DEL     -> handle_bridge_cmd()
 *   BRC_GENL_C_PORT_ADD / BRC_GENL_C_PORT_DEL -> handle_port_cmd()
 *
 * The buffer is deleted at the end.
 *
 * NOTE(review): part of the loop condition, the early exits, any default
 * case and the unlock call are not visible in this excerpt. */
468 brc_recv_update(void)
471 struct ofpbuf *buffer;
472 struct genlmsghdr *genlmsghdr;
477 ofpbuf_delete(buffer);
478 retval = nl_sock_recv(brc_sock, &buffer, false);
479 } while (retval == ENOBUFS
481 && (nl_msg_nlmsgerr(buffer, NULL)
482 || nl_msg_nlmsghdr(buffer)->nlmsg_type == NLMSG_DONE)));
484 if (retval != EAGAIN) {
485 VLOG_WARN_RL(&rl, "brc_recv_update: %s", strerror(retval));
490 genlmsghdr = nl_msg_genlmsghdr(buffer);
492 VLOG_WARN_RL(&rl, "received packet too short for generic NetLink");
496 if (nl_msg_nlmsghdr(buffer)->nlmsg_type != brc_family) {
497 VLOG_DBG_RL(&rl, "received type (%"PRIu16") != brcompat family (%d)",
498 nl_msg_nlmsghdr(buffer)->nlmsg_type, brc_family);
502 if (cfg_lock(NULL, lock_timeout)) {
503 /* Couldn't lock config file. */
508 switch (genlmsghdr->cmd) {
509 case BRC_GENL_C_DP_ADD:
510 retval = handle_bridge_cmd(buffer, true);
513 case BRC_GENL_C_DP_DEL:
514 retval = handle_bridge_cmd(buffer, false);
517 case BRC_GENL_C_PORT_ADD:
518 retval = handle_port_cmd(buffer, true);
521 case BRC_GENL_C_PORT_DEL:
522 retval = handle_port_cmd(buffer, false);
532 ofpbuf_delete(buffer);
536 /* Check for interface configuration changes announced through RTNL. */
/* Receives one message from 'rtnl_sock'.  EAGAIN and ENOBUFS are handled
 * first (the latter with a rate-limited warning), and another rate-limited
 * warning reports an error on the socket.  Otherwise the message must
 * contain an ifinfomsg and satisfy 'rtnlgrp_link_policy'.
 *
 * Only RTM_DELLINK messages that carry IFLA_MASTER are acted on.  The master
 * ifindex is translated to a bridge name with if_indextoname(), and the
 * configuration is locked (waiting up to 'lock_timeout' milliseconds).  If
 * netdev_nodev_get_flags() reports ENODEV for the port, and the port is
 * listed under "bridge.<br_name>.port", it is removed with del_port() and
 * the configuration is rewritten and reloaded.  If a device by that name
 * still exists, only a message is logged; the long comment in the body
 * explains why.
 *
 * NOTE(review): the failure branches, the setup and teardown of 'ports' and
 * the unlock call are not visible in this excerpt. */
538 rtnl_recv_update(void)
542 int error = nl_sock_recv(rtnl_sock, &buf, false);
543 if (error == EAGAIN) {
545 } else if (error == ENOBUFS) {
546 VLOG_WARN_RL(&rl, "network monitor socket overflowed");
548 VLOG_WARN_RL(&rl, "error on network monitor socket: %s",
551 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
552 struct nlmsghdr *nlh;
553 struct ifinfomsg *iim;
555 nlh = ofpbuf_at(buf, 0, NLMSG_HDRLEN);
556 iim = ofpbuf_at(buf, NLMSG_HDRLEN, sizeof *iim);
558 VLOG_WARN_RL(&rl, "received bad rtnl message (no ifinfomsg)");
563 if (!nl_policy_parse(buf, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
565 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
566 VLOG_WARN_RL(&rl,"received bad rtnl message (policy)");
570 if (nlh->nlmsg_type == RTM_DELLINK && attrs[IFLA_MASTER]) {
571 const char *port_name = nl_attr_get_string(attrs[IFLA_IFNAME]);
572 char br_name[IFNAMSIZ];
573 uint32_t br_idx = nl_attr_get_u32(attrs[IFLA_MASTER]);
575 enum netdev_flags flags;
577 if (!if_indextoname(br_idx, br_name)) {
582 if (cfg_lock(NULL, lock_timeout)) {
583 /* Couldn't lock config file. */
584 /* xxx this should try again and print error msg. */
589 if (netdev_nodev_get_flags(port_name, &flags) == ENODEV) {
590 /* Network device is really gone. */
591 VLOG_INFO("network device %s destroyed, "
592 "removing from bridge %s", port_name, br_name);
594 cfg_get_all_keys(&ports, "bridge.%s.port", br_name);
596 if (svec_contains(&ports, port_name)) {
597 del_port(br_name, port_name);
598 rewrite_and_reload_config();
601 /* A network device by that name exists even though the kernel
602 * told us it had disappeared. Probably, what happened was
605 * 1. Device destroyed.
606 * 2. Notification sent to us.
607 * 3. New device created with same name as old one.
608 * 4. ovs-brcompatd notified, removes device from bridge.
610 * There's no a priori reason that in this situation that the
611 * new device with the same name should remain in the bridge;
612 * on the contrary, that would be unexpected. *But* there is
613 * one important situation where, if we do this, bad things
614 * happen. This is the case of XenServer Tools version 5.0.0,
615 * which on boot of a Windows VM cause something like this to
616 * happen on the Xen host:
618 * i. Create tap1.0 and vif1.0.
620 * iii. Delete vif1.0.
621 * iv. Re-create vif1.0.
623 * (XenServer Tools 5.5.0 does not exhibit this behavior, and
624 * neither does a VM without Tools installed at all.)
626 * Steps iii and iv happen within a few seconds of each other.
627 * Step iv causes /etc/xensource/scripts/vif to run, which in
628 * turn calls ovs-cfg-mod to add the new device to the bridge.
629 * If step iv happens after step 4 (in our first list of
630 * steps), then all is well, but if it happens between 3 and 4
631 * (which can easily happen if ovs-brcompatd has to wait to
632 * lock the configuration file), then we will remove the new
633 * incarnation from the bridge instead of the old one!
635 * So, to avoid this problem, we do nothing here. This is
636 * strictly incorrect except for this one particular case, and
637 * perhaps that will bite us someday. If that happens, then we
638 * will have to somehow track network devices by ifindex, since
639 * a new device will have a new ifindex even if it has the same
640 * name as an old device.
642 VLOG_INFO("kernel reported network device %s removed but "
643 "a device by that name exists (XS Tools 5.0.0?)",
/* Program entry point.
 *
 * In the order visible here: records the program name, installs fault
 * handlers, parses the command line, ignores SIGPIPE, calls
 * die_if_already_running(), creates the unixctl server, opens the brcompat
 * Netlink socket with brc_open(), and creates a NETLINK_ROUTE socket for
 * RTNLGRP_LINK.  A failure in any of the last three steps is fatal.
 *
 * The statements that follow run the unixctl server and register waits on
 * both Netlink sockets, on a 'prune_timeout' timer and on the unixctl
 * server.
 *
 * NOTE(review): the loop construct itself, the calls that service the two
 * sockets, and the conditions guarding the rtnetlink and prune waits are
 * not visible in this excerpt. */
653 main(int argc, char *argv[])
655 struct unixctl_server *unixctl;
658 set_program_name(argv[0]);
659 register_fault_handlers();
662 parse_options(argc, argv);
663 signal(SIGPIPE, SIG_IGN);
666 die_if_already_running();
669 retval = unixctl_server_create(NULL, &unixctl);
671 ovs_fatal(retval, "could not listen for vlog connections");
674 if (brc_open(&brc_sock)) {
675 ovs_fatal(0, "could not open brcompat socket. Check "
676 "\"brcompat\" kernel module.");
680 if (nl_sock_create(NETLINK_ROUTE, RTNLGRP_LINK, 0, 0, &rtnl_sock)) {
681 ovs_fatal(0, "could not create rtnetlink socket");
688 unixctl_server_run(unixctl);
691 /* If 'prune_timeout' is non-zero, we actively prune from the
692 * config file any 'bridge.<br_name>.port' entries that are no
693 * longer valid. We use two methods:
695 * 1) The kernel explicitly notifies us of removed ports
696 * through the RTNL messages.
698 * 2) We periodically check all ports associated with bridges
699 * to see if they no longer exist.
705 nl_sock_wait(rtnl_sock, POLLIN);
706 poll_timer_wait(prune_timeout);
709 nl_sock_wait(brc_sock, POLLIN);
710 unixctl_server_wait(unixctl);
/* Parses the command line and fills in the file-level settings.
 *
 * Before any option is read, 'reload_command' is set to a default built
 * from 'ovs_bindir' and 'ovs_rundir'.  Options visible here:
 *
 *   --lock-timeout=MSECS     stored in 'lock_timeout' as given.
 *   --prune-timeout=SECS     multiplied by 1000 and stored in
 *                            'prune_timeout'.
 *   --reload-command=COMMAND replaces 'reload_command' with the argument.
 *
 * The error message below requires exactly one non-option argument; it is
 * stored in 'config_file' and registered with cfg_set_file(), and a failure
 * there is fatal.
 *
 * NOTE(review): atoi() cannot report malformed input, so a bad value
 * silently becomes 0 (which, per the comments on the two variables,
 * disables waiting or pruning), and 'atoi(optarg) * 1000' can overflow
 * 'int' for large values.  Consider strtol() with range checks.
 * NOTE(review): --reload-command overwrites the pointer returned by
 * xasprintf() without releasing it; presumably a harmless one-time leak --
 * confirm.
 * NOTE(review): the lines that adjust 'argc'/'argv' after option parsing
 * are not visible in this excerpt, so 'argv[0]' near the end is presumably
 * the first non-option argument -- confirm. */
718 parse_options(int argc, char *argv[])
721 OPT_LOCK_TIMEOUT = UCHAR_MAX + 1,
725 LEAK_CHECKER_OPTION_ENUMS
727 static struct option long_options[] = {
728 {"help", no_argument, 0, 'h'},
729 {"version", no_argument, 0, 'V'},
730 {"lock-timeout", required_argument, 0, OPT_LOCK_TIMEOUT},
731 {"prune-timeout", required_argument, 0, OPT_PRUNE_TIMEOUT},
732 {"reload-command", required_argument, 0, OPT_RELOAD_COMMAND},
735 LEAK_CHECKER_LONG_OPTIONS,
738 char *short_options = long_options_to_short_options(long_options);
741 reload_command = xasprintf("%s/ovs-appctl -t "
742 "%s/ovs-vswitchd.`cat %s/ovs-vswitchd.pid`.ctl "
743 "-e vswitchd/reload 2>&1 "
744 "| /usr/bin/logger -t brcompatd-reload",
745 ovs_bindir, ovs_rundir, ovs_rundir);
749 c = getopt_long(argc, argv, short_options, long_options, NULL);
760 OVS_PRINT_VERSION(0, 0);
763 case OPT_LOCK_TIMEOUT:
764 lock_timeout = atoi(optarg);
767 case OPT_PRUNE_TIMEOUT:
768 prune_timeout = atoi(optarg) * 1000;
771 case OPT_RELOAD_COMMAND:
772 reload_command = optarg;
776 DAEMON_OPTION_HANDLERS
777 LEAK_CHECKER_OPTION_HANDLERS
792 ovs_fatal(0, "exactly one non-option argument required; "
793 "use --help for usage");
796 config_file = argv[0];
797 error = cfg_set_file(config_file);
799 ovs_fatal(error, "failed to add configuration file \"%s\"",
/* Help text: prints a one-line description and the usage synopsis, the
 * configuration options, the generic options, the leak checker's options
 * (leak_checker_usage()), and finally the current value of
 * 'reload_command' as the default reload command.
 * NOTE(review): the signature line of the enclosing function is missing
 * from this excerpt; these statements presumably form the body of usage(),
 * which is declared NO_RETURN near the top of the file -- confirm. */
807 printf("%s: bridge compatibility front-end for ovs-vswitchd\n"
808 "usage: %s [OPTIONS] CONFIG\n"
809 "CONFIG is the configuration file used by ovs-vswitchd.\n",
810 program_name, program_name);
811 printf("\nConfiguration options:\n"
812 " --reload-command=COMMAND shell command to reload ovs-vswitchd\n"
813 " --prune-timeout=SECS wait at most SECS before pruning ports\n"
814 " --lock-timeout=MSECS wait at most MSECS for CONFIG to unlock\n"
818 printf("\nOther options:\n"
819 " -h, --help display this help message\n"
820 " -V, --version display version information\n");
821 leak_checker_usage();
822 printf("\nThe default reload command is:\n%s\n", reload_command);